shithub: libvpx

--- a/vp9/common/mips/dspr2/vp9_common_dspr2.h

+++ /dev/null

@@ -1,59 +1,0 @@

-/*

- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#ifndef VP9_COMMON_MIPS_DSPR2_VP9_COMMON_DSPR2_H_

-#define VP9_COMMON_MIPS_DSPR2_VP9_COMMON_DSPR2_H_

-#include <assert.h>

-#include "./vpx_config.h"

-#include "vpx/vpx_integer.h"

-#include "vpx_dsp/mips/common_dspr2.h"

-#ifdef __cplusplus

-extern "C" {

-#endif

-#if HAVE_DSPR2

-#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input)                    ({   \

-                                                                               \

-  int32_t tmp, out;                                                            \

-  int     dct_cost_rounding = DCT_CONST_ROUNDING;                              \

-  int     in = input;                                                          \

-                                                                               \

-  __asm__ __volatile__ (                                                       \

-      /* out = dct_const_round_shift(input_dc * cospi_16_64); */               \

-      "mtlo     %[dct_cost_rounding],   $ac1                              \n\t"\

-      "mthi     $zero,                  $ac1                              \n\t"\

-      "madd     $ac1,                   %[in],            %[cospi_16_64]  \n\t"\

-      "extp     %[tmp],                 $ac1,             31              \n\t"\

-                                                                               \

-      /* out = dct_const_round_shift(out * cospi_16_64); */                    \

-      "mtlo     %[dct_cost_rounding],   $ac2                              \n\t"\

-      "mthi     $zero,                  $ac2                              \n\t"\

-      "madd     $ac2,                   %[tmp],           %[cospi_16_64]  \n\t"\

-      "extp     %[out],                 $ac2,             31              \n\t"\

-                                                                               \

-      : [tmp] "=&r" (tmp), [out] "=r" (out)                                    \

-      : [in] "r" (in),                                                         \

-        [dct_cost_rounding] "r" (dct_cost_rounding),                           \

-        [cospi_16_64] "r" (cospi_16_64)                                        \

-   );                                                                          \

-  out;                                                                    })

-void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,

-                                   int dest_stride);

-#endif  // #if HAVE_DSPR2

-#ifdef __cplusplus

-}  // extern "C"

-#endif

-#endif  // VP9_COMMON_MIPS_DSPR2_VP9_COMMON_DSPR2_H_

--- a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c

+++ b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c

@@ -16,1074 +16,11 @@

 #include "vp9/common/vp9_common.h"

 #include "vp9/common/vp9_blockd.h"

 #include "vp9/common/vp9_idct.h"

-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"

+#include "vpx_dsp/mips/inv_txfm_dspr2.h"

 #include "vpx_dsp/txfm_common.h"

 #include "vpx_ports/mem.h"

 #if HAVE_DSPR2

-static void idct16_rows_dspr2(const int16_t *input, int16_t *output,

-                              uint32_t no_rows) {

-  int i;

-  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;

-  int step1_10, step1_11, step1_12, step1_13;

-  int step2_0, step2_1, step2_2, step2_3;

-  int step2_8, step2_9, step2_10, step2_11;

-  int step2_12, step2_13, step2_14, step2_15;

-  int load1, load2, load3, load4, load5, load6, load7, load8;

-  int result1, result2, result3, result4;

-  const int const_2_power_13 = 8192;

-  for (i = no_rows; i--; ) {

-    /* prefetch row */

-    prefetch_load((const uint8_t *)(input + 16));

-    __asm__ __volatile__ (

-        "lh       %[load1],              0(%[input])                    \n\t"

-        "lh       %[load2],             16(%[input])                    \n\t"

-        "lh       %[load3],              8(%[input])                    \n\t"

-        "lh       %[load4],             24(%[input])                    \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

-        "mthi     $zero,                $ac2                            \n\t"

-        "add      %[result1],           %[load1],       %[load2]        \n\t"

-        "sub      %[result2],           %[load1],       %[load2]        \n\t"

-        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"

-        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"

-        "extp     %[step2_0],           $ac1,           31              \n\t"

-        "extp     %[step2_1],           $ac2,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"

-        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"

-        "extp     %[step2_2],           $ac3,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"

-        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"

-        "extp     %[step2_3],           $ac1,           31              \n\t"

-        "add      %[step1_0],           %[step2_0],     %[step2_3]      \n\t"

-        "add      %[step1_1],           %[step2_1],     %[step2_2]      \n\t"

-        "sub      %[step1_2],           %[step2_1],     %[step2_2]      \n\t"

-        "sub      %[step1_3],           %[step2_0],     %[step2_3]      \n\t"

-        : [load1] "=&r" (load1), [load2] "=&r" (load2),

-          [load3] "=&r" (load3), [load4] "=&r" (load4),

-          [result1] "=&r" (result1), [result2] "=&r" (result2),

-          [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),

-          [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),

-          [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),

-          [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)

-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),

-          [cospi_16_64] "r" (cospi_16_64)

-    );

-    __asm__ __volatile__ (

-        "lh       %[load5],             2(%[input])                     \n\t"

-        "lh       %[load6],             30(%[input])                    \n\t"

-        "lh       %[load7],             18(%[input])                    \n\t"

-        "lh       %[load8],             14(%[input])                    \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "madd     $ac1,                 %[load5],       %[cospi_30_64]  \n\t"

-        "msub     $ac1,                 %[load6],       %[cospi_2_64]   \n\t"

-        "extp     %[result1],           $ac1,           31              \n\t"

-        "madd     $ac3,                 %[load7],       %[cospi_14_64]  \n\t"

-        "msub     $ac3,                 %[load8],       %[cospi_18_64]  \n\t"

-        "extp     %[result2],           $ac3,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

-        "mthi     $zero,                $ac2                            \n\t"

-        "madd     $ac1,                 %[load7],       %[cospi_18_64]  \n\t"

-        "madd     $ac1,                 %[load8],       %[cospi_14_64]  \n\t"

-        "extp     %[result3],           $ac1,           31              \n\t"

-        "madd     $ac2,                 %[load5],       %[cospi_2_64]   \n\t"

-        "madd     $ac2,                 %[load6],       %[cospi_30_64]  \n\t"

-        "extp     %[result4],           $ac2,           31              \n\t"

-        "sub      %[load5],             %[result1],     %[result2]      \n\t"

-        "sub      %[load6],             %[result4],     %[result3]      \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "madd     $ac1,                 %[load6],       %[cospi_24_64]  \n\t"

-        "msub     $ac1,                 %[load5],       %[cospi_8_64]   \n\t"

-        "madd     $ac3,                 %[load5],       %[cospi_24_64]  \n\t"

-        "madd     $ac3,                 %[load6],       %[cospi_8_64]   \n\t"

-        "extp     %[step2_9],           $ac1,           31              \n\t"

-        "extp     %[step2_14],          $ac3,           31              \n\t"

-        "add      %[step2_8],           %[result1],     %[result2]      \n\t"

-        "add      %[step2_15],          %[result4],     %[result3]      \n\t"

-        : [load5] "=&r" (load5), [load6] "=&r" (load6),

-          [load7] "=&r" (load7), [load8] "=&r" (load8),

-          [result1] "=&r" (result1), [result2] "=&r" (result2),

-          [result3] "=&r" (result3), [result4] "=&r" (result4),

-          [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),

-          [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)

-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

-          [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),

-          [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),

-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

-    );

-    __asm__ __volatile__ (

-        "lh       %[load1],             10(%[input])                    \n\t"

-        "lh       %[load2],             22(%[input])                    \n\t"

-        "lh       %[load3],             26(%[input])                    \n\t"

-        "lh       %[load4],             6(%[input])                     \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "madd     $ac1,                 %[load1],       %[cospi_22_64]  \n\t"

-        "msub     $ac1,                 %[load2],       %[cospi_10_64]  \n\t"

-        "extp     %[result1],           $ac1,           31              \n\t"

-        "madd     $ac3,                 %[load3],       %[cospi_6_64]   \n\t"

-        "msub     $ac3,                 %[load4],       %[cospi_26_64]  \n\t"

-        "extp     %[result2],           $ac3,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

-        "mthi     $zero,                $ac2                            \n\t"

-        "madd     $ac1,                 %[load1],       %[cospi_10_64]  \n\t"

-        "madd     $ac1,                 %[load2],       %[cospi_22_64]  \n\t"

-        "extp     %[result3],           $ac1,           31              \n\t"

-        "madd     $ac2,                 %[load3],       %[cospi_26_64]  \n\t"

-        "madd     $ac2,                 %[load4],       %[cospi_6_64]   \n\t"

-        "extp     %[result4],           $ac2,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "sub      %[load1],             %[result2],     %[result1]      \n\t"

-        "sub      %[load2],             %[result4],     %[result3]      \n\t"

-        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"

-        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"

-        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"

-        "extp     %[step2_10],          $ac1,           31              \n\t"

-        "extp     %[step2_13],          $ac3,           31              \n\t"

-        "add      %[step2_11],          %[result1],     %[result2]      \n\t"

-        "add      %[step2_12],          %[result4],     %[result3]      \n\t"

-        : [load1] "=&r" (load1), [load2] "=&r" (load2),

-          [load3] "=&r" (load3), [load4] "=&r" (load4),

-          [result1] "=&r" (result1), [result2] "=&r" (result2),

-          [result3] "=&r" (result3), [result4] "=&r" (result4),

-          [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),

-          [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)

-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

-          [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),

-          [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),

-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

-    );

-    __asm__ __volatile__ (

-        "lh       %[load5],             4(%[input])                     \n\t"

-        "lh       %[load6],             28(%[input])                    \n\t"

-        "lh       %[load7],             20(%[input])                    \n\t"

-        "lh       %[load8],             12(%[input])                    \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "madd     $ac1,                 %[load5],       %[cospi_28_64]  \n\t"

-        "msub     $ac1,                 %[load6],       %[cospi_4_64]   \n\t"

-        "extp     %[result1],           $ac1,           31              \n\t"

-        "madd     $ac3,                 %[load7],       %[cospi_12_64]  \n\t"

-        "msub     $ac3,                 %[load8],       %[cospi_20_64]  \n\t"

-        "extp     %[result2],           $ac3,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

-        "mthi     $zero,                $ac2                            \n\t"

-        "madd     $ac1,                 %[load7],       %[cospi_20_64]  \n\t"

-        "madd     $ac1,                 %[load8],       %[cospi_12_64]  \n\t"

-        "extp     %[result3],           $ac1,           31              \n\t"

-        "madd     $ac2,                 %[load5],       %[cospi_4_64]   \n\t"

-        "madd     $ac2,                 %[load6],       %[cospi_28_64]  \n\t"

-        "extp     %[result4],           $ac2,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "sub      %[load5],             %[result4],     %[result3]      \n\t"

-        "sub      %[load5],             %[load5],       %[result1]      \n\t"

-        "add      %[load5],             %[load5],       %[result2]      \n\t"

-        "sub      %[load6],             %[result1],     %[result2]      \n\t"

-        "sub      %[load6],             %[load6],       %[result3]      \n\t"

-        "add      %[load6],             %[load6],       %[result4]      \n\t"

-        "madd     $ac1,                 %[load5],       %[cospi_16_64]  \n\t"

-        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"

-        "extp     %[step1_5],           $ac1,           31              \n\t"

-        "extp     %[step1_6],           $ac3,           31              \n\t"

-        "add      %[step1_4],           %[result1],     %[result2]      \n\t"

-        "add      %[step1_7],           %[result4],     %[result3]      \n\t"

-        : [load5] "=&r" (load5), [load6] "=&r" (load6),

-          [load7] "=&r" (load7), [load8] "=&r" (load8),

-          [result1] "=&r" (result1), [result2] "=&r" (result2),

-          [result3] "=&r" (result3), [result4] "=&r" (result4),

-          [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),

-          [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)

-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

-          [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),

-          [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),

-          [cospi_16_64] "r" (cospi_16_64)

-    );

-    __asm__ __volatile__ (

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "sub      %[load5],             %[step2_14],    %[step2_13]     \n\t"

-        "sub      %[load5],             %[load5],       %[step2_9]      \n\t"

-        "add      %[load5],             %[load5],       %[step2_10]     \n\t"

-        "madd     $ac0,                 %[load5],       %[cospi_16_64]  \n\t"

-        "sub      %[load6],             %[step2_14],    %[step2_13]     \n\t"

-        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"

-        "add      %[load6],             %[load6],       %[step2_9]      \n\t"

-        "madd     $ac1,                 %[load6],       %[cospi_16_64]  \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

-        "mthi     $zero,                $ac2                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "sub      %[load5],             %[step2_15],    %[step2_12]     \n\t"

-        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"

-        "add      %[load5],             %[load5],       %[step2_11]     \n\t"

-        "madd     $ac2,                 %[load5],       %[cospi_16_64]  \n\t"

-        "sub      %[load6],             %[step2_15],    %[step2_12]     \n\t"

-        "sub      %[load6],             %[load6],       %[step2_11]     \n\t"

-        "add      %[load6],             %[load6],       %[step2_8]      \n\t"

-        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"

-        "extp     %[step1_10],          $ac0,           31              \n\t"

-        "extp     %[step1_13],          $ac1,           31              \n\t"

-        "extp     %[step1_11],          $ac2,           31              \n\t"

-        "extp     %[step1_12],          $ac3,           31              \n\t"

-        : [load5] "=&r" (load5), [load6] "=&r" (load6),

-          [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),

-          [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)

-        : [const_2_power_13] "r" (const_2_power_13),

-          [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),

-          [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),

-          [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),

-          [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),

-          [cospi_16_64] "r" (cospi_16_64)

-    );

-    __asm__ __volatile__ (

-        "add      %[load5],             %[step1_0],     %[step1_7]      \n\t"

-        "add      %[load5],             %[load5],       %[step2_12]     \n\t"

-        "add      %[load5],             %[load5],       %[step2_15]     \n\t"

-        "add      %[load6],             %[step1_1],     %[step1_6]      \n\t"

-        "add      %[load6],             %[load6],       %[step2_13]     \n\t"

-        "add      %[load6],             %[load6],       %[step2_14]     \n\t"

-        "sh       %[load5],             0(%[output])                    \n\t"

-        "sh       %[load6],             32(%[output])                   \n\t"

-        "sub      %[load5],             %[step1_1],     %[step1_6]      \n\t"

-        "add      %[load5],             %[load5],       %[step2_9]      \n\t"

-        "add      %[load5],             %[load5],       %[step2_10]     \n\t"

-        "sub      %[load6],             %[step1_0],     %[step1_7]      \n\t"

-        "add      %[load6],             %[load6],       %[step2_8]      \n\t"

-        "add      %[load6],             %[load6],       %[step2_11]     \n\t"

-        "sh       %[load5],             192(%[output])                  \n\t"

-        "sh       %[load6],             224(%[output])                  \n\t"

-        "sub      %[load5],             %[step1_0],     %[step1_7]      \n\t"

-        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"

-        "sub      %[load5],             %[load5],       %[step2_11]     \n\t"

-        "sub      %[load6],             %[step1_1],     %[step1_6]      \n\t"

-        "sub      %[load6],             %[load6],       %[step2_9]      \n\t"

-        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"

-        "sh       %[load5],             256(%[output])                  \n\t"

-        "sh       %[load6],             288(%[output])                  \n\t"

-        "add      %[load5],             %[step1_1],     %[step1_6]      \n\t"

-        "sub      %[load5],             %[load5],       %[step2_13]     \n\t"

-        "sub      %[load5],             %[load5],       %[step2_14]     \n\t"

-        "add      %[load6],             %[step1_0],     %[step1_7]      \n\t"

-        "sub      %[load6],             %[load6],       %[step2_12]     \n\t"

-        "sub      %[load6],             %[load6],       %[step2_15]     \n\t"

-        "sh       %[load5],             448(%[output])                  \n\t"

-        "sh       %[load6],             480(%[output])                  \n\t"

-        : [load5] "=&r" (load5), [load6] "=&r" (load6)

-        : [output] "r" (output),

-          [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),

-          [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),

-          [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),

-          [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),

-          [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),

-          [step2_14] "r" (step2_14), [step2_15] "r" (step2_15)

-    );

-    __asm__ __volatile__ (

-        "add      %[load5],             %[step1_2],     %[step1_5]      \n\t"

-        "add      %[load5],             %[load5],       %[step1_13]     \n\t"

-        "add      %[load6],             %[step1_3],     %[step1_4]      \n\t"

-        "add      %[load6],             %[load6],       %[step1_12]     \n\t"

-        "sh       %[load5],             64(%[output])                   \n\t"

-        "sh       %[load6],             96(%[output])                   \n\t"

-        "sub      %[load5],             %[step1_3],     %[step1_4]      \n\t"

-        "add      %[load5],             %[load5],       %[step1_11]     \n\t"

-        "sub      %[load6],             %[step1_2],     %[step1_5]      \n\t"

-        "add      %[load6],             %[load6],       %[step1_10]     \n\t"

-        "sh       %[load5],             128(%[output])                  \n\t"

-        "sh       %[load6],             160(%[output])                  \n\t"

-        "sub      %[load5],             %[step1_2],     %[step1_5]      \n\t"

-        "sub      %[load5],             %[load5],       %[step1_10]     \n\t"

-        "sub      %[load6],             %[step1_3],     %[step1_4]      \n\t"

-        "sub      %[load6],             %[load6],       %[step1_11]     \n\t"

-        "sh       %[load5],             320(%[output])                  \n\t"

-        "sh       %[load6],             352(%[output])                  \n\t"

-        "add      %[load5],             %[step1_3],     %[step1_4]      \n\t"

-        "sub      %[load5],             %[load5],       %[step1_12]     \n\t"

-        "add      %[load6],             %[step1_2],     %[step1_5]      \n\t"

-        "sub      %[load6],             %[load6],       %[step1_13]     \n\t"

-        "sh       %[load5],             384(%[output])                  \n\t"

-        "sh       %[load6],             416(%[output])                  \n\t"

-        : [load5] "=&r" (load5), [load6] "=&r" (load6)

-        : [output] "r" (output),

-          [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),

-          [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),

-          [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),

-          [step1_12] "r" (step1_12), [step1_13] "r" (step1_13)

-    );

-    input += 16;

-    output += 1;

-  }

-}

-static void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,

-                                      int dest_stride) {

-  int i;

-  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;

-  int step1_8, step1_9, step1_10, step1_11;

-  int step1_12, step1_13, step1_14, step1_15;

-  int step2_0, step2_1, step2_2, step2_3;

-  int step2_8, step2_9, step2_10, step2_11;

-  int step2_12, step2_13, step2_14, step2_15;

-  int load1, load2, load3, load4, load5, load6, load7, load8;

-  int result1, result2, result3, result4;

-  const int const_2_power_13 = 8192;

-  uint8_t *dest_pix;

-  uint8_t *cm = vpx_ff_cropTbl;

-  /* prefetch vpx_ff_cropTbl */

-  prefetch_load(vpx_ff_cropTbl);

-  prefetch_load(vpx_ff_cropTbl +  32);

-  prefetch_load(vpx_ff_cropTbl +  64);

-  prefetch_load(vpx_ff_cropTbl +  96);

-  prefetch_load(vpx_ff_cropTbl + 128);

-  prefetch_load(vpx_ff_cropTbl + 160);

-  prefetch_load(vpx_ff_cropTbl + 192);

-  prefetch_load(vpx_ff_cropTbl + 224);

-  for (i = 0; i < 16; ++i) {

-    dest_pix = (dest + i);

-    __asm__ __volatile__ (

-        "lh       %[load1],              0(%[input])                    \n\t"

-        "lh       %[load2],             16(%[input])                    \n\t"

-        "lh       %[load3],              8(%[input])                    \n\t"

-        "lh       %[load4],             24(%[input])                    \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

-        "mthi     $zero,                $ac2                            \n\t"

-        "add      %[result1],           %[load1],       %[load2]        \n\t"

-        "sub      %[result2],           %[load1],       %[load2]        \n\t"

-        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"

-        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"

-        "extp     %[step2_0],           $ac1,           31              \n\t"

-        "extp     %[step2_1],           $ac2,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"

-        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"

-        "extp     %[step2_2],           $ac3,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"

-        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"

-        "extp     %[step2_3],           $ac1,           31              \n\t"

-        "add      %[step1_0],           %[step2_0],     %[step2_3]      \n\t"

-        "add      %[step1_1],           %[step2_1],     %[step2_2]      \n\t"

-        "sub      %[step1_2],           %[step2_1],     %[step2_2]      \n\t"

-        "sub      %[step1_3],           %[step2_0],     %[step2_3]      \n\t"

-        : [load1] "=&r" (load1), [load2] "=&r" (load2),

-          [load3] "=&r" (load3), [load4] "=&r" (load4),

-          [result1] "=&r" (result1), [result2] "=&r" (result2),

-          [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),

-          [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),

-          [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),

-          [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)

-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),

-          [cospi_16_64] "r" (cospi_16_64)

-    );

-    __asm__ __volatile__ (

-        "lh       %[load5],             2(%[input])                     \n\t"

-        "lh       %[load6],             30(%[input])                    \n\t"

-        "lh       %[load7],             18(%[input])                    \n\t"

-        "lh       %[load8],             14(%[input])                    \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "madd     $ac1,                 %[load5],       %[cospi_30_64]  \n\t"

-        "msub     $ac1,                 %[load6],       %[cospi_2_64]   \n\t"

-        "extp     %[result1],           $ac1,           31              \n\t"

-        "madd     $ac3,                 %[load7],       %[cospi_14_64]  \n\t"

-        "msub     $ac3,                 %[load8],       %[cospi_18_64]  \n\t"

-        "extp     %[result2],           $ac3,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

-        "mthi     $zero,                $ac2                            \n\t"

-        "madd     $ac1,                 %[load7],       %[cospi_18_64]  \n\t"

-        "madd     $ac1,                 %[load8],       %[cospi_14_64]  \n\t"

-        "extp     %[result3],           $ac1,           31              \n\t"

-        "madd     $ac2,                 %[load5],        %[cospi_2_64]  \n\t"

-        "madd     $ac2,                 %[load6],        %[cospi_30_64] \n\t"

-        "extp     %[result4],           $ac2,            31             \n\t"

-        "sub      %[load5],             %[result1],     %[result2]      \n\t"

-        "sub      %[load6],             %[result4],     %[result3]      \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "madd     $ac1,                 %[load6],       %[cospi_24_64]  \n\t"

-        "msub     $ac1,                 %[load5],       %[cospi_8_64]   \n\t"

-        "madd     $ac3,                 %[load5],       %[cospi_24_64]  \n\t"

-        "madd     $ac3,                 %[load6],       %[cospi_8_64]   \n\t"

-        "extp     %[step2_9],           $ac1,           31              \n\t"

-        "extp     %[step2_14],          $ac3,           31              \n\t"

-        "add      %[step2_8],           %[result1],     %[result2]      \n\t"

-        "add      %[step2_15],          %[result4],     %[result3]      \n\t"

-        : [load5] "=&r" (load5), [load6] "=&r" (load6),

-          [load7] "=&r" (load7), [load8] "=&r" (load8),

-          [result1] "=&r" (result1), [result2] "=&r" (result2),

-          [result3] "=&r" (result3), [result4] "=&r" (result4),

-          [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),

-          [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)

-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

-          [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),

-          [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),

-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

-    );

-    __asm__ __volatile__ (

-        "lh       %[load1],             10(%[input])                    \n\t"

-        "lh       %[load2],             22(%[input])                    \n\t"

-        "lh       %[load3],             26(%[input])                    \n\t"

-        "lh       %[load4],             6(%[input])                     \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "madd     $ac1,                 %[load1],    %[cospi_22_64]     \n\t"

-        "msub     $ac1,                 %[load2],    %[cospi_10_64]     \n\t"

-        "extp     %[result1],           $ac1,        31                 \n\t"

-        "madd     $ac3,                 %[load3],    %[cospi_6_64]      \n\t"

-        "msub     $ac3,                 %[load4],    %[cospi_26_64]     \n\t"

-        "extp     %[result2],           $ac3,        31                 \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

-        "mthi     $zero,                $ac2                            \n\t"

-        "madd     $ac1,                 %[load1],    %[cospi_10_64]     \n\t"

-        "madd     $ac1,                 %[load2],    %[cospi_22_64]     \n\t"

-        "extp     %[result3],           $ac1,        31                 \n\t"

-        "madd     $ac2,                 %[load3],    %[cospi_26_64]     \n\t"

-        "madd     $ac2,                 %[load4],    %[cospi_6_64]      \n\t"

-        "extp     %[result4],           $ac2,        31                 \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "sub      %[load1],             %[result2],     %[result1]      \n\t"

-        "sub      %[load2],             %[result4],     %[result3]      \n\t"

-        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"

-        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"

-        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"

-        "extp     %[step2_10],          $ac1,           31              \n\t"

-        "extp     %[step2_13],          $ac3,           31              \n\t"

-        "add      %[step2_11],          %[result1],     %[result2]      \n\t"

-        "add      %[step2_12],          %[result4],     %[result3]      \n\t"

-        : [load1] "=&r" (load1), [load2] "=&r" (load2),

-          [load3] "=&r" (load3), [load4] "=&r" (load4),

-          [result1] "=&r" (result1), [result2] "=&r" (result2),

-          [result3] "=&r" (result3), [result4] "=&r" (result4),

-          [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),

-          [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)

-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

-          [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),

-          [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),

-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

-    );

-    __asm__ __volatile__ (

-        "lh       %[load5],             4(%[input])                   \n\t"

-        "lh       %[load6],             28(%[input])                  \n\t"

-        "lh       %[load7],             20(%[input])                  \n\t"

-        "lh       %[load8],             12(%[input])                  \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                          \n\t"

-        "mthi     $zero,                $ac1                          \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                          \n\t"

-        "mthi     $zero,                $ac3                          \n\t"

-        "madd     $ac1,                 %[load5],    %[cospi_28_64]   \n\t"

-        "msub     $ac1,                 %[load6],    %[cospi_4_64]    \n\t"

-        "extp     %[result1],           $ac1,        31               \n\t"

-        "madd     $ac3,                 %[load7],    %[cospi_12_64]   \n\t"

-        "msub     $ac3,                 %[load8],    %[cospi_20_64]   \n\t"

-        "extp     %[result2],           $ac3,        31               \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                          \n\t"

-        "mthi     $zero,                $ac1                          \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                          \n\t"

-        "mthi     $zero,                $ac2                          \n\t"

-        "madd     $ac1,                 %[load7],    %[cospi_20_64]   \n\t"

-        "madd     $ac1,                 %[load8],    %[cospi_12_64]   \n\t"

-        "extp     %[result3],           $ac1,        31               \n\t"

-        "madd     $ac2,                 %[load5],    %[cospi_4_64]    \n\t"

-        "madd     $ac2,                 %[load6],    %[cospi_28_64]   \n\t"

-        "extp     %[result4],           $ac2,        31               \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "sub      %[load5],             %[result4],     %[result3]      \n\t"

-        "sub      %[load5],             %[load5],       %[result1]      \n\t"

-        "add      %[load5],             %[load5],       %[result2]      \n\t"

-        "sub      %[load6],             %[result1],     %[result2]      \n\t"

-        "sub      %[load6],             %[load6],       %[result3]      \n\t"

-        "add      %[load6],             %[load6],       %[result4]      \n\t"

-        "madd     $ac1,                 %[load5],       %[cospi_16_64]  \n\t"

-        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"

-        "extp     %[step1_5],           $ac1,           31              \n\t"

-        "extp     %[step1_6],           $ac3,           31              \n\t"

-        "add      %[step1_4],           %[result1],     %[result2]      \n\t"

-        "add      %[step1_7],           %[result4],     %[result3]      \n\t"

-        : [load5] "=&r" (load5), [load6] "=&r" (load6),

-          [load7] "=&r" (load7), [load8] "=&r" (load8),

-          [result1] "=&r" (result1), [result2] "=&r" (result2),

-          [result3] "=&r" (result3), [result4] "=&r" (result4),

-          [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),

-          [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)

-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

-          [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),

-          [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),

-          [cospi_16_64] "r" (cospi_16_64)

-    );

-    __asm__ __volatile__ (

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "sub      %[load5],             %[step2_14],    %[step2_13]     \n\t"

-        "sub      %[load5],             %[load5],       %[step2_9]      \n\t"

-        "add      %[load5],             %[load5],       %[step2_10]     \n\t"

-        "madd     $ac0,                 %[load5],       %[cospi_16_64]  \n\t"

-        "sub      %[load6],             %[step2_14],    %[step2_13]     \n\t"

-        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"

-        "add      %[load6],             %[load6],       %[step2_9]      \n\t"

-        "madd     $ac1,                 %[load6],       %[cospi_16_64]  \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

-        "mthi     $zero,                $ac2                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "sub      %[load5],             %[step2_15],    %[step2_12]     \n\t"

-        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"

-        "add      %[load5],             %[load5],       %[step2_11]     \n\t"

-        "madd     $ac2,                 %[load5],       %[cospi_16_64]  \n\t"

-        "sub      %[load6],             %[step2_15],    %[step2_12]     \n\t"

-        "sub      %[load6],             %[load6],       %[step2_11]     \n\t"

-        "add      %[load6],             %[load6],       %[step2_8]      \n\t"

-        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"

-        "extp     %[step1_10],          $ac0,           31              \n\t"

-        "extp     %[step1_13],          $ac1,           31              \n\t"

-        "extp     %[step1_11],          $ac2,           31              \n\t"

-        "extp     %[step1_12],          $ac3,           31              \n\t"

-        : [load5] "=&r" (load5), [load6] "=&r" (load6),

-          [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),

-          [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)

-        : [const_2_power_13] "r" (const_2_power_13),

-          [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),

-          [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),

-          [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),

-          [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),

-          [cospi_16_64] "r" (cospi_16_64)

-    );

-    step1_8 = step2_8 + step2_11;

-    step1_9 = step2_9 + step2_10;

-    step1_14 = step2_13 + step2_14;

-    step1_15 = step2_12 + step2_15;

-    __asm__ __volatile__ (

-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"

-        "add      %[load5],         %[step1_0],         %[step1_7]      \n\t"

-        "add      %[load5],         %[load5],           %[step1_15]     \n\t"

-        "addi     %[load5],         %[load5],           32              \n\t"

-        "sra      %[load5],         %[load5],           6               \n\t"

-        "add      %[load7],         %[load7],           %[load5]        \n\t"

-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"

-        "add      %[load6],         %[step1_1],         %[step1_6]      \n\t"

-        "add      %[load6],         %[load6],           %[step1_14]     \n\t"

-        "sb       %[load5],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"

-        "addi     %[load6],         %[load6],           32              \n\t"

-        "sra      %[load6],         %[load6],           6               \n\t"

-        "add      %[load8],         %[load8],           %[load6]        \n\t"

-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"

-        "sb       %[load6],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"

-        "add      %[load5],         %[step1_2],         %[step1_5]      \n\t"

-        "add      %[load5],         %[load5],           %[step1_13]     \n\t"

-        "addi     %[load5],         %[load5],           32              \n\t"

-        "sra      %[load5],         %[load5],           6               \n\t"

-        "add      %[load7],         %[load7],           %[load5]        \n\t"

-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"

-        "add      %[load6],         %[step1_3],         %[step1_4]      \n\t"

-        "add      %[load6],         %[load6],           %[step1_12]     \n\t"

-        "sb       %[load5],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"

-        "addi     %[load6],         %[load6],           32              \n\t"

-        "sra      %[load6],         %[load6],           6               \n\t"

-        "add      %[load8],         %[load8],           %[load6]        \n\t"

-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"

-        "sb       %[load6],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"

-        "sub      %[load5],         %[step1_3],         %[step1_4]      \n\t"

-        "add      %[load5],         %[load5],           %[step1_11]     \n\t"

-        "addi     %[load5],         %[load5],           32              \n\t"

-        "sra      %[load5],         %[load5],           6               \n\t"

-        "add      %[load7],         %[load7],           %[load5]        \n\t"

-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"

-        "sub      %[load6],         %[step1_2],         %[step1_5]      \n\t"

-        "add      %[load6],         %[load6],           %[step1_10]     \n\t"

-        "sb       %[load5],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"

-        "addi     %[load6],         %[load6],           32              \n\t"

-        "sra      %[load6],         %[load6],           6               \n\t"

-        "add      %[load8],         %[load8],           %[load6]        \n\t"

-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"

-        "sb       %[load6],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        "sub      %[load5],         %[step1_1],         %[step1_6]      \n\t"

-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"

-        "add      %[load5],         %[load5],           %[step1_9]      \n\t"

-        "addi     %[load5],         %[load5],           32              \n\t"

-        "sra      %[load5],         %[load5],           6               \n\t"

-        "add      %[load7],         %[load7],           %[load5]        \n\t"

-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"

-        "sub      %[load6],         %[step1_0],         %[step1_7]      \n\t"

-        "add      %[load6],         %[load6],           %[step1_8]      \n\t"

-        "sb       %[load5],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"

-        "addi     %[load6],         %[load6],           32              \n\t"

-        "sra      %[load6],         %[load6],           6               \n\t"

-        "add      %[load8],         %[load8],           %[load6]        \n\t"

-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"

-        "sb       %[load6],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"

-        "sub      %[load5],         %[step1_0],         %[step1_7]      \n\t"

-        "sub      %[load5],         %[load5],           %[step1_8]      \n\t"

-        "addi     %[load5],         %[load5],           32              \n\t"

-        "sra      %[load5],         %[load5],           6               \n\t"

-        "add      %[load7],         %[load7],           %[load5]        \n\t"

-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"

-        "sub      %[load6],         %[step1_1],         %[step1_6]      \n\t"

-        "sub      %[load6],         %[load6],           %[step1_9]      \n\t"

-        "sb       %[load5],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"

-        "addi     %[load6],         %[load6],           32              \n\t"

-        "sra      %[load6],         %[load6],           6               \n\t"

-        "add      %[load8],         %[load8],           %[load6]        \n\t"

-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"

-        "sb       %[load6],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"

-        "sub      %[load5],         %[step1_2],         %[step1_5]      \n\t"

-        "sub      %[load5],         %[load5],           %[step1_10]     \n\t"

-        "addi     %[load5],         %[load5],           32              \n\t"

-        "sra      %[load5],         %[load5],           6               \n\t"

-        "add      %[load7],         %[load7],           %[load5]        \n\t"

-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"

-        "sub      %[load6],         %[step1_3],         %[step1_4]      \n\t"

-        "sub      %[load6],         %[load6],           %[step1_11]     \n\t"

-        "sb       %[load5],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"

-        "addi     %[load6],         %[load6],           32              \n\t"

-        "sra      %[load6],         %[load6],           6               \n\t"

-        "add      %[load8],         %[load8],           %[load6]        \n\t"

-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"

-        "sb       %[load6],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"

-        "add      %[load5],         %[step1_3],         %[step1_4]      \n\t"

-        "sub      %[load5],         %[load5],           %[step1_12]     \n\t"

-        "addi     %[load5],         %[load5],           32              \n\t"

-        "sra      %[load5],         %[load5],           6               \n\t"

-        "add      %[load7],         %[load7],           %[load5]        \n\t"

-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"

-        "add      %[load6],         %[step1_2],         %[step1_5]      \n\t"

-        "sub      %[load6],         %[load6],           %[step1_13]     \n\t"

-        "sb       %[load5],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"

-        "addi     %[load6],         %[load6],           32              \n\t"

-        "sra      %[load6],         %[load6],           6               \n\t"

-        "add      %[load8],         %[load8],           %[load6]        \n\t"

-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"

-        "sb       %[load6],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"

-        "add      %[load5],         %[step1_1],         %[step1_6]      \n\t"

-        "sub      %[load5],         %[load5],           %[step1_14]     \n\t"

-        "addi     %[load5],         %[load5],           32              \n\t"

-        "sra      %[load5],         %[load5],           6               \n\t"

-        "add      %[load7],         %[load7],           %[load5]        \n\t"

-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"

-        "add      %[load6],         %[step1_0],         %[step1_7]      \n\t"

-        "sub      %[load6],         %[load6],           %[step1_15]     \n\t"

-        "sb       %[load5],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"

-        "addi     %[load6],         %[load6],           32              \n\t"

-        "sra      %[load6],         %[load6],           6               \n\t"

-        "add      %[load8],         %[load8],           %[load6]        \n\t"

-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"

-        "sb       %[load6],         0(%[dest_pix])                      \n\t"

-        : [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" (load7),

-          [load8] "=&r" (load8), [dest_pix] "+r" (dest_pix)

-        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),

-          [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),

-          [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),

-          [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),

-          [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),

-          [step1_8] "r" (step1_8), [step1_9] "r" (step1_9),

-          [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),

-          [step1_12] "r" (step1_12), [step1_13] "r" (step1_13),

-          [step1_14] "r" (step1_14), [step1_15] "r" (step1_15)

-    );

-    input += 16;

-  }

-}

-void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,

-                                 int dest_stride) {

-  DECLARE_ALIGNED(32, int16_t,  out[16 * 16]);

-  uint32_t pos = 45;

-  /* bit positon for extract from acc */

-  __asm__ __volatile__ (

-    "wrdsp    %[pos],    1    \n\t"

-    :

-    : [pos] "r" (pos)

-  );

-  // First transform rows

-  idct16_rows_dspr2(input, out, 16);

-  // Then transform columns and add to dest

-  idct16_cols_add_blk_dspr2(out, dest, dest_stride);

-}

-static void iadst16_dspr2(const int16_t *input, int16_t *output) {

-  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;

-  int x0 = input[15];

-  int x1 = input[0];

-  int x2 = input[13];

-  int x3 = input[2];

-  int x4 = input[11];

-  int x5 = input[4];

-  int x6 = input[9];

-  int x7 = input[6];

-  int x8 = input[7];

-  int x9 = input[8];

-  int x10 = input[5];

-  int x11 = input[10];

-  int x12 = input[3];

-  int x13 = input[12];

-  int x14 = input[1];

-  int x15 = input[14];

-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8

-           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {

-    output[0] = output[1] = output[2] = output[3] = output[4]

-              = output[5] = output[6] = output[7] = output[8]

-              = output[9] = output[10] = output[11] = output[12]

-              = output[13] = output[14] = output[15] = 0;

-    return;

-  }

-  // stage 1

-  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;

-  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;

-  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;

-  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;

-  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;

-  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;

-  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;

-  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;

-  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;

-  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;

-  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;

-  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;

-  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;

-  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;

-  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;

-  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;

-  x0 = dct_const_round_shift(s0 + s8);

-  x1 = dct_const_round_shift(s1 + s9);

-  x2 = dct_const_round_shift(s2 + s10);

-  x3 = dct_const_round_shift(s3 + s11);

-  x4 = dct_const_round_shift(s4 + s12);

-  x5 = dct_const_round_shift(s5 + s13);

-  x6 = dct_const_round_shift(s6 + s14);

-  x7 = dct_const_round_shift(s7 + s15);

-  x8  = dct_const_round_shift(s0 - s8);

-  x9  = dct_const_round_shift(s1 - s9);

-  x10 = dct_const_round_shift(s2 - s10);

-  x11 = dct_const_round_shift(s3 - s11);

-  x12 = dct_const_round_shift(s4 - s12);

-  x13 = dct_const_round_shift(s5 - s13);

-  x14 = dct_const_round_shift(s6 - s14);

-  x15 = dct_const_round_shift(s7 - s15);

-  // stage 2

-  s0 = x0;

-  s1 = x1;

-  s2 = x2;

-  s3 = x3;

-  s4 = x4;

-  s5 = x5;

-  s6 = x6;

-  s7 = x7;

-  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;

-  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;

-  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;

-  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;

-  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;

-  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;

-  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;

-  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;

-  x0 = s0 + s4;

-  x1 = s1 + s5;

-  x2 = s2 + s6;

-  x3 = s3 + s7;

-  x4 = s0 - s4;

-  x5 = s1 - s5;

-  x6 = s2 - s6;

-  x7 = s3 - s7;

-  x8 = dct_const_round_shift(s8 + s12);

-  x9 = dct_const_round_shift(s9 + s13);

-  x10 = dct_const_round_shift(s10 + s14);

-  x11 = dct_const_round_shift(s11 + s15);

-  x12 = dct_const_round_shift(s8 - s12);

-  x13 = dct_const_round_shift(s9 - s13);

-  x14 = dct_const_round_shift(s10 - s14);

-  x15 = dct_const_round_shift(s11 - s15);

-  // stage 3

-  s0 = x0;

-  s1 = x1;

-  s2 = x2;

-  s3 = x3;

-  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;

-  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;

-  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;

-  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;

-  s8 = x8;

-  s9 = x9;

-  s10 = x10;

-  s11 = x11;

-  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;

-  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;

-  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;

-  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;

-  x0 = s0 + s2;

-  x1 = s1 + s3;

-  x2 = s0 - s2;

-  x3 = s1 - s3;

-  x4 = dct_const_round_shift(s4 + s6);

-  x5 = dct_const_round_shift(s5 + s7);

-  x6 = dct_const_round_shift(s4 - s6);

-  x7 = dct_const_round_shift(s5 - s7);

-  x8 = s8 + s10;

-  x9 = s9 + s11;

-  x10 = s8 - s10;

-  x11 = s9 - s11;

-  x12 = dct_const_round_shift(s12 + s14);

-  x13 = dct_const_round_shift(s13 + s15);

-  x14 = dct_const_round_shift(s12 - s14);

-  x15 = dct_const_round_shift(s13 - s15);

-  // stage 4

-  s2 = (- cospi_16_64) * (x2 + x3);

-  s3 = cospi_16_64 * (x2 - x3);

-  s6 = cospi_16_64 * (x6 + x7);

-  s7 = cospi_16_64 * (- x6 + x7);

-  s10 = cospi_16_64 * (x10 + x11);

-  s11 = cospi_16_64 * (- x10 + x11);

-  s14 = (- cospi_16_64) * (x14 + x15);

-  s15 = cospi_16_64 * (x14 - x15);

-  x2 = dct_const_round_shift(s2);

-  x3 = dct_const_round_shift(s3);

-  x6 = dct_const_round_shift(s6);

-  x7 = dct_const_round_shift(s7);

-  x10 = dct_const_round_shift(s10);

-  x11 = dct_const_round_shift(s11);

-  x14 = dct_const_round_shift(s14);

-  x15 = dct_const_round_shift(s15);

-  output[0] =  x0;

-  output[1] = -x8;

-  output[2] =  x12;

-  output[3] = -x4;

-  output[4] =  x6;

-  output[5] =  x14;

-  output[6] =  x10;

-  output[7] =  x2;

-  output[8] =  x3;

-  output[9] =  x11;

-  output[10] =  x15;

-  output[11] =  x7;

-  output[12] =  x5;

-  output[13] = -x13;

-  output[14] =  x9;

-  output[15] = -x1;

-}

 void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,

                                 int pitch, int tx_type) {

   int i, j;

@@ -1166,152 +103,6 @@

     default:

       printf("vp9_short_iht16x16_add_dspr2 : Invalid tx_type\n");

       break;

-  }

-}

-void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,

-                                int dest_stride) {  // 4-row pass, zero fill, column pass

-  DECLARE_ALIGNED(32, int16_t,  out[16 * 16]);  // buffer shared by both passes

-  int16_t *outptr = out;  // write cursor into out

-  uint32_t i;

-  uint32_t pos = 45;  // value written to the DSP control register below

-  /* bit position for extract from acc: wrdsp mask 1 selects the pos field */

-  __asm__ __volatile__ (

-    "wrdsp    %[pos],    1    \n\t"

-    :

-    : [pos] "r" (pos)

-  );

-  // First transform rows. All non-zero dct coefficients are in the

-  // upper-left 4x4 area, so only the first 4 rows need to be calculated.

-  idct16_rows_dspr2(input, outptr, 4);

-  outptr += 4;  // zero fill starts at entry 4 of each 16-entry line of out

-  for (i = 0; i < 6; ++i) {  // 6 passes x 2 int16_t cover entries 4..15

-    __asm__ __volatile__ (

-        "sw     $zero,    0(%[outptr])     \n\t"  /* one sw clears 2 x int16_t */

-        "sw     $zero,   32(%[outptr])     \n\t"  /* +32 bytes = +16 int16_t */

-        "sw     $zero,   64(%[outptr])     \n\t"

-        "sw     $zero,   96(%[outptr])     \n\t"

-        "sw     $zero,  128(%[outptr])     \n\t"

-        "sw     $zero,  160(%[outptr])     \n\t"

-        "sw     $zero,  192(%[outptr])     \n\t"

-        "sw     $zero,  224(%[outptr])     \n\t"

-        "sw     $zero,  256(%[outptr])     \n\t"

-        "sw     $zero,  288(%[outptr])     \n\t"

-        "sw     $zero,  320(%[outptr])     \n\t"

-        "sw     $zero,  352(%[outptr])     \n\t"

-        "sw     $zero,  384(%[outptr])     \n\t"

-        "sw     $zero,  416(%[outptr])     \n\t"

-        "sw     $zero,  448(%[outptr])     \n\t"

-        "sw     $zero,  480(%[outptr])     \n\t"  /* 16th store: last line of out */

-        :

-        : [outptr] "r" (outptr)  // NOTE(review): writes via outptr, no "memory" clobber - confirm safe

-    );

-    outptr += 2;  // advance to the next pair of int16_t entries

-  }

-  // Then transform columns; out, dest and dest_stride go to the column pass.

-  idct16_cols_add_blk_dspr2(out, dest, dest_stride);

-}

-void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,

-                               int dest_stride) {

-  uint32_t pos = 45;

-  int32_t out;

-  int32_t r;

-  int32_t a1, absa1;

-  int32_t vector_a1;

-  int32_t t1, t2, t3, t4;

-  int32_t vector_1, vector_2, vector_3, vector_4;

-  /* bit position for extract from acc */

-  __asm__ __volatile__ (

-    "wrdsp      %[pos],     1           \n\t"

-    :

-    : [pos] "r" (pos)

-  );

-  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);

-  __asm__ __volatile__ (

-      "addi     %[out],     %[out],     32      \n\t"

-      "sra      %[a1],      %[out],     6       \n\t"

-      : [out] "+r" (out), [a1] "=r" (a1)

-      :

-  );

-  if (a1 < 0) {

-    /* use quad-byte

-     * input and output memory are four byte aligned */

-    __asm__ __volatile__ (

-        "abs        %[absa1],       %[a1]       \n\t"

-        "replv.qb   %[vector_a1],   %[absa1]    \n\t"

-        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)

-        : [a1] "r" (a1)

-    );

-    for (r = 16; r--;) {

-      __asm__ __volatile__ (

-          "lw             %[t1],          0(%[dest])                      \n\t"

-          "lw             %[t2],          4(%[dest])                      \n\t"

-          "lw             %[t3],          8(%[dest])                      \n\t"

-          "lw             %[t4],          12(%[dest])                     \n\t"

-          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"

-          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"

-          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"

-          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"

-          "sw             %[vector_1],    0(%[dest])                      \n\t"

-          "sw             %[vector_2],    4(%[dest])                      \n\t"

-          "sw             %[vector_3],    8(%[dest])                      \n\t"

-          "sw             %[vector_4],    12(%[dest])                     \n\t"

-          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"

-          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),

-            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),

-            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),

-            [dest] "+&r" (dest)

-          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)

-      );

-    }

-  } else {

-    /* use quad-byte

-     * input and output memory are four byte aligned */

-    __asm__ __volatile__ (

-        "replv.qb   %[vector_a1],   %[a1]   \n\t"

-        : [vector_a1] "=r" (vector_a1)

-        : [a1] "r" (a1)

-    );

-    for (r = 16; r--;) {

-      __asm__ __volatile__ (

-          "lw             %[t1],          0(%[dest])                      \n\t"

-          "lw             %[t2],          4(%[dest])                      \n\t"

-          "lw             %[t3],          8(%[dest])                      \n\t"

-          "lw             %[t4],          12(%[dest])                     \n\t"

-          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"

-          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"

-          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"

-          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"

-          "sw             %[vector_1],    0(%[dest])                      \n\t"

-          "sw             %[vector_2],    4(%[dest])                      \n\t"

-          "sw             %[vector_3],    8(%[dest])                      \n\t"

-          "sw             %[vector_4],    12(%[dest])                     \n\t"

-          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"

-          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),

-            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),

-            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),

-            [dest] "+&r" (dest)

-          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)

-      );

-    }

 #endif  // #if HAVE_DSPR2

--- a/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c

+++ /dev/null

@@ -1,1074 +1,0 @@

-/*

- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#include <assert.h>

-#include "./vpx_config.h"

-#include "./vp9_rtcd.h"

-#include "vp9/common/vp9_common.h"

-#include "vp9/common/vp9_blockd.h"

-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"

-#include "vpx_dsp/txfm_common.h"

-#include "vpx_ports/mem.h"

-#if HAVE_DSPR2

-void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,

-                                   int dest_stride) {

-  int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;

-  int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;

-  int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19;

-  int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26;

-  int16_t step1_27, step1_28, step1_29, step1_30, step1_31;

-  int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;

-  int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;

-  int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;

-  int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;

-  int16_t step2_28, step2_29, step2_30, step2_31;

-  int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;

-  int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;

-  int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27;

-  int16_t step3_28, step3_29, step3_30, step3_31;

-  int temp0, temp1, temp2, temp3;

-  int load1, load2, load3, load4;

-  int result1, result2;

-  int i, temp21;

-  uint8_t *dest_pix, *dest_pix1;

-  const int const_2_power_13 = 8192;

-  uint8_t *cm = vpx_ff_cropTbl;

-  /* prefetch vpx_ff_cropTbl */

-  prefetch_load(vpx_ff_cropTbl);

-  prefetch_load(vpx_ff_cropTbl +  32);

-  prefetch_load(vpx_ff_cropTbl +  64);

-  prefetch_load(vpx_ff_cropTbl +  96);

-  prefetch_load(vpx_ff_cropTbl + 128);

-  prefetch_load(vpx_ff_cropTbl + 160);

-  prefetch_load(vpx_ff_cropTbl + 192);

-  prefetch_load(vpx_ff_cropTbl + 224);

-  for (i = 0; i < 32; ++i) {

-    dest_pix = dest + i;

-    dest_pix1 = dest + i + 31 * dest_stride;

-    __asm__ __volatile__ (

-        "lh       %[load1],             2(%[input])                     \n\t"

-        "lh       %[load2],             62(%[input])                    \n\t"

-        "lh       %[load3],             34(%[input])                    \n\t"

-        "lh       %[load4],             30(%[input])                    \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "madd     $ac1,                 %[load1],       %[cospi_31_64]  \n\t"

-        "msub     $ac1,                 %[load2],       %[cospi_1_64]   \n\t"

-        "extp     %[temp0],             $ac1,           31              \n\t"

-        "madd     $ac3,                 %[load1],       %[cospi_1_64]   \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_31_64]  \n\t"

-        "extp     %[temp3],             $ac3,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

-        "mthi     $zero,                $ac2                            \n\t"

-        "madd     $ac2,                 %[load3],       %[cospi_15_64]  \n\t"

-        "msub     $ac2,                 %[load4],       %[cospi_17_64]  \n\t"

-        "extp     %[temp1],             $ac2,           31              \n\t"

-        "madd     $ac1,                 %[load3],       %[cospi_17_64]  \n\t"

-        "madd     $ac1,                 %[load4],       %[cospi_15_64]  \n\t"

-        "extp     %[temp2],             $ac1,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "sub      %[load1],             %[temp3],       %[temp2]        \n\t"

-        "sub      %[load2],             %[temp0],       %[temp1]        \n\t"

-        "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"

-        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"

-        "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"

-        "extp     %[step1_17],          $ac1,           31              \n\t"

-        "extp     %[step1_30],          $ac3,           31              \n\t"

-        "add      %[step1_16],          %[temp0],       %[temp1]        \n\t"

-        "add      %[step1_31],          %[temp2],       %[temp3]        \n\t"

-        : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),

-          [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

-          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

-          [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17),

-          [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31)

-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

-          [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64),

-          [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64),

-          [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64)

-    );

-    __asm__ __volatile__ (

-        "lh       %[load1],             18(%[input])                    \n\t"

-        "lh       %[load2],             46(%[input])                    \n\t"

-        "lh       %[load3],             50(%[input])                    \n\t"

-        "lh       %[load4],             14(%[input])                    \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "madd     $ac1,                 %[load1],       %[cospi_23_64]  \n\t"

-        "msub     $ac1,                 %[load2],       %[cospi_9_64]   \n\t"

-        "extp     %[temp0],             $ac1,           31              \n\t"

-        "madd     $ac3,                 %[load1],       %[cospi_9_64]   \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_23_64]  \n\t"

-        "extp     %[temp3],             $ac3,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

-        "mthi     $zero,                $ac2                            \n\t"

-        "madd     $ac2,                 %[load3],       %[cospi_7_64]   \n\t"

-        "msub     $ac2,                 %[load4],       %[cospi_25_64]  \n\t"

-        "extp     %[temp1],             $ac2,           31              \n\t"

-        "madd     $ac1,                 %[load3],       %[cospi_25_64]  \n\t"

-        "madd     $ac1,                 %[load4],       %[cospi_7_64]   \n\t"

-        "extp     %[temp2],             $ac1,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"

-        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"

-        "msub     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"

-        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"

-        "msub     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"

-        "extp     %[step1_18],          $ac1,           31              \n\t"

-        "extp     %[step1_29],          $ac3,           31              \n\t"

-        "add      %[step1_19],          %[temp0],       %[temp1]        \n\t"

-        "add      %[step1_28],          %[temp2],       %[temp3]        \n\t"

-        : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),

-          [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

-          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

-          [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19),

-          [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29)

-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

-          [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64),

-          [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64),

-          [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64)

-    );

-    __asm__ __volatile__ (

-        "lh       %[load1],             10(%[input])                    \n\t"

-        "lh       %[load2],             54(%[input])                    \n\t"

-        "lh       %[load3],             42(%[input])                    \n\t"

-        "lh       %[load4],             22(%[input])                    \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "madd     $ac1,                 %[load1],       %[cospi_27_64]  \n\t"

-        "msub     $ac1,                 %[load2],       %[cospi_5_64]   \n\t"

-        "extp     %[temp0],             $ac1,           31              \n\t"

-        "madd     $ac3,                 %[load1],       %[cospi_5_64]   \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_27_64]  \n\t"

-        "extp     %[temp3],             $ac3,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

-        "mthi     $zero,                $ac2                            \n\t"

-        "madd     $ac2,                 %[load3],       %[cospi_11_64]  \n\t"

-        "msub     $ac2,                 %[load4],       %[cospi_21_64]  \n\t"

-        "extp     %[temp1],             $ac2,           31              \n\t"

-        "madd     $ac1,                 %[load3],       %[cospi_21_64]  \n\t"

-        "madd     $ac1,                 %[load4],       %[cospi_11_64]  \n\t"

-        "extp     %[temp2],             $ac1,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "sub      %[load1],             %[temp0],       %[temp1]        \n\t"

-        "sub      %[load2],             %[temp3],       %[temp2]        \n\t"

-        "madd     $ac1,                 %[load2],       %[cospi_12_64]  \n\t"

-        "msub     $ac1,                 %[load1],       %[cospi_20_64]  \n\t"

-        "madd     $ac3,                 %[load1],       %[cospi_12_64]  \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_20_64]  \n\t"

-        "extp     %[step1_21],          $ac1,           31              \n\t"

-        "extp     %[step1_26],          $ac3,           31              \n\t"

-        "add      %[step1_20],          %[temp0],       %[temp1]        \n\t"

-        "add      %[step1_27],          %[temp2],       %[temp3]        \n\t"

-        : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),

-          [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

-          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

-          [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21),

-          [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27)

-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

-          [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64),

-          [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64),

-          [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)

-    );

-    __asm__ __volatile__ (

-        "lh       %[load1],             26(%[input])                    \n\t"

-        "lh       %[load2],             38(%[input])                    \n\t"

-        "lh       %[load3],             58(%[input])                    \n\t"

-        "lh       %[load4],              6(%[input])                    \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "madd     $ac1,                 %[load1],       %[cospi_19_64]  \n\t"

-        "msub     $ac1,                 %[load2],       %[cospi_13_64]  \n\t"

-        "extp     %[temp0],             $ac1,           31              \n\t"

-        "madd     $ac3,                 %[load1],       %[cospi_13_64]  \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_19_64]  \n\t"

-        "extp     %[temp3],             $ac3,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

-        "mthi     $zero,                $ac2                            \n\t"

-        "madd     $ac2,                 %[load3],       %[cospi_3_64]   \n\t"

-        "msub     $ac2,                 %[load4],       %[cospi_29_64]  \n\t"

-        "extp     %[temp1],             $ac2,           31              \n\t"

-        "madd     $ac1,                 %[load3],       %[cospi_29_64]  \n\t"

-        "madd     $ac1,                 %[load4],       %[cospi_3_64]   \n\t"

-        "extp     %[temp2],             $ac1,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"

-        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"

-        "msub     $ac1,                 %[load1],       %[cospi_12_64]  \n\t"

-        "msub     $ac1,                 %[load2],       %[cospi_20_64]  \n\t"

-        "msub     $ac3,                 %[load1],       %[cospi_20_64]  \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_12_64]  \n\t"

-        "extp     %[step1_22],          $ac1,           31              \n\t"

-        "extp     %[step1_25],          $ac3,           31              \n\t"

-        "add      %[step1_23],          %[temp0],       %[temp1]        \n\t"

-        "add      %[step1_24],          %[temp2],       %[temp3]        \n\t"

-        : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),

-          [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

-          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

-          [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23),

-          [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25)

-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

-          [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64),

-          [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64),

-          [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)

-    );

-    __asm__ __volatile__ (

-        "lh       %[load1],              4(%[input])                    \n\t"

-        "lh       %[load2],             60(%[input])                    \n\t"

-        "lh       %[load3],             36(%[input])                    \n\t"

-        "lh       %[load4],             28(%[input])                    \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "madd     $ac1,                 %[load1],       %[cospi_30_64]  \n\t"

-        "msub     $ac1,                 %[load2],       %[cospi_2_64]   \n\t"

-        "extp     %[temp0],             $ac1,           31              \n\t"

-        "madd     $ac3,                 %[load1],       %[cospi_2_64]   \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_30_64]  \n\t"

-        "extp     %[temp3],             $ac3,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

-        "mthi     $zero,                $ac2                            \n\t"

-        "madd     $ac2,                 %[load3],       %[cospi_14_64]  \n\t"

-        "msub     $ac2,                 %[load4],       %[cospi_18_64]  \n\t"

-        "extp     %[temp1],             $ac2,           31              \n\t"

-        "madd     $ac1,                 %[load3],       %[cospi_18_64]  \n\t"

-        "madd     $ac1,                 %[load4],       %[cospi_14_64]  \n\t"

-        "extp     %[temp2],             $ac1,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "sub      %[load1],             %[temp0],       %[temp1]        \n\t"

-        "sub      %[load2],             %[temp3],       %[temp2]        \n\t"

-        "msub     $ac1,                 %[load1],       %[cospi_8_64]   \n\t"

-        "madd     $ac1,                 %[load2],       %[cospi_24_64]  \n\t"

-        "madd     $ac3,                 %[load1],       %[cospi_24_64]  \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_8_64]   \n\t"

-        "extp     %[step2_9],           $ac1,           31              \n\t"

-        "extp     %[step2_14],          $ac3,           31              \n\t"

-        "add      %[step2_8],           %[temp0],       %[temp1]        \n\t"

-        "add      %[step2_15],          %[temp2],       %[temp3]        \n\t"

-        : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),

-          [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

-          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

-          [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9),

-          [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15)

-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

-          [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),

-          [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),

-          [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)

-    );

-    __asm__ __volatile__ (

-        "lh       %[load1],             20(%[input])                    \n\t"

-        "lh       %[load2],             44(%[input])                    \n\t"

-        "lh       %[load3],             52(%[input])                    \n\t"

-        "lh       %[load4],             12(%[input])                    \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "madd     $ac1,                 %[load1],       %[cospi_22_64]  \n\t"

-        "msub     $ac1,                 %[load2],       %[cospi_10_64]  \n\t"

-        "extp     %[temp0],             $ac1,           31              \n\t"

-        "madd     $ac3,                 %[load1],       %[cospi_10_64]  \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_22_64]  \n\t"

-        "extp     %[temp3],             $ac3,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

-        "mthi     $zero,                $ac2                            \n\t"

-        "madd     $ac2,                 %[load3],       %[cospi_6_64]   \n\t"

-        "msub     $ac2,                 %[load4],       %[cospi_26_64]  \n\t"

-        "extp     %[temp1],             $ac2,           31              \n\t"

-        "madd     $ac1,                 %[load3],       %[cospi_26_64]  \n\t"

-        "madd     $ac1,                 %[load4],       %[cospi_6_64]   \n\t"

-        "extp     %[temp2],             $ac1,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"

-        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"

-        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"

-        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"

-        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"

-        "extp     %[step2_10],          $ac1,           31              \n\t"

-        "extp     %[step2_13],          $ac3,           31              \n\t"

-        "add      %[step2_11],          %[temp0],       %[temp1]        \n\t"

-        "add      %[step2_12],          %[temp2],       %[temp3]        \n\t"

-        : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),

-          [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

-          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

-          [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),

-          [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)

-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

-          [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),

-          [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),

-          [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)

-    );

-    __asm__ __volatile__ (

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "sub      %[temp0],             %[step2_14],    %[step2_13]     \n\t"

-        "sub      %[temp0],             %[temp0],       %[step2_9]      \n\t"

-        "add      %[temp0],             %[temp0],       %[step2_10]     \n\t"

-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "sub      %[temp1],             %[step2_14],    %[step2_13]     \n\t"

-        "add      %[temp1],             %[temp1],       %[step2_9]      \n\t"

-        "sub      %[temp1],             %[temp1],       %[step2_10]     \n\t"

-        "madd     $ac1,                 %[temp1],       %[cospi_16_64]  \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

-        "mthi     $zero,                $ac2                            \n\t"

-        "sub      %[temp0],             %[step2_15],    %[step2_12]     \n\t"

-        "sub      %[temp0],             %[temp0],       %[step2_8]      \n\t"

-        "add      %[temp0],             %[temp0],       %[step2_11]     \n\t"

-        "madd     $ac2,                 %[temp0],       %[cospi_16_64]  \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "sub      %[temp1],             %[step2_15],    %[step2_12]     \n\t"

-        "add      %[temp1],             %[temp1],       %[step2_8]      \n\t"

-        "sub      %[temp1],             %[temp1],       %[step2_11]     \n\t"

-        "madd     $ac3,                 %[temp1],       %[cospi_16_64]  \n\t"

-        "add      %[step3_8],           %[step2_8],     %[step2_11]     \n\t"

-        "add      %[step3_9],           %[step2_9],     %[step2_10]     \n\t"

-        "add      %[step3_14],          %[step2_13],    %[step2_14]     \n\t"

-        "add      %[step3_15],          %[step2_12],    %[step2_15]     \n\t"

-        "extp     %[step3_10],          $ac0,           31              \n\t"

-        "extp     %[step3_13],          $ac1,           31              \n\t"

-        "extp     %[step3_11],          $ac2,           31              \n\t"

-        "extp     %[step3_12],          $ac3,           31              \n\t"

-        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

-          [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9),

-          [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11),

-          [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13),

-          [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15)

-        : [const_2_power_13] "r" (const_2_power_13), [step2_8] "r" (step2_8),

-          [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),

-          [step2_11] "r" (step2_11), [step2_12] "r" (step2_12),

-          [step2_13] "r" (step2_13), [step2_14] "r" (step2_14),

-          [step2_15] "r" (step2_15), [cospi_16_64] "r" (cospi_16_64)

-    );

-    step2_18 = step1_17 - step1_18;

-    step2_29 = step1_30 - step1_29;

-    __asm__ __volatile__ (

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "msub     $ac0,                 %[step2_18],    %[cospi_8_64]   \n\t"

-        "madd     $ac0,                 %[step2_29],    %[cospi_24_64]  \n\t"

-        "extp     %[step3_18],          $ac0,           31              \n\t"

-        : [step3_18] "=r" (step3_18)

-        : [const_2_power_13] "r" (const_2_power_13),

-          [step2_18] "r" (step2_18), [step2_29] "r" (step2_29),

-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

-    );

-    temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;

-    step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

-    step2_19 = step1_16 - step1_19;

-    step2_28 = step1_31 - step1_28;

-    __asm__ __volatile__ (

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "msub     $ac0,                 %[step2_19],    %[cospi_8_64]   \n\t"

-        "madd     $ac0,                 %[step2_28],    %[cospi_24_64]  \n\t"

-        "extp     %[step3_19],          $ac0,           31              \n\t"

-        : [step3_19] "=r" (step3_19)

-        : [const_2_power_13] "r" (const_2_power_13),

-          [step2_19] "r" (step2_19), [step2_28] "r" (step2_28),

-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

-    );

-    temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;

-    step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

-    step3_16 = step1_16 + step1_19;

-    step3_17 = step1_17 + step1_18;

-    step3_30 = step1_29 + step1_30;

-    step3_31 = step1_28 + step1_31;

-    step2_20 = step1_23 - step1_20;

-    step2_27 = step1_24 - step1_27;

-    __asm__ __volatile__ (

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "msub     $ac0,                 %[step2_20],    %[cospi_24_64]  \n\t"

-        "msub     $ac0,                 %[step2_27],    %[cospi_8_64]   \n\t"

-        "extp     %[step3_20],          $ac0,           31              \n\t"

-        : [step3_20] "=r" (step3_20)

-        : [const_2_power_13] "r" (const_2_power_13),

-          [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),

-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

-    );

-    temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;

-    step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

-    step2_21 = step1_22 - step1_21;

-    step2_26 = step1_25 - step1_26;

-    __asm__ __volatile__ (

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "msub     $ac1,                 %[step2_21],    %[cospi_24_64]  \n\t"

-        "msub     $ac1,                 %[step2_26],    %[cospi_8_64]   \n\t"

-        "extp     %[step3_21],          $ac1,           31              \n\t"

-        : [step3_21] "=r" (step3_21)

-        : [const_2_power_13] "r" (const_2_power_13),

-          [step2_21] "r" (step2_21), [step2_26] "r" (step2_26),

-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

-    );

-    temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;

-    step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

-    step3_22 = step1_21 + step1_22;

-    step3_23 = step1_20 + step1_23;

-    step3_24 = step1_24 + step1_27;

-    step3_25 = step1_25 + step1_26;

-    step2_16 = step3_16 + step3_23;

-    step2_17 = step3_17 + step3_22;

-    step2_18 = step3_18 + step3_21;

-    step2_19 = step3_19 + step3_20;

-    step2_20 = step3_19 - step3_20;

-    step2_21 = step3_18 - step3_21;

-    step2_22 = step3_17 - step3_22;

-    step2_23 = step3_16 - step3_23;

-    step2_24 = step3_31 - step3_24;

-    step2_25 = step3_30 - step3_25;

-    step2_26 = step3_29 - step3_26;

-    step2_27 = step3_28 - step3_27;

-    step2_28 = step3_28 + step3_27;

-    step2_29 = step3_29 + step3_26;

-    step2_30 = step3_30 + step3_25;

-    step2_31 = step3_31 + step3_24;

-    __asm__ __volatile__ (

-        "lh       %[load1],             0(%[input])                     \n\t"

-        "lh       %[load2],             32(%[input])                    \n\t"

-        "lh       %[load3],             16(%[input])                    \n\t"

-        "lh       %[load4],             48(%[input])                    \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

-        "mthi     $zero,                $ac2                            \n\t"

-        "add      %[result1],           %[load1],       %[load2]        \n\t"

-        "sub      %[result2],           %[load1],       %[load2]        \n\t"

-        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"

-        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"

-        "extp     %[temp0],             $ac1,           31              \n\t"

-        "extp     %[temp1],             $ac2,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"

-        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"

-        "extp     %[temp2],             $ac3,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"

-        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"

-        "extp     %[temp3],             $ac1,           31              \n\t"

-        "add      %[step1_0],           %[temp0],       %[temp3]        \n\t"

-        "add      %[step1_1],           %[temp1],       %[temp2]        \n\t"

-        "sub      %[step1_2],           %[temp1],       %[temp2]        \n\t"

-        "sub      %[step1_3],           %[temp0],       %[temp3]        \n\t"

-        : [load1] "=&r" (load1), [load2] "=&r" (load2),

-          [load3] "=&r" (load3), [load4] "=&r" (load4),

-          [result1] "=&r" (result1), [result2] "=&r" (result2),

-          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

-          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

-          [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),

-          [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)

-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),

-          [cospi_16_64] "r" (cospi_16_64)

-    );

-    __asm__ __volatile__ (

-        "lh       %[load1],             8(%[input])                     \n\t"

-        "lh       %[load2],             56(%[input])                    \n\t"

-        "lh       %[load3],             40(%[input])                    \n\t"

-        "lh       %[load4],             24(%[input])                    \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"

-        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"

-        "extp     %[temp0],             $ac1,           31              \n\t"

-        "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"

-        "extp     %[temp3],             $ac3,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

-        "mthi     $zero,                $ac2                            \n\t"

-        "madd     $ac2,                 %[load3],       %[cospi_12_64]  \n\t"

-        "msub     $ac2,                 %[load4],       %[cospi_20_64]  \n\t"

-        "extp     %[temp1],             $ac2,           31              \n\t"

-        "madd     $ac1,                 %[load3],       %[cospi_20_64]  \n\t"

-        "madd     $ac1,                 %[load4],       %[cospi_12_64]  \n\t"

-        "extp     %[temp2],             $ac1,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "sub      %[load1],             %[temp3],       %[temp2]        \n\t"

-        "sub      %[load1],             %[load1],       %[temp0]        \n\t"

-        "add      %[load1],             %[load1],       %[temp1]        \n\t"

-        "sub      %[load2],             %[temp0],       %[temp1]        \n\t"

-        "sub      %[load2],             %[load2],       %[temp2]        \n\t"

-        "add      %[load2],             %[load2],       %[temp3]        \n\t"

-        "madd     $ac1,                 %[load1],       %[cospi_16_64]  \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_16_64]  \n\t"

-        "extp     %[step1_5],           $ac1,           31              \n\t"

-        "extp     %[step1_6],           $ac3,           31              \n\t"

-        "add      %[step1_4],           %[temp0],       %[temp1]        \n\t"

-        "add      %[step1_7],           %[temp3],       %[temp2]        \n\t"

-        : [load1] "=&r" (load1), [load2] "=&r" (load2),

-          [load3] "=&r" (load3), [load4] "=&r" (load4),

-          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

-          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

-          [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),

-          [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)

-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

-          [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),

-          [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),

-          [cospi_16_64] "r" (cospi_16_64)

-    );

-    step2_0 = step1_0 + step1_7;

-    step2_1 = step1_1 + step1_6;

-    step2_2 = step1_2 + step1_5;

-    step2_3 = step1_3 + step1_4;

-    step2_4 = step1_3 - step1_4;

-    step2_5 = step1_2 - step1_5;

-    step2_6 = step1_1 - step1_6;

-    step2_7 = step1_0 - step1_7;

-    // stage 7

-    step1_0 = step2_0 + step3_15;

-    step1_1 = step2_1 + step3_14;

-    step1_2 = step2_2 + step3_13;

-    step1_3 = step2_3 + step3_12;

-    step1_4 = step2_4 + step3_11;

-    step1_5 = step2_5 + step3_10;

-    step1_6 = step2_6 + step3_9;

-    step1_7 = step2_7 + step3_8;

-    step1_8 = step2_7 - step3_8;

-    step1_9 = step2_6 - step3_9;

-    step1_10 = step2_5 - step3_10;

-    step1_11 = step2_4 - step3_11;

-    step1_12 = step2_3 - step3_12;

-    step1_13 = step2_2 - step3_13;

-    step1_14 = step2_1 - step3_14;

-    step1_15 = step2_0 - step3_15;

-    __asm__ __volatile__ (

-        "sub      %[temp0],             %[step2_27],    %[step2_20]     \n\t"

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"

-        "extp     %[step1_20],          $ac0,           31              \n\t"

-        : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20)

-        : [const_2_power_13] "r" (const_2_power_13), [step2_20] "r" (step2_20),

-          [step2_27] "r" (step2_27), [cospi_16_64] "r" (cospi_16_64)

-    );

-    temp21 = (step2_20 + step2_27) * cospi_16_64;

-    step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

-    __asm__ __volatile__ (

-        "sub      %[temp0],             %[step2_26],    %[step2_21]     \n\t"

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"

-        "extp     %[step1_21],          $ac0,           31              \n\t"

-        : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21)

-        : [const_2_power_13] "r" (const_2_power_13), [step2_26] "r" (step2_26),

-          [step2_21] "r" (step2_21), [cospi_16_64] "r" (cospi_16_64)

-    );

-    temp21 = (step2_21 + step2_26) * cospi_16_64;

-    step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

-    __asm__ __volatile__ (

-        "sub      %[temp0],             %[step2_25],    %[step2_22]     \n\t"

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"

-        "extp     %[step1_22],          $ac0,           31              \n\t"

-        : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22)

-        : [const_2_power_13] "r" (const_2_power_13), [step2_25] "r" (step2_25),

-          [step2_22] "r" (step2_22), [cospi_16_64] "r" (cospi_16_64)

-    );

-    temp21 = (step2_22 + step2_25) * cospi_16_64;

-    step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

-    __asm__ __volatile__ (

-        "sub      %[temp0],             %[step2_24],    %[step2_23]     \n\t"

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"

-        "extp     %[step1_23],          $ac0,           31              \n\t"

-        : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23)

-        : [const_2_power_13] "r" (const_2_power_13), [step2_24] "r" (step2_24),

-          [step2_23] "r" (step2_23), [cospi_16_64] "r" (cospi_16_64)

-    );

-    temp21 = (step2_23 + step2_24) * cospi_16_64;

-    step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

-    __asm__ __volatile__ (

-        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"

-        "add      %[temp0],         %[step1_0],         %[step2_31]     \n\t"

-        "addi     %[temp0],         %[temp0],           32              \n\t"

-        "sra      %[temp0],         %[temp0],           6               \n\t"

-        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"

-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

-        "add      %[temp1],         %[step1_1],         %[step2_30]     \n\t"

-        "sb       %[temp0],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"

-        "addi     %[temp1],         %[temp1],           32              \n\t"

-        "sra      %[temp1],         %[temp1],           6               \n\t"

-        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"

-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

-        "sb       %[temp1],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"

-        "add      %[temp0],         %[step1_2],         %[step2_29]     \n\t"

-        "addi     %[temp0],         %[temp0],           32              \n\t"

-        "sra      %[temp0],         %[temp0],           6               \n\t"

-        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"

-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

-        "add      %[temp1],         %[step1_3],         %[step2_28]     \n\t"

-        "sb       %[temp0],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"

-        "addi     %[temp1],         %[temp1],           32              \n\t"

-        "sra      %[temp1],         %[temp1],           6               \n\t"

-        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"

-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

-        "sb       %[temp1],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),

-          [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)

-        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),

-          [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),

-          [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),

-          [step2_28] "r" (step2_28), [step2_29] "r" (step2_29),

-          [step2_30] "r" (step2_30), [step2_31] "r" (step2_31)

-    );

-    step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6);

-    step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6);

-    step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6);

-    step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6);

-    __asm__ __volatile__ (

-        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"

-        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"

-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

-        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"

-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

-        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"

-        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"

-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

-        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"

-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

-        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"

-        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"

-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

-        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"

-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

-        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"

-        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"

-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

-        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"

-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

-        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),

-          [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)

-        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),

-          [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),

-          [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)

-    );

-    __asm__ __volatile__ (

-        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"

-        "add      %[temp0],         %[step1_4],         %[step1_27]     \n\t"

-        "addi     %[temp0],         %[temp0],           32              \n\t"

-        "sra      %[temp0],         %[temp0],           6               \n\t"

-        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"

-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

-        "add      %[temp1],         %[step1_5],         %[step1_26]     \n\t"

-        "sb       %[temp0],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"

-        "addi     %[temp1],         %[temp1],           32              \n\t"

-        "sra      %[temp1],         %[temp1],           6               \n\t"

-        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"

-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

-        "sb       %[temp1],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"

-        "add      %[temp0],         %[step1_6],         %[step1_25]     \n\t"

-        "addi     %[temp0],         %[temp0],           32              \n\t"

-        "sra      %[temp0],         %[temp0],           6               \n\t"

-        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"

-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

-        "add      %[temp1],         %[step1_7],         %[step1_24]     \n\t"

-        "sb       %[temp0],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"

-        "addi     %[temp1],         %[temp1],           32              \n\t"

-        "sra      %[temp1],         %[temp1],           6               \n\t"

-        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"

-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

-        "sb       %[temp1],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),

-          [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)

-        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),

-          [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),

-          [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),

-          [step1_24] "r" (step1_24), [step1_25] "r" (step1_25),

-          [step1_26] "r" (step1_26), [step1_27] "r" (step1_27)

-    );

-    step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6);

-    step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6);

-    step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6);

-    step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6);

-    __asm__ __volatile__ (

-        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"

-        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"

-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

-        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"

-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

-        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"

-        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"

-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

-        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"

-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

-        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"

-        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"

-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

-        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"

-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

-        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"

-        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"

-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

-        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"

-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

-        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),

-          [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)

-        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),

-          [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),

-          [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)

-    );

-    __asm__ __volatile__ (

-        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"

-        "add      %[temp0],         %[step1_8],         %[step1_23]     \n\t"

-        "addi     %[temp0],         %[temp0],           32              \n\t"

-        "sra      %[temp0],         %[temp0],           6               \n\t"

-        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"

-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

-        "add      %[temp1],         %[step1_9],         %[step1_22]     \n\t"

-        "sb       %[temp0],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"

-        "addi     %[temp1],         %[temp1],           32              \n\t"

-        "sra      %[temp1],         %[temp1],           6               \n\t"

-        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"

-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

-        "sb       %[temp1],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"

-        "add      %[temp0],         %[step1_10],        %[step1_21]     \n\t"

-        "addi     %[temp0],         %[temp0],           32              \n\t"

-        "sra      %[temp0],         %[temp0],           6               \n\t"

-        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"

-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

-        "add      %[temp1],         %[step1_11],        %[step1_20]     \n\t"

-        "sb       %[temp0],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"

-        "addi     %[temp1],         %[temp1],           32              \n\t"

-        "sra      %[temp1],         %[temp1],           6               \n\t"

-        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"

-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

-        "sb       %[temp1],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),

-          [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)

-        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),

-          [step1_8] "r" (step1_8), [step1_9] "r" (step1_9),

-          [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),

-          [step1_20] "r" (step1_20), [step1_21] "r" (step1_21),

-          [step1_22] "r" (step1_22), [step1_23] "r" (step1_23)

-    );

-    step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6);

-    step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6);

-    step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6);

-    step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6);

-    __asm__ __volatile__ (

-        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"

-        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"

-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

-        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"

-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

-        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"

-        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"

-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

-        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"

-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

-        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"

-        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"

-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

-        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"

-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

-        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"

-        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"

-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

-        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"

-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

-        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),

-          [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)

-        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),

-          [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),

-          [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)

-    );

-    __asm__ __volatile__ (

-        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"

-        "add      %[temp0],         %[step1_12],        %[step2_19]     \n\t"

-        "addi     %[temp0],         %[temp0],           32              \n\t"

-        "sra      %[temp0],         %[temp0],           6               \n\t"

-        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"

-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

-        "add      %[temp1],         %[step1_13],        %[step2_18]     \n\t"

-        "sb       %[temp0],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"

-        "addi     %[temp1],         %[temp1],           32              \n\t"

-        "sra      %[temp1],         %[temp1],           6               \n\t"

-        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"

-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

-        "sb       %[temp1],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"

-        "add      %[temp0],         %[step1_14],        %[step2_17]     \n\t"

-        "addi     %[temp0],         %[temp0],           32              \n\t"

-        "sra      %[temp0],         %[temp0],           6               \n\t"

-        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"

-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

-        "add      %[temp1],         %[step1_15],        %[step2_16]     \n\t"

-        "sb       %[temp0],         0(%[dest_pix])                      \n\t"

-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

-        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"

-        "addi     %[temp1],         %[temp1],           32              \n\t"

-        "sra      %[temp1],         %[temp1],           6               \n\t"

-        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"

-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

-        "sb       %[temp1],         0(%[dest_pix])                      \n\t"

-        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),

-          [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)

-        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),

-          [step1_12] "r" (step1_12), [step1_13] "r" (step1_13),

-          [step1_14] "r" (step1_14), [step1_15] "r" (step1_15),

-          [step2_16] "r" (step2_16), [step2_17] "r" (step2_17),

-          [step2_18] "r" (step2_18), [step2_19] "r" (step2_19)

-    );

-    step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6);

-    step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6);

-    step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6);

-    step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6);

-    __asm__ __volatile__ (

-        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"

-        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"

-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

-        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"

-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

-        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"

-        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"

-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

-        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"

-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

-        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"

-        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"

-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

-        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"

-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

-        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"

-        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"

-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

-        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"

-        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),

-          [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)

-        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),

-          [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),

-          [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)

-    );

-    input += 32;

-  }

-}

-#endif  // #if HAVE_DSPR2

--- a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c

+++ /dev/null

@@ -1,1076 +1,0 @@

-/*

- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#include <assert.h>

-#include <stdio.h>

-#include "./vpx_config.h"

-#include "./vp9_rtcd.h"

-#include "vp9/common/vp9_common.h"

-#include "vp9/common/vp9_blockd.h"

-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"

-#include "vpx_dsp/txfm_common.h"

-#if HAVE_DSPR2

-static void idct32_rows_dspr2(const int16_t *input, int16_t *output,

-                              uint32_t no_rows) {

-  int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;

-  int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;

-  int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;

-  int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27;

-  int16_t step1_28, step1_29, step1_30, step1_31;

-  int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;

-  int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;

-  int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;

-  int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;

-  int16_t step2_28, step2_29, step2_30, step2_31;

-  int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;

-  int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;

-  int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28;

-  int16_t step3_29, step3_30, step3_31;

-  int temp0, temp1, temp2, temp3;

-  int load1, load2, load3, load4;

-  int result1, result2;

-  int temp21;

-  int i;

-  const int const_2_power_13 = 8192;

-  const int32_t *input_int;

-  for (i = no_rows; i--; ) {

-    input_int = (const int32_t *)input;

-    if (!(input_int[0]  | input_int[1]  | input_int[2]  | input_int[3]  |

-          input_int[4]  | input_int[5]  | input_int[6]  | input_int[7]  |

-          input_int[8]  | input_int[9]  | input_int[10] | input_int[11] |

-          input_int[12] | input_int[13] | input_int[14] | input_int[15])) {

-      input += 32;

-      __asm__ __volatile__ (

-          "sh     $zero,     0(%[output])     \n\t"

-          "sh     $zero,    64(%[output])     \n\t"

-          "sh     $zero,   128(%[output])     \n\t"

-          "sh     $zero,   192(%[output])     \n\t"

-          "sh     $zero,   256(%[output])     \n\t"

-          "sh     $zero,   320(%[output])     \n\t"

-          "sh     $zero,   384(%[output])     \n\t"

-          "sh     $zero,   448(%[output])     \n\t"

-          "sh     $zero,   512(%[output])     \n\t"

-          "sh     $zero,   576(%[output])     \n\t"

-          "sh     $zero,   640(%[output])     \n\t"

-          "sh     $zero,   704(%[output])     \n\t"

-          "sh     $zero,   768(%[output])     \n\t"

-          "sh     $zero,   832(%[output])     \n\t"

-          "sh     $zero,   896(%[output])     \n\t"

-          "sh     $zero,   960(%[output])     \n\t"

-          "sh     $zero,  1024(%[output])     \n\t"

-          "sh     $zero,  1088(%[output])     \n\t"

-          "sh     $zero,  1152(%[output])     \n\t"

-          "sh     $zero,  1216(%[output])     \n\t"

-          "sh     $zero,  1280(%[output])     \n\t"

-          "sh     $zero,  1344(%[output])     \n\t"

-          "sh     $zero,  1408(%[output])     \n\t"

-          "sh     $zero,  1472(%[output])     \n\t"

-          "sh     $zero,  1536(%[output])     \n\t"

-          "sh     $zero,  1600(%[output])     \n\t"

-          "sh     $zero,  1664(%[output])     \n\t"

-          "sh     $zero,  1728(%[output])     \n\t"

-          "sh     $zero,  1792(%[output])     \n\t"

-          "sh     $zero,  1856(%[output])     \n\t"

-          "sh     $zero,  1920(%[output])     \n\t"

-          "sh     $zero,  1984(%[output])     \n\t"

-          :

-          : [output] "r" (output)

-      );

-      output += 1;

-      continue;

-    }

-    /* prefetch row */

-    prefetch_load((const uint8_t *)(input + 32));

-    prefetch_load((const uint8_t *)(input + 48));

-    __asm__ __volatile__ (

-        "lh       %[load1],             2(%[input])                     \n\t"

-        "lh       %[load2],             62(%[input])                    \n\t"

-        "lh       %[load3],             34(%[input])                    \n\t"

-        "lh       %[load4],             30(%[input])                    \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "madd     $ac1,                 %[load1],       %[cospi_31_64]  \n\t"

-        "msub     $ac1,                 %[load2],       %[cospi_1_64]   \n\t"

-        "extp     %[temp0],             $ac1,           31              \n\t"

-        "madd     $ac3,                 %[load1],       %[cospi_1_64]   \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_31_64]  \n\t"

-        "extp     %[temp3],             $ac3,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

-        "mthi     $zero,                $ac2                            \n\t"

-        "madd     $ac2,                 %[load3],       %[cospi_15_64]  \n\t"

-        "msub     $ac2,                 %[load4],       %[cospi_17_64]  \n\t"

-        "extp     %[temp1],             $ac2,           31              \n\t"

-        "madd     $ac1,                 %[load3],       %[cospi_17_64]  \n\t"

-        "madd     $ac1,                 %[load4],       %[cospi_15_64]  \n\t"

-        "extp     %[temp2],             $ac1,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "sub      %[load1],             %[temp3],       %[temp2]        \n\t"

-        "sub      %[load2],             %[temp0],       %[temp1]        \n\t"

-        "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"

-        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"

-        "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"

-        "extp     %[step1_17],          $ac1,           31              \n\t"

-        "extp     %[step1_30],          $ac3,           31              \n\t"

-        "add      %[step1_16],          %[temp0],       %[temp1]        \n\t"

-        "add      %[step1_31],          %[temp2],       %[temp3]        \n\t"

-        : [load1] "=&r" (load1), [load2] "=&r" (load2),

-          [load3] "=&r" (load3), [load4] "=&r" (load4),

-          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

-          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

-          [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17),

-          [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31)

-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

-          [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64),

-          [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64),

-          [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64)

-    );

-    __asm__ __volatile__ (

-        "lh       %[load1],             18(%[input])                    \n\t"

-        "lh       %[load2],             46(%[input])                    \n\t"

-        "lh       %[load3],             50(%[input])                    \n\t"

-        "lh       %[load4],             14(%[input])                    \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "madd     $ac1,                 %[load1],       %[cospi_23_64]  \n\t"

-        "msub     $ac1,                 %[load2],       %[cospi_9_64]   \n\t"

-        "extp     %[temp0],             $ac1,           31              \n\t"

-        "madd     $ac3,                 %[load1],       %[cospi_9_64]   \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_23_64]  \n\t"

-        "extp     %[temp3],             $ac3,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

-        "mthi     $zero,                $ac2                            \n\t"

-        "madd     $ac2,                 %[load3],       %[cospi_7_64]   \n\t"

-        "msub     $ac2,                 %[load4],       %[cospi_25_64]  \n\t"

-        "extp     %[temp1],             $ac2,           31              \n\t"

-        "madd     $ac1,                 %[load3],       %[cospi_25_64]  \n\t"

-        "madd     $ac1,                 %[load4],       %[cospi_7_64]   \n\t"

-        "extp     %[temp2],             $ac1,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"

-        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"

-        "msub     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"

-        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"

-        "msub     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"

-        "extp     %[step1_18],          $ac1,           31              \n\t"

-        "extp     %[step1_29],          $ac3,           31              \n\t"

-        "add      %[step1_19],          %[temp0],       %[temp1]        \n\t"

-        "add      %[step1_28],          %[temp2],       %[temp3]        \n\t"

-        : [load1] "=&r" (load1), [load2] "=&r" (load2),

-          [load3] "=&r" (load3), [load4] "=&r" (load4),

-          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

-          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

-          [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19),

-          [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29)

-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

-          [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64),

-          [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64),

-          [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64)

-    );

-    __asm__ __volatile__ (

-        "lh       %[load1],             10(%[input])                    \n\t"

-        "lh       %[load2],             54(%[input])                    \n\t"

-        "lh       %[load3],             42(%[input])                    \n\t"

-        "lh       %[load4],             22(%[input])                    \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "madd     $ac1,                 %[load1],       %[cospi_27_64]  \n\t"

-        "msub     $ac1,                 %[load2],       %[cospi_5_64]   \n\t"

-        "extp     %[temp0],             $ac1,           31              \n\t"

-        "madd     $ac3,                 %[load1],       %[cospi_5_64]   \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_27_64]  \n\t"

-        "extp     %[temp3],             $ac3,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

-        "mthi     $zero,                $ac2                            \n\t"

-        "madd     $ac2,                 %[load3],       %[cospi_11_64]  \n\t"

-        "msub     $ac2,                 %[load4],       %[cospi_21_64]  \n\t"

-        "extp     %[temp1],             $ac2,           31              \n\t"

-        "madd     $ac1,                 %[load3],       %[cospi_21_64]  \n\t"

-        "madd     $ac1,                 %[load4],       %[cospi_11_64]  \n\t"

-        "extp     %[temp2],             $ac1,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "sub      %[load1],             %[temp0],       %[temp1]        \n\t"

-        "sub      %[load2],             %[temp3],       %[temp2]        \n\t"

-        "madd     $ac1,                 %[load2],       %[cospi_12_64]  \n\t"

-        "msub     $ac1,                 %[load1],       %[cospi_20_64]  \n\t"

-        "madd     $ac3,                 %[load1],       %[cospi_12_64]  \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_20_64]  \n\t"

-        "extp     %[step1_21],          $ac1,           31              \n\t"

-        "extp     %[step1_26],          $ac3,           31              \n\t"

-        "add      %[step1_20],          %[temp0],       %[temp1]        \n\t"

-        "add      %[step1_27],          %[temp2],       %[temp3]        \n\t"

-        : [load1] "=&r" (load1), [load2] "=&r" (load2),

-          [load3] "=&r" (load3), [load4] "=&r" (load4),

-          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

-          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

-          [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21),

-          [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27)

-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

-          [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64),

-          [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64),

-          [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)

-    );

-    __asm__ __volatile__ (

-        "lh       %[load1],             26(%[input])                    \n\t"

-        "lh       %[load2],             38(%[input])                    \n\t"

-        "lh       %[load3],             58(%[input])                    \n\t"

-        "lh       %[load4],              6(%[input])                    \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "madd     $ac1,                 %[load1],       %[cospi_19_64]  \n\t"

-        "msub     $ac1,                 %[load2],       %[cospi_13_64]  \n\t"

-        "extp     %[temp0],             $ac1,           31              \n\t"

-        "madd     $ac3,                 %[load1],       %[cospi_13_64]  \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_19_64]  \n\t"

-        "extp     %[temp3],             $ac3,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

-        "mthi     $zero,                $ac2                            \n\t"

-        "madd     $ac2,                 %[load3],       %[cospi_3_64]   \n\t"

-        "msub     $ac2,                 %[load4],       %[cospi_29_64]  \n\t"

-        "extp     %[temp1],             $ac2,           31              \n\t"

-        "madd     $ac1,                 %[load3],       %[cospi_29_64]  \n\t"

-        "madd     $ac1,                 %[load4],       %[cospi_3_64]   \n\t"

-        "extp     %[temp2],             $ac1,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"

-        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"

-        "msub     $ac1,                 %[load1],       %[cospi_12_64]  \n\t"

-        "msub     $ac1,                 %[load2],       %[cospi_20_64]  \n\t"

-        "msub     $ac3,                 %[load1],       %[cospi_20_64]  \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_12_64]  \n\t"

-        "extp     %[step1_22],          $ac1,           31              \n\t"

-        "extp     %[step1_25],          $ac3,           31              \n\t"

-        "add      %[step1_23],          %[temp0],       %[temp1]        \n\t"

-        "add      %[step1_24],          %[temp2],       %[temp3]        \n\t"

-        : [load1] "=&r" (load1), [load2] "=&r" (load2),

-          [load3] "=&r" (load3), [load4] "=&r" (load4),

-          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

-          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

-          [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23),

-          [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25)

-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

-          [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64),

-          [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64),

-          [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)

-    );

-    __asm__ __volatile__ (

-        "lh       %[load1],              4(%[input])                    \n\t"

-        "lh       %[load2],             60(%[input])                    \n\t"

-        "lh       %[load3],             36(%[input])                    \n\t"

-        "lh       %[load4],             28(%[input])                    \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "madd     $ac1,                 %[load1],       %[cospi_30_64]  \n\t"

-        "msub     $ac1,                 %[load2],       %[cospi_2_64]   \n\t"

-        "extp     %[temp0],             $ac1,           31              \n\t"

-        "madd     $ac3,                 %[load1],       %[cospi_2_64]   \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_30_64]  \n\t"

-        "extp     %[temp3],             $ac3,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

-        "mthi     $zero,                $ac2                            \n\t"

-        "madd     $ac2,                 %[load3],       %[cospi_14_64]  \n\t"

-        "msub     $ac2,                 %[load4],       %[cospi_18_64]  \n\t"

-        "extp     %[temp1],             $ac2,           31              \n\t"

-        "madd     $ac1,                 %[load3],       %[cospi_18_64]  \n\t"

-        "madd     $ac1,                 %[load4],       %[cospi_14_64]  \n\t"

-        "extp     %[temp2],             $ac1,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "sub      %[load1],             %[temp0],       %[temp1]        \n\t"

-        "sub      %[load2],             %[temp3],       %[temp2]        \n\t"

-        "msub     $ac1,                 %[load1],       %[cospi_8_64]   \n\t"

-        "madd     $ac1,                 %[load2],       %[cospi_24_64]  \n\t"

-        "madd     $ac3,                 %[load1],       %[cospi_24_64]  \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_8_64]   \n\t"

-        "extp     %[step2_9],           $ac1,           31              \n\t"

-        "extp     %[step2_14],          $ac3,           31              \n\t"

-        "add      %[step2_8],           %[temp0],       %[temp1]        \n\t"

-        "add      %[step2_15],          %[temp2],       %[temp3]        \n\t"

-        : [load1] "=&r" (load1), [load2] "=&r" (load2),

-          [load3] "=&r" (load3), [load4] "=&r" (load4),

-          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

-          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

-          [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9),

-          [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15)

-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

-          [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),

-          [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),

-          [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)

-    );

-    __asm__ __volatile__ (

-        "lh       %[load1],             20(%[input])                    \n\t"

-        "lh       %[load2],             44(%[input])                    \n\t"

-        "lh       %[load3],             52(%[input])                    \n\t"

-        "lh       %[load4],             12(%[input])                    \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "madd     $ac1,                 %[load1],       %[cospi_22_64]  \n\t"

-        "msub     $ac1,                 %[load2],       %[cospi_10_64]  \n\t"

-        "extp     %[temp0],             $ac1,           31              \n\t"

-        "madd     $ac3,                 %[load1],       %[cospi_10_64]  \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_22_64]  \n\t"

-        "extp     %[temp3],             $ac3,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

-        "mthi     $zero,                $ac2                            \n\t"

-        "madd     $ac2,                 %[load3],       %[cospi_6_64]   \n\t"

-        "msub     $ac2,                 %[load4],       %[cospi_26_64]  \n\t"

-        "extp     %[temp1],             $ac2,           31              \n\t"

-        "madd     $ac1,                 %[load3],       %[cospi_26_64]  \n\t"

-        "madd     $ac1,                 %[load4],       %[cospi_6_64]   \n\t"

-        "extp     %[temp2],             $ac1,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"

-        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"

-        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"

-        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"

-        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"

-        "extp     %[step2_10],          $ac1,           31              \n\t"

-        "extp     %[step2_13],          $ac3,           31              \n\t"

-        "add      %[step2_11],          %[temp0],       %[temp1]        \n\t"

-        "add      %[step2_12],          %[temp2],       %[temp3]        \n\t"

-        : [load1] "=&r" (load1), [load2] "=&r" (load2),

-          [load3] "=&r" (load3), [load4] "=&r" (load4),

-          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

-          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

-          [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),

-          [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)

-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

-          [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),

-          [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),

-          [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)

-    );

-    __asm__ __volatile__ (

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "sub      %[temp0],             %[step2_14],    %[step2_13]     \n\t"

-        "sub      %[temp0],             %[temp0],       %[step2_9]      \n\t"

-        "add      %[temp0],             %[temp0],       %[step2_10]     \n\t"

-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "sub      %[temp1],             %[step2_14],    %[step2_13]     \n\t"

-        "add      %[temp1],             %[temp1],       %[step2_9]      \n\t"

-        "sub      %[temp1],             %[temp1],       %[step2_10]     \n\t"

-        "madd     $ac1,                 %[temp1],       %[cospi_16_64]  \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

-        "mthi     $zero,                $ac2                            \n\t"

-        "sub      %[temp0],             %[step2_15],    %[step2_12]     \n\t"

-        "sub      %[temp0],             %[temp0],       %[step2_8]      \n\t"

-        "add      %[temp0],             %[temp0],       %[step2_11]     \n\t"

-        "madd     $ac2,                 %[temp0],       %[cospi_16_64]  \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "sub      %[temp1],             %[step2_15],    %[step2_12]     \n\t"

-        "add      %[temp1],             %[temp1],       %[step2_8]      \n\t"

-        "sub      %[temp1],             %[temp1],       %[step2_11]     \n\t"

-        "madd     $ac3,                 %[temp1],       %[cospi_16_64]  \n\t"

-        "add      %[step3_8],           %[step2_8],     %[step2_11]     \n\t"

-        "add      %[step3_9],           %[step2_9],     %[step2_10]     \n\t"

-        "add      %[step3_14],          %[step2_13],    %[step2_14]     \n\t"

-        "add      %[step3_15],          %[step2_12],    %[step2_15]     \n\t"

-        "extp     %[step3_10],          $ac0,           31              \n\t"

-        "extp     %[step3_13],          $ac1,           31              \n\t"

-        "extp     %[step3_11],          $ac2,           31              \n\t"

-        "extp     %[step3_12],          $ac3,           31              \n\t"

-        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

-          [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9),

-          [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11),

-          [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13),

-          [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15)

-        : [const_2_power_13] "r" (const_2_power_13),

-          [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),

-          [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),

-          [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),

-          [step2_14] "r" (step2_14), [step2_15] "r" (step2_15),

-          [cospi_16_64] "r" (cospi_16_64)

-    );

-    step2_18 = step1_17 - step1_18;

-    step2_29 = step1_30 - step1_29;

-    __asm__ __volatile__ (

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "msub     $ac0,                 %[step2_18],    %[cospi_8_64]   \n\t"

-        "madd     $ac0,                 %[step2_29],    %[cospi_24_64]  \n\t"

-        "extp     %[step3_18],          $ac0,           31              \n\t"

-        : [step3_18] "=r" (step3_18)

-        : [const_2_power_13] "r" (const_2_power_13),

-          [step2_18] "r" (step2_18), [step2_29] "r" (step2_29),

-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

-    );

-    temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;

-    step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

-    step2_19 = step1_16 - step1_19;

-    step2_28 = step1_31 - step1_28;

-    __asm__ __volatile__ (

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "msub     $ac0,                 %[step2_19],    %[cospi_8_64]   \n\t"

-        "madd     $ac0,                 %[step2_28],    %[cospi_24_64]  \n\t"

-        "extp     %[step3_19],          $ac0,           31              \n\t"

-        : [step3_19] "=r" (step3_19)

-        : [const_2_power_13] "r" (const_2_power_13),

-          [step2_19] "r" (step2_19), [step2_28] "r" (step2_28),

-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

-    );

-    temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;

-    step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

-    step3_16 = step1_16 + step1_19;

-    step3_17 = step1_17 + step1_18;

-    step3_30 = step1_29 + step1_30;

-    step3_31 = step1_28 + step1_31;

-    step2_20 = step1_23 - step1_20;

-    step2_27 = step1_24 - step1_27;

-    __asm__ __volatile__ (

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "msub     $ac0,                 %[step2_20],    %[cospi_24_64]  \n\t"

-        "msub     $ac0,                 %[step2_27],    %[cospi_8_64]   \n\t"

-        "extp     %[step3_20],          $ac0,           31              \n\t"

-        : [step3_20] "=r" (step3_20)

-        : [const_2_power_13] "r" (const_2_power_13),

-          [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),

-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

-    );

-    temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;

-    step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

-    step2_21 = step1_22 - step1_21;

-    step2_26 = step1_25 - step1_26;

-    __asm__ __volatile__ (

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "msub     $ac1,                 %[step2_21],    %[cospi_24_64]  \n\t"

-        "msub     $ac1,                 %[step2_26],    %[cospi_8_64]   \n\t"

-        "extp     %[step3_21],          $ac1,           31              \n\t"

-        : [step3_21] "=r" (step3_21)

-        : [const_2_power_13] "r" (const_2_power_13),

-          [step2_21] "r" (step2_21), [step2_26] "r" (step2_26),

-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

-    );

-    temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;

-    step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

-    step3_22 = step1_21 + step1_22;

-    step3_23 = step1_20 + step1_23;

-    step3_24 = step1_24 + step1_27;

-    step3_25 = step1_25 + step1_26;

-    step2_16 = step3_16 + step3_23;

-    step2_17 = step3_17 + step3_22;

-    step2_18 = step3_18 + step3_21;

-    step2_19 = step3_19 + step3_20;

-    step2_20 = step3_19 - step3_20;

-    step2_21 = step3_18 - step3_21;

-    step2_22 = step3_17 - step3_22;

-    step2_23 = step3_16 - step3_23;

-    step2_24 = step3_31 - step3_24;

-    step2_25 = step3_30 - step3_25;

-    step2_26 = step3_29 - step3_26;

-    step2_27 = step3_28 - step3_27;

-    step2_28 = step3_28 + step3_27;

-    step2_29 = step3_29 + step3_26;

-    step2_30 = step3_30 + step3_25;

-    step2_31 = step3_31 + step3_24;

-    __asm__ __volatile__ (

-        "lh       %[load1],             0(%[input])                     \n\t"

-        "lh       %[load2],             32(%[input])                    \n\t"

-        "lh       %[load3],             16(%[input])                    \n\t"

-        "lh       %[load4],             48(%[input])                    \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

-        "mthi     $zero,                $ac2                            \n\t"

-        "add      %[result1],           %[load1],       %[load2]        \n\t"

-        "sub      %[result2],           %[load1],       %[load2]        \n\t"

-        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"

-        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"

-        "extp     %[temp0],             $ac1,           31              \n\t"

-        "extp     %[temp1],             $ac2,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"

-        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"

-        "extp     %[temp2],             $ac3,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"

-        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"

-        "extp     %[temp3],             $ac1,           31              \n\t"

-        "add      %[step1_0],          %[temp0],        %[temp3]        \n\t"

-        "add      %[step1_1],          %[temp1],        %[temp2]        \n\t"

-        "sub      %[step1_2],          %[temp1],        %[temp2]        \n\t"

-        "sub      %[step1_3],          %[temp0],        %[temp3]        \n\t"

-        : [load1] "=&r" (load1), [load2] "=&r" (load2),

-          [load3] "=&r" (load3), [load4] "=&r" (load4),

-          [result1] "=&r" (result1), [result2] "=&r" (result2),

-          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

-          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

-          [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),

-          [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)

-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

-          [cospi_16_64] "r" (cospi_16_64),

-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

-    );

-    __asm__ __volatile__ (

-        "lh       %[load1],             8(%[input])                     \n\t"

-        "lh       %[load2],             56(%[input])                    \n\t"

-        "lh       %[load3],             40(%[input])                    \n\t"

-        "lh       %[load4],             24(%[input])                    \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"

-        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"

-        "extp     %[temp0],             $ac1,           31              \n\t"

-        "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"

-        "extp     %[temp3],             $ac3,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

-        "mthi     $zero,                $ac2                            \n\t"

-        "madd     $ac2,                 %[load3],       %[cospi_12_64]  \n\t"

-        "msub     $ac2,                 %[load4],       %[cospi_20_64]  \n\t"

-        "extp     %[temp1],             $ac2,           31              \n\t"

-        "madd     $ac1,                 %[load3],       %[cospi_20_64]  \n\t"

-        "madd     $ac1,                 %[load4],       %[cospi_12_64]  \n\t"

-        "extp     %[temp2],             $ac1,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

-        "mthi     $zero,                $ac3                            \n\t"

-        "sub      %[load1],             %[temp3],       %[temp2]        \n\t"

-        "sub      %[load1],             %[load1],       %[temp0]        \n\t"

-        "add      %[load1],             %[load1],       %[temp1]        \n\t"

-        "sub      %[load2],             %[temp0],       %[temp1]        \n\t"

-        "sub      %[load2],             %[load2],       %[temp2]        \n\t"

-        "add      %[load2],             %[load2],       %[temp3]        \n\t"

-        "madd     $ac1,                 %[load1],       %[cospi_16_64]  \n\t"

-        "madd     $ac3,                 %[load2],       %[cospi_16_64]  \n\t"

-        "extp     %[step1_5],           $ac1,           31              \n\t"

-        "extp     %[step1_6],           $ac3,           31              \n\t"

-        "add      %[step1_4],           %[temp0],       %[temp1]        \n\t"

-        "add      %[step1_7],           %[temp3],       %[temp2]        \n\t"

-        : [load1] "=&r" (load1), [load2] "=&r" (load2),

-          [load3] "=&r" (load3), [load4] "=&r" (load4),

-          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

-          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

-          [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),

-          [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)

-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

-          [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),

-          [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),

-          [cospi_16_64] "r" (cospi_16_64)

-    );

-    step2_0 = step1_0 + step1_7;

-    step2_1 = step1_1 + step1_6;

-    step2_2 = step1_2 + step1_5;

-    step2_3 = step1_3 + step1_4;

-    step2_4 = step1_3 - step1_4;

-    step2_5 = step1_2 - step1_5;

-    step2_6 = step1_1 - step1_6;

-    step2_7 = step1_0 - step1_7;

-    step1_0 = step2_0 + step3_15;

-    step1_1 = step2_1 + step3_14;

-    step1_2 = step2_2 + step3_13;

-    step1_3 = step2_3 + step3_12;

-    step1_4 = step2_4 + step3_11;

-    step1_5 = step2_5 + step3_10;

-    step1_6 = step2_6 + step3_9;

-    step1_7 = step2_7 + step3_8;

-    step1_8 = step2_7 - step3_8;

-    step1_9 = step2_6 - step3_9;

-    step1_10 = step2_5 - step3_10;

-    step1_11 = step2_4 - step3_11;

-    step1_12 = step2_3 - step3_12;

-    step1_13 = step2_2 - step3_13;

-    step1_14 = step2_1 - step3_14;

-    step1_15 = step2_0 - step3_15;

-    __asm__ __volatile__ (

-        "sub      %[temp0],             %[step2_27],    %[step2_20]     \n\t"

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"

-        "extp     %[step1_20],          $ac0,           31              \n\t"

-        : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20)

-        : [const_2_power_13] "r" (const_2_power_13),

-          [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),

-          [cospi_16_64] "r" (cospi_16_64)

-    );

-    temp21 = (step2_20 + step2_27) * cospi_16_64;

-    step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

-    __asm__ __volatile__ (

-        "sub      %[temp0],             %[step2_26],    %[step2_21]     \n\t"

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"

-        "extp     %[step1_21],          $ac0,           31              \n\t"

-        : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21)

-        : [const_2_power_13] "r" (const_2_power_13),

-          [step2_26] "r" (step2_26), [step2_21] "r" (step2_21),

-          [cospi_16_64] "r" (cospi_16_64)

-    );

-    temp21 = (step2_21 + step2_26) * cospi_16_64;

-    step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

-    __asm__ __volatile__ (

-        "sub      %[temp0],             %[step2_25],    %[step2_22]     \n\t"

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"

-        "extp     %[step1_22],          $ac0,           31              \n\t"

-        : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22)

-        : [const_2_power_13] "r" (const_2_power_13),

-          [step2_25] "r" (step2_25), [step2_22] "r" (step2_22),

-          [cospi_16_64] "r" (cospi_16_64)

-    );

-    temp21 = (step2_22 + step2_25) * cospi_16_64;

-    step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

-    __asm__ __volatile__ (

-        "sub      %[temp0],             %[step2_24],    %[step2_23]     \n\t"

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"

-        "extp     %[step1_23],          $ac0,           31              \n\t"

-        : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23)

-        : [const_2_power_13] "r" (const_2_power_13),

-          [step2_24] "r" (step2_24), [step2_23] "r" (step2_23),

-          [cospi_16_64] "r" (cospi_16_64)

-    );

-    temp21 = (step2_23 + step2_24) * cospi_16_64;

-    step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

-    // final stage

-    output[0 * 32] = step1_0 + step2_31;

-    output[1 * 32] = step1_1 + step2_30;

-    output[2 * 32] = step1_2 + step2_29;

-    output[3 * 32] = step1_3 + step2_28;

-    output[4 * 32] = step1_4 + step1_27;

-    output[5 * 32] = step1_5 + step1_26;

-    output[6 * 32] = step1_6 + step1_25;

-    output[7 * 32] = step1_7 + step1_24;

-    output[8 * 32] = step1_8 + step1_23;

-    output[9 * 32] = step1_9 + step1_22;

-    output[10 * 32] = step1_10 + step1_21;

-    output[11 * 32] = step1_11 + step1_20;

-    output[12 * 32] = step1_12 + step2_19;

-    output[13 * 32] = step1_13 + step2_18;

-    output[14 * 32] = step1_14 + step2_17;

-    output[15 * 32] = step1_15 + step2_16;

-    output[16 * 32] = step1_15 - step2_16;

-    output[17 * 32] = step1_14 - step2_17;

-    output[18 * 32] = step1_13 - step2_18;

-    output[19 * 32] = step1_12 - step2_19;

-    output[20 * 32] = step1_11 - step1_20;

-    output[21 * 32] = step1_10 - step1_21;

-    output[22 * 32] = step1_9 - step1_22;

-    output[23 * 32] = step1_8 - step1_23;

-    output[24 * 32] = step1_7 - step1_24;

-    output[25 * 32] = step1_6 - step1_25;

-    output[26 * 32] = step1_5 - step1_26;

-    output[27 * 32] = step1_4 - step1_27;

-    output[28 * 32] = step1_3 - step2_28;

-    output[29 * 32] = step1_2 - step2_29;

-    output[30 * 32] = step1_1 - step2_30;

-    output[31 * 32] = step1_0 - step2_31;

-    input += 32;

-    output += 1;

-  }

-}

-void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,

-                                  int dest_stride) {

-  DECLARE_ALIGNED(32, int16_t,  out[32 * 32]);

-  int16_t *outptr = out;

-  uint32_t pos = 45;

-  /* bit positon for extract from acc */

-  __asm__ __volatile__ (

-    "wrdsp      %[pos],     1           \n\t"

-    :

-    : [pos] "r" (pos)

-  );

-  // Rows

-  idct32_rows_dspr2(input, outptr, 32);

-  // Columns

-  vp9_idct32_cols_add_blk_dspr2(out, dest, dest_stride);

-}

-void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,

-                                int stride) {

-  DECLARE_ALIGNED(32, int16_t,  out[32 * 32]);

-  int16_t *outptr = out;

-  uint32_t i;

-  uint32_t pos = 45;

-  /* bit positon for extract from acc */

-  __asm__ __volatile__ (

-    "wrdsp      %[pos],     1           \n\t"

-    :

-    : [pos] "r" (pos)

-  );

-  // Rows

-  idct32_rows_dspr2(input, outptr, 8);

-  outptr += 8;

-  __asm__ __volatile__ (

-      "sw     $zero,      0(%[outptr])     \n\t"

-      "sw     $zero,      4(%[outptr])     \n\t"

-      "sw     $zero,      8(%[outptr])     \n\t"

-      "sw     $zero,     12(%[outptr])     \n\t"

-      "sw     $zero,     16(%[outptr])     \n\t"

-      "sw     $zero,     20(%[outptr])     \n\t"

-      "sw     $zero,     24(%[outptr])     \n\t"

-      "sw     $zero,     28(%[outptr])     \n\t"

-      "sw     $zero,     32(%[outptr])     \n\t"

-      "sw     $zero,     36(%[outptr])     \n\t"

-      "sw     $zero,     40(%[outptr])     \n\t"

-      "sw     $zero,     44(%[outptr])     \n\t"

-      :

-      : [outptr] "r" (outptr)

-  );

-  for (i = 0; i < 31; ++i) {

-    outptr += 32;

-    __asm__ __volatile__ (

-        "sw     $zero,      0(%[outptr])     \n\t"

-        "sw     $zero,      4(%[outptr])     \n\t"

-        "sw     $zero,      8(%[outptr])     \n\t"

-        "sw     $zero,     12(%[outptr])     \n\t"

-        "sw     $zero,     16(%[outptr])     \n\t"

-        "sw     $zero,     20(%[outptr])     \n\t"

-        "sw     $zero,     24(%[outptr])     \n\t"

-        "sw     $zero,     28(%[outptr])     \n\t"

-        "sw     $zero,     32(%[outptr])     \n\t"

-        "sw     $zero,     36(%[outptr])     \n\t"

-        "sw     $zero,     40(%[outptr])     \n\t"

-        "sw     $zero,     44(%[outptr])     \n\t"

-        :

-        : [outptr] "r" (outptr)

-    );

-  }

-  // Columns

-  vp9_idct32_cols_add_blk_dspr2(out, dest, stride);

-}

-void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,

-                               int stride) {

-  int       r, out;

-  int32_t   a1, absa1;

-  int32_t   vector_a1;

-  int32_t   t1, t2, t3, t4;

-  int32_t   vector_1, vector_2, vector_3, vector_4;

-  uint32_t  pos = 45;

-  /* bit positon for extract from acc */

-  __asm__ __volatile__ (

-    "wrdsp      %[pos],     1           \n\t"

-    :

-    : [pos] "r" (pos)

-  );

-  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);

-  __asm__ __volatile__ (

-      "addi     %[out],    %[out],    32      \n\t"

-      "sra      %[a1],     %[out],    6       \n\t"

-      : [out] "+r" (out), [a1] "=r" (a1)

-      :

-  );

-  if (a1 < 0) {

-    /* use quad-byte

-     * input and output memory are four byte aligned */

-    __asm__ __volatile__ (

-        "abs        %[absa1],     %[a1]         \n\t"

-        "replv.qb   %[vector_a1], %[absa1]      \n\t"

-        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)

-        : [a1] "r" (a1)

-    );

-    for (r = 32; r--;) {

-      __asm__ __volatile__ (

-          "lw             %[t1],          0(%[dest])                      \n\t"

-          "lw             %[t2],          4(%[dest])                      \n\t"

-          "lw             %[t3],          8(%[dest])                      \n\t"

-          "lw             %[t4],          12(%[dest])                     \n\t"

-          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"

-          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"

-          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"

-          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"

-          "sw             %[vector_1],    0(%[dest])                      \n\t"

-          "sw             %[vector_2],    4(%[dest])                      \n\t"

-          "sw             %[vector_3],    8(%[dest])                      \n\t"

-          "sw             %[vector_4],    12(%[dest])                     \n\t"

-          "lw             %[t1],          16(%[dest])                     \n\t"

-          "lw             %[t2],          20(%[dest])                     \n\t"

-          "lw             %[t3],          24(%[dest])                     \n\t"

-          "lw             %[t4],          28(%[dest])                     \n\t"

-          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"

-          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"

-          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"

-          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"

-          "sw             %[vector_1],    16(%[dest])                     \n\t"

-          "sw             %[vector_2],    20(%[dest])                     \n\t"

-          "sw             %[vector_3],    24(%[dest])                     \n\t"

-          "sw             %[vector_4],    28(%[dest])                     \n\t"

-          "add            %[dest],        %[dest],        %[stride]       \n\t"

-          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),

-            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),

-            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),

-            [dest] "+&r" (dest)

-          : [stride] "r" (stride), [vector_a1] "r" (vector_a1)

-      );

-    }

-  } else {

-    /* use quad-byte

-     * input and output memory are four byte aligned */

-    __asm__ __volatile__ (

-        "replv.qb       %[vector_a1],   %[a1]     \n\t"

-        : [vector_a1] "=r" (vector_a1)

-        : [a1] "r" (a1)

-    );

-    for (r = 32; r--;) {

-      __asm__ __volatile__ (

-          "lw             %[t1],          0(%[dest])                      \n\t"

-          "lw             %[t2],          4(%[dest])                      \n\t"

-          "lw             %[t3],          8(%[dest])                      \n\t"

-          "lw             %[t4],          12(%[dest])                     \n\t"

-          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"

-          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"

-          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"

-          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"

-          "sw             %[vector_1],    0(%[dest])                      \n\t"

-          "sw             %[vector_2],    4(%[dest])                      \n\t"

-          "sw             %[vector_3],    8(%[dest])                      \n\t"

-          "sw             %[vector_4],    12(%[dest])                     \n\t"

-          "lw             %[t1],          16(%[dest])                     \n\t"

-          "lw             %[t2],          20(%[dest])                     \n\t"

-          "lw             %[t3],          24(%[dest])                     \n\t"

-          "lw             %[t4],          28(%[dest])                     \n\t"

-          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"

-          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"

-          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"

-          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"

-          "sw             %[vector_1],    16(%[dest])                     \n\t"

-          "sw             %[vector_2],    20(%[dest])                     \n\t"

-          "sw             %[vector_3],    24(%[dest])                     \n\t"

-          "sw             %[vector_4],    28(%[dest])                     \n\t"

-          "add            %[dest],        %[dest],        %[stride]       \n\t"

-          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),

-            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),

-            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),

-            [dest] "+&r" (dest)

-          : [stride] "r" (stride), [vector_a1] "r" (vector_a1)

-      );

-    }

-  }

-}

-#endif  // #if HAVE_DSPR2

--- a/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c

+++ b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c

@@ -16,354 +16,11 @@

 #include "vp9/common/vp9_common.h"

 #include "vp9/common/vp9_blockd.h"

 #include "vp9/common/vp9_idct.h"

-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"

+#include "vpx_dsp/mips/inv_txfm_dspr2.h"

 #include "vpx_dsp/txfm_common.h"

 #include "vpx_ports/mem.h"

 #if HAVE_DSPR2

-static void vp9_idct4_rows_dspr2(const int16_t *input, int16_t *output) {

-  int16_t   step_0, step_1, step_2, step_3;

-  int       Temp0, Temp1, Temp2, Temp3;

-  const int const_2_power_13 = 8192;

-  int       i;

-  for (i = 4; i--; ) {

-    __asm__ __volatile__ (

-        /*

-          temp_1 = (input[0] + input[2]) * cospi_16_64;

-          step_0 = dct_const_round_shift(temp_1);

-          temp_2 = (input[0] - input[2]) * cospi_16_64;

-          step_1 = dct_const_round_shift(temp_2);

-        */

-        "lh       %[Temp0],             0(%[input])                     \n\t"

-        "lh       %[Temp1],             4(%[input])                     \n\t"

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"

-        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"

-        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"

-        "lh       %[Temp0],             2(%[input])                     \n\t"

-        "lh       %[Temp1],             6(%[input])                     \n\t"

-        "extp     %[step_0],            $ac0,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"

-        "extp     %[step_1],            $ac1,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        /*

-          temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;

-          step_2 = dct_const_round_shift(temp1);

-        */

-        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"

-        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"

-        "extp     %[step_2],            $ac0,           31              \n\t"

-        /*

-          temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;

-          step_3 = dct_const_round_shift(temp2);

-        */

-        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"

-        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"

-        "extp     %[step_3],            $ac1,           31              \n\t"

-        /*

-          output[0]  = step_0 + step_3;

-          output[4]  = step_1 + step_2;

-          output[8]  = step_1 - step_2;

-          output[12] = step_0 - step_3;

-        */

-        "add      %[Temp0],             %[step_0],      %[step_3]       \n\t"

-        "sh       %[Temp0],             0(%[output])                    \n\t"

-        "add      %[Temp1],             %[step_1],      %[step_2]       \n\t"

-        "sh       %[Temp1],             8(%[output])                    \n\t"

-        "sub      %[Temp2],             %[step_1],      %[step_2]       \n\t"

-        "sh       %[Temp2],             16(%[output])                   \n\t"

-        "sub      %[Temp3],             %[step_0],      %[step_3]       \n\t"

-        "sh       %[Temp3],             24(%[output])                   \n\t"

-      : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),

-        [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),

-        [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),

-        [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),

-        [output] "+r" (output)

-      : [const_2_power_13] "r" (const_2_power_13),

-        [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),

-        [cospi_24_64] "r" (cospi_24_64),

-        [input] "r" (input)

-    );

-    input += 4;

-    output += 1;

-  }

-}

-static void vp9_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,

-                                               int dest_stride) {

-  int16_t   step_0, step_1, step_2, step_3;

-  int       Temp0, Temp1, Temp2, Temp3;

-  const int const_2_power_13 = 8192;

-  int       i;

-  uint8_t   *dest_pix;

-  uint8_t   *cm = vpx_ff_cropTbl;

-  /* prefetch vpx_ff_cropTbl */

-  prefetch_load(vpx_ff_cropTbl);

-  prefetch_load(vpx_ff_cropTbl +  32);

-  prefetch_load(vpx_ff_cropTbl +  64);

-  prefetch_load(vpx_ff_cropTbl +  96);

-  prefetch_load(vpx_ff_cropTbl + 128);

-  prefetch_load(vpx_ff_cropTbl + 160);

-  prefetch_load(vpx_ff_cropTbl + 192);

-  prefetch_load(vpx_ff_cropTbl + 224);

-  for (i = 0; i < 4; ++i) {

-      dest_pix = (dest + i);

-    __asm__ __volatile__ (

-        /*

-          temp_1 = (input[0] + input[2]) * cospi_16_64;

-          step_0 = dct_const_round_shift(temp_1);

-          temp_2 = (input[0] - input[2]) * cospi_16_64;

-          step_1 = dct_const_round_shift(temp_2);

-        */

-        "lh       %[Temp0],             0(%[input])                     \n\t"

-        "lh       %[Temp1],             4(%[input])                     \n\t"

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"

-        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"

-        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"

-        "lh       %[Temp0],             2(%[input])                     \n\t"

-        "lh       %[Temp1],             6(%[input])                     \n\t"

-        "extp     %[step_0],            $ac0,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"

-        "extp     %[step_1],            $ac1,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        /*

-          temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;

-          step_2 = dct_const_round_shift(temp1);

-        */

-        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"

-        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"

-        "extp     %[step_2],            $ac0,           31              \n\t"

-        /*

-          temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;

-          step_3 = dct_const_round_shift(temp2);

-        */

-        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"

-        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"

-        "extp     %[step_3],            $ac1,           31              \n\t"

-        /*

-          output[0]  = step_0 + step_3;

-          output[4]  = step_1 + step_2;

-          output[8]  = step_1 - step_2;

-          output[12] = step_0 - step_3;

-        */

-        "add      %[Temp0],             %[step_0],      %[step_3]       \n\t"

-        "addi     %[Temp0],             %[Temp0],       8               \n\t"

-        "sra      %[Temp0],             %[Temp0],       4               \n\t"

-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"

-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"

-        "add      %[Temp0],             %[step_1],      %[step_2]       \n\t"

-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"

-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"

-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

-        "addi     %[Temp0],             %[Temp0],       8               \n\t"

-        "sra      %[Temp0],             %[Temp0],       4               \n\t"

-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"

-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"

-        "sub      %[Temp0],             %[step_1],      %[step_2]       \n\t"

-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"

-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"

-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

-        "addi     %[Temp0],             %[Temp0],       8               \n\t"

-        "sra      %[Temp0],             %[Temp0],       4               \n\t"

-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"

-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"

-        "sub      %[Temp0],             %[step_0],      %[step_3]       \n\t"

-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"

-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"

-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

-        "addi     %[Temp0],             %[Temp0],       8               \n\t"

-        "sra      %[Temp0],             %[Temp0],       4               \n\t"

-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"

-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"

-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"

-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"

-      : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),

-        [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),

-        [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),

-        [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),

-        [dest_pix] "+r" (dest_pix)

-      : [const_2_power_13] "r" (const_2_power_13),

-        [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),

-        [cospi_24_64] "r" (cospi_24_64),

-        [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)

-    );

-    input += 4;

-  }

-}

-void vp9_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,

-                              int dest_stride) {

-  DECLARE_ALIGNED(32, int16_t, out[4 * 4]);

-  int16_t *outptr = out;

-  uint32_t pos = 45;

-  /* bit positon for extract from acc */

-  __asm__ __volatile__ (

-    "wrdsp      %[pos],     1           \n\t"

-    :

-    : [pos] "r" (pos)

-  );

-  // Rows

-  vp9_idct4_rows_dspr2(input, outptr);

-  // Columns

-  vp9_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);

-}

-void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,

-                             int dest_stride) {

-  int       a1, absa1;

-  int       r;

-  int32_t   out;

-  int       t2, vector_a1, vector_a;

-  uint32_t  pos = 45;

-  int16_t   input_dc = input[0];

-  /* bit positon for extract from acc */

-  __asm__ __volatile__ (

-    "wrdsp      %[pos],     1           \n\t"

-    :

-    : [pos] "r" (pos)

-  );

-  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc);

-  __asm__ __volatile__ (

-      "addi     %[out],     %[out],    8       \n\t"

-      "sra      %[a1],      %[out],    4       \n\t"

-      : [out] "+r" (out), [a1] "=r" (a1)

-      :

-  );

-  if (a1 < 0) {

-    /* use quad-byte

-     * input and output memory are four byte aligned */

-    __asm__ __volatile__ (

-        "abs        %[absa1],     %[a1]         \n\t"

-        "replv.qb   %[vector_a1], %[absa1]      \n\t"

-        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)

-        : [a1] "r" (a1)

-    );

-    for (r = 4; r--;) {

-      __asm__ __volatile__ (

-          "lw             %[t2],          0(%[dest])                      \n\t"

-          "subu_s.qb      %[vector_a],    %[t2],          %[vector_a1]    \n\t"

-          "sw             %[vector_a],    0(%[dest])                      \n\t"

-          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"

-          : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),

-            [dest] "+&r" (dest)

-          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)

-      );

-    }

-  } else {

-    /* use quad-byte

-     * input and output memory are four byte aligned */

-    __asm__ __volatile__ (

-        "replv.qb       %[vector_a1],   %[a1]     \n\t"

-        : [vector_a1] "=r" (vector_a1)

-        : [a1] "r" (a1)

-    );

-    for (r = 4; r--;) {

-      __asm__ __volatile__ (

-          "lw           %[t2],          0(%[dest])                        \n\t"

-          "addu_s.qb    %[vector_a],    %[t2],            %[vector_a1]    \n\t"

-          "sw           %[vector_a],    0(%[dest])                        \n\t"

-          "add          %[dest],        %[dest],          %[dest_stride]  \n\t"

-          : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),

-            [dest] "+&r" (dest)

-          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)

-      );

-    }

-  }

-}

-static void iadst4_dspr2(const int16_t *input, int16_t *output) {

-  int s0, s1, s2, s3, s4, s5, s6, s7;

-  int x0, x1, x2, x3;

-  x0 = input[0];

-  x1 = input[1];

-  x2 = input[2];

-  x3 = input[3];

-  if (!(x0 | x1 | x2 | x3)) {

-    output[0] = output[1] = output[2] = output[3] = 0;

-    return;

-  }

-  s0 = sinpi_1_9 * x0;

-  s1 = sinpi_2_9 * x0;

-  s2 = sinpi_3_9 * x1;

-  s3 = sinpi_4_9 * x2;

-  s4 = sinpi_1_9 * x2;

-  s5 = sinpi_2_9 * x3;

-  s6 = sinpi_4_9 * x3;

-  s7 = x0 - x2 + x3;

-  x0 = s0 + s3 + s5;

-  x1 = s1 - s4 - s6;

-  x2 = sinpi_3_9 * s7;

-  x3 = s2;

-  s0 = x0 + x3;

-  s1 = x1 + x3;

-  s2 = x2;

-  s3 = x0 + x1 - x3;

-  // 1-D transform scaling factor is sqrt(2).

-  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)

-  // + 1b (addition) = 29b.

-  // Hence the output bit depth is 15b.

-  output[0] = dct_const_round_shift(s0);

-  output[1] = dct_const_round_shift(s1);

-  output[2] = dct_const_round_shift(s2);

-  output[3] = dct_const_round_shift(s3);

-}

 void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,

                              int dest_stride, int tx_type) {

   int i, j;

--- a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c

+++ b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c

@@ -15,538 +15,11 @@

 #include "./vp9_rtcd.h"

 #include "vp9/common/vp9_common.h"

 #include "vp9/common/vp9_blockd.h"

-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"

+#include "vpx_dsp/mips/inv_txfm_dspr2.h"

 #include "vpx_dsp/txfm_common.h"

 #include "vpx_ports/mem.h"

 #if HAVE_DSPR2

-static void idct8_rows_dspr2(const int16_t *input, int16_t *output,

-                             uint32_t no_rows) {

-  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;

-  const int const_2_power_13 = 8192;

-  int Temp0, Temp1, Temp2, Temp3, Temp4;

-  int i;

-  for (i = no_rows; i--; ) {

-    __asm__ __volatile__ (

-        /*

-          temp_1 = (input[0] + input[4]) * cospi_16_64;

-          step2_0 = dct_const_round_shift(temp_1);

-          temp_2 = (input[0] - input[4]) * cospi_16_64;

-          step2_1 = dct_const_round_shift(temp_2);

-        */

-        "lh       %[Temp0],             0(%[input])                     \n\t"

-        "lh       %[Temp1],             8(%[input])                     \n\t"

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"

-        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"

-        "extp     %[Temp4],             $ac0,           31              \n\t"

-        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"

-        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "extp     %[Temp2],             $ac1,           31              \n\t"

-        /*

-          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;

-          step2_2 = dct_const_round_shift(temp_1);

-        */

-        "lh       %[Temp0],             4(%[input])                     \n\t"

-        "lh       %[Temp1],             12(%[input])                    \n\t"

-        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"

-        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "extp     %[Temp3],             $ac0,           31              \n\t"

-        /*

-          step1_1 = step2_1 + step2_2;

-          step1_2 = step2_1 - step2_2;

-        */

-        "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"

-        "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"

-        /*

-          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;

-          step2_3 = dct_const_round_shift(temp_2);

-        */

-        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"

-        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"

-        "extp     %[Temp1],             $ac1,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        /*

-          step1_0 = step2_0 + step2_3;

-          step1_3 = step2_0 - step2_3;

-        */

-        "add      %[step1_0],           %[Temp4],       %[Temp1]        \n\t"

-        "sub      %[step1_3],           %[Temp4],       %[Temp1]        \n\t"

-        /*

-          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;

-          step1_4 = dct_const_round_shift(temp_1);

-        */

-        "lh       %[Temp0],             2(%[input])                     \n\t"

-        "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "lh       %[Temp1],             14(%[input])                    \n\t"

-        "lh       %[Temp0],             2(%[input])                     \n\t"

-        "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"

-        "extp     %[step1_4],           $ac0,           31              \n\t"

-        /*

-          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;

-          step1_7 = dct_const_round_shift(temp_2);

-        */

-        "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"

-        "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"

-        "extp     %[step1_7],           $ac1,           31              \n\t"

-        /*

-          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;

-          step1_5 = dct_const_round_shift(temp_1);

-        */

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "lh       %[Temp0],             10(%[input])                    \n\t"

-        "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"

-        "lh       %[Temp1],             6(%[input])                     \n\t"

-        "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"

-        "extp     %[step1_5],           $ac0,           31              \n\t"

-        /*

-          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;

-          step1_6 = dct_const_round_shift(temp_2);

-        */

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "lh       %[Temp0],             10(%[input])                    \n\t"

-        "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"

-        "lh       %[Temp1],             6(%[input])                     \n\t"

-        "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"

-        "extp     %[step1_6],           $ac1,           31              \n\t"

-        /*

-          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;

-          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;

-        */

-        "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"

-        "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"

-        "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"

-        "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"

-        "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"

-        "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"

-        "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"

-        /*

-          step1_4 = step1_4 + step1_5;

-          step1_7 = step1_6 + step1_7;

-        */

-        "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"

-        "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"

-        "extp     %[step1_5],           $ac0,           31              \n\t"

-        "extp     %[step1_6],           $ac1,           31              \n\t"

-        "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"

-        "sh       %[Temp0],             0(%[output])                    \n\t"

-        "add      %[Temp1],             %[step1_1],     %[step1_6]      \n\t"

-        "sh       %[Temp1],             16(%[output])                   \n\t"

-        "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"

-        "sh       %[Temp0],             32(%[output])                   \n\t"

-        "add      %[Temp1],             %[step1_3],     %[step1_4]      \n\t"

-        "sh       %[Temp1],             48(%[output])                   \n\t"

-        "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"

-        "sh       %[Temp0],             64(%[output])                   \n\t"

-        "sub      %[Temp1],             %[step1_2],     %[step1_5]      \n\t"

-        "sh       %[Temp1],             80(%[output])                   \n\t"

-        "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"

-        "sh       %[Temp0],             96(%[output])                   \n\t"

-        "sub      %[Temp1],             %[step1_0],     %[step1_7]      \n\t"

-        "sh       %[Temp1],             112(%[output])                  \n\t"

-        : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),

-          [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),

-          [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),

-          [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),

-          [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),

-          [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),

-          [Temp4] "=&r" (Temp4)

-        : [const_2_power_13] "r" (const_2_power_13),

-          [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),

-          [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),

-          [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),

-          [cospi_24_64] "r" (cospi_24_64),

-          [output] "r" (output), [input] "r" (input)

-    );

-    input += 8;

-    output += 1;

-  }

-}

-static void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,

-                                        int dest_stride) {

-  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;

-  int Temp0, Temp1, Temp2, Temp3;

-  int i;

-  const int const_2_power_13 = 8192;

-  uint8_t *dest_pix;

-  uint8_t *cm = vpx_ff_cropTbl;

-  /* prefetch vpx_ff_cropTbl */

-  prefetch_load(vpx_ff_cropTbl);

-  prefetch_load(vpx_ff_cropTbl +  32);

-  prefetch_load(vpx_ff_cropTbl +  64);

-  prefetch_load(vpx_ff_cropTbl +  96);

-  prefetch_load(vpx_ff_cropTbl + 128);

-  prefetch_load(vpx_ff_cropTbl + 160);

-  prefetch_load(vpx_ff_cropTbl + 192);

-  prefetch_load(vpx_ff_cropTbl + 224);

-  for (i = 0; i < 8; ++i) {

-      dest_pix = (dest + i);

-    __asm__ __volatile__ (

-        /*

-          temp_1 = (input[0] + input[4]) * cospi_16_64;

-          step2_0 = dct_const_round_shift(temp_1);

-          temp_2 = (input[0] - input[4]) * cospi_16_64;

-          step2_1 = dct_const_round_shift(temp_2);

-        */

-        "lh       %[Temp0],             0(%[input])                     \n\t"

-        "lh       %[Temp1],             8(%[input])                     \n\t"

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"

-        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"

-        "extp     %[step1_6],           $ac0,           31              \n\t"

-        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"

-        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "extp     %[Temp2],             $ac1,           31              \n\t"

-        /*

-          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;

-          step2_2 = dct_const_round_shift(temp_1);

-        */

-        "lh       %[Temp0],             4(%[input])                     \n\t"

-        "lh       %[Temp1],             12(%[input])                    \n\t"

-        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"

-        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "extp     %[Temp3],             $ac0,           31              \n\t"

-        /*

-          step1_1 = step2_1 + step2_2;

-          step1_2 = step2_1 - step2_2;

-        */

-        "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"

-        "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"

-        /*

-          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;

-          step2_3 = dct_const_round_shift(temp_2);

-        */

-        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"

-        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"

-        "extp     %[Temp1],             $ac1,           31              \n\t"

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        /*

-          step1_0 = step2_0 + step2_3;

-          step1_3 = step2_0 - step2_3;

-        */

-        "add      %[step1_0],           %[step1_6],     %[Temp1]        \n\t"

-        "sub      %[step1_3],           %[step1_6],     %[Temp1]        \n\t"

-        /*

-          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;

-          step1_4 = dct_const_round_shift(temp_1);

-        */

-        "lh       %[Temp0],             2(%[input])                     \n\t"

-        "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "lh       %[Temp1],             14(%[input])                    \n\t"

-        "lh       %[Temp0],             2(%[input])                     \n\t"

-        "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"

-        "extp     %[step1_4],           $ac0,           31              \n\t"

-        /*

-          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;

-          step1_7 = dct_const_round_shift(temp_2);

-        */

-        "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"

-        "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"

-        "extp     %[step1_7],           $ac1,           31              \n\t"

-        /*

-          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;

-          step1_5 = dct_const_round_shift(temp_1);

-        */

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "lh       %[Temp0],             10(%[input])                    \n\t"

-        "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"

-        "lh       %[Temp1],             6(%[input])                     \n\t"

-        "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"

-        "extp     %[step1_5],           $ac0,           31              \n\t"

-        /*

-          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;

-          step1_6 = dct_const_round_shift(temp_2);

-        */

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "lh       %[Temp0],             10(%[input])                    \n\t"

-        "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"

-        "lh       %[Temp1],             6(%[input])                     \n\t"

-        "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"

-        "extp     %[step1_6],           $ac1,           31              \n\t"

-        /*

-          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;

-          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;

-        */

-        "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"

-        "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"

-        "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"

-        "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"

-        "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"

-        "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"

-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

-        "mthi     $zero,                $ac0                            \n\t"

-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

-        "mthi     $zero,                $ac1                            \n\t"

-        "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"

-        "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"

-        /*

-          step1_4 = step1_4 + step1_5;

-          step1_7 = step1_6 + step1_7;

-        */

-        "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"

-        "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"

-        "extp     %[step1_5],           $ac0,           31              \n\t"

-        "extp     %[step1_6],           $ac1,           31              \n\t"

-        /* add block */

-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"

-        "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"

-        "addi     %[Temp0],             %[Temp0],       16              \n\t"

-        "sra      %[Temp0],             %[Temp0],       5               \n\t"

-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"

-        "add      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"

-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"

-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"

-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"

-        "addi     %[Temp0],             %[Temp0],       16              \n\t"

-        "sra      %[Temp0],             %[Temp0],       5               \n\t"

-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"

-        "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"

-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"

-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"

-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"

-        "addi     %[Temp0],             %[Temp0],       16              \n\t"

-        "sra      %[Temp0],             %[Temp0],       5               \n\t"

-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"

-        "add      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"

-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"

-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"

-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"

-        "addi     %[Temp0],             %[Temp0],       16              \n\t"

-        "sra      %[Temp0],             %[Temp0],       5               \n\t"

-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"

-        "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"

-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"

-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"

-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"

-        "addi     %[Temp0],             %[Temp0],       16              \n\t"

-        "sra      %[Temp0],             %[Temp0],       5               \n\t"

-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"

-        "sub      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"

-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"

-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"

-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"

-        "addi     %[Temp0],             %[Temp0],       16              \n\t"

-        "sra      %[Temp0],             %[Temp0],       5               \n\t"

-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"

-        "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"

-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"

-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"

-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"

-        "addi     %[Temp0],             %[Temp0],       16              \n\t"

-        "sra      %[Temp0],             %[Temp0],       5               \n\t"

-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"

-        "sub      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"

-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"

-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"

-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"

-        "addi     %[Temp0],             %[Temp0],       16              \n\t"

-        "sra      %[Temp0],             %[Temp0],       5               \n\t"

-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"

-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"

-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"

-        : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),

-          [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),

-          [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),

-          [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),

-          [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),

-          [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),

-          [dest_pix] "+r" (dest_pix)

-        : [const_2_power_13] "r" (const_2_power_13),

-          [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),

-          [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),

-          [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),

-          [cospi_24_64] "r" (cospi_24_64),

-          [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)

-    );

-    input += 8;

-  }

-}

-void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,

-                              int dest_stride) {

-  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);

-  int16_t *outptr = out;

-  uint32_t pos = 45;

-  /* bit positon for extract from acc */

-  __asm__ __volatile__ (

-    "wrdsp    %[pos],    1    \n\t"

-    :

-    : [pos] "r" (pos)

-  );

-  // First transform rows

-  idct8_rows_dspr2(input, outptr, 8);

-  // Then transform columns and add to dest

-  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);

-}

-static void iadst8_dspr2(const int16_t *input, int16_t *output) {

-  int s0, s1, s2, s3, s4, s5, s6, s7;

-  int x0, x1, x2, x3, x4, x5, x6, x7;

-  x0 = input[7];

-  x1 = input[0];

-  x2 = input[5];

-  x3 = input[2];

-  x4 = input[3];

-  x5 = input[4];

-  x6 = input[1];

-  x7 = input[6];

-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {

-    output[0] = output[1] = output[2] = output[3] = output[4]

-              = output[5] = output[6] = output[7] = 0;

-    return;

-  }

-  // stage 1

-  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;

-  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;

-  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;

-  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;

-  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;

-  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;

-  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;

-  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;

-  x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS);

-  x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS);

-  x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS);

-  x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS);

-  x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS);

-  x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS);

-  x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS);

-  x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS);

-  // stage 2

-  s0 = x0;

-  s1 = x1;

-  s2 = x2;

-  s3 = x3;

-  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;

-  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;

-  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;

-  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;

-  x0 = s0 + s2;

-  x1 = s1 + s3;

-  x2 = s0 - s2;

-  x3 = s1 - s3;

-  x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS);

-  x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS);

-  x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS);

-  x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS);

-  // stage 3

-  s2 = cospi_16_64 * (x2 + x3);

-  s3 = cospi_16_64 * (x2 - x3);

-  s6 = cospi_16_64 * (x6 + x7);

-  s7 = cospi_16_64 * (x6 - x7);

-  x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS);

-  x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS);

-  x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS);

-  x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS);

-  output[0] =  x0;

-  output[1] = -x4;

-  output[2] =  x6;

-  output[3] = -x2;

-  output[4] =  x3;

-  output[5] = -x7;

-  output[6] =  x5;

-  output[7] = -x1;

-}

 void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,

                              int dest_stride, int tx_type) {

   int i, j;

@@ -615,132 +88,6 @@

     default:

       printf("vp9_short_iht8x8_add_dspr2 : Invalid tx_type\n");

       break;

-  }

-}

-void vp9_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,

-                              int dest_stride) {

-  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);

-  int16_t *outptr = out;

-  uint32_t pos = 45;

-  /* bit positon for extract from acc */

-  __asm__ __volatile__ (

-    "wrdsp    %[pos],    1    \n\t"

-    :

-    : [pos] "r" (pos)

-  );

-  // First transform rows

-  idct8_rows_dspr2(input, outptr, 4);

-  outptr += 4;

-  __asm__ __volatile__ (

-      "sw  $zero,   0(%[outptr])  \n\t"

-      "sw  $zero,   4(%[outptr])  \n\t"

-      "sw  $zero,  16(%[outptr])  \n\t"

-      "sw  $zero,  20(%[outptr])  \n\t"

-      "sw  $zero,  32(%[outptr])  \n\t"

-      "sw  $zero,  36(%[outptr])  \n\t"

-      "sw  $zero,  48(%[outptr])  \n\t"

-      "sw  $zero,  52(%[outptr])  \n\t"

-      "sw  $zero,  64(%[outptr])  \n\t"

-      "sw  $zero,  68(%[outptr])  \n\t"

-      "sw  $zero,  80(%[outptr])  \n\t"

-      "sw  $zero,  84(%[outptr])  \n\t"

-      "sw  $zero,  96(%[outptr])  \n\t"

-      "sw  $zero, 100(%[outptr])  \n\t"

-      "sw  $zero, 112(%[outptr])  \n\t"

-      "sw  $zero, 116(%[outptr])  \n\t"

-      :

-      : [outptr] "r" (outptr)

-  );

-  // Then transform columns and add to dest

-  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);

-}

-void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,

-                             int dest_stride) {

-  uint32_t pos = 45;

-  int32_t out;

-  int32_t r;

-  int32_t a1, absa1;

-  int32_t t1, t2, vector_a1, vector_1, vector_2;

-  /* bit positon for extract from acc */

-  __asm__ __volatile__ (

-    "wrdsp      %[pos],     1           \n\t"

-    :

-    : [pos] "r" (pos)

-  );

-  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);

-  __asm__ __volatile__ (

-      "addi     %[out],     %[out],     16      \n\t"

-      "sra      %[a1],      %[out],     5       \n\t"

-      : [out] "+r" (out), [a1] "=r" (a1)

-      :

-  );

-  if (a1 < 0) {

-    /* use quad-byte

-     * input and output memory are four byte aligned */

-    __asm__ __volatile__ (

-        "abs        %[absa1],       %[a1]       \n\t"

-        "replv.qb   %[vector_a1],   %[absa1]    \n\t"

-        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)

-        : [a1] "r" (a1)

-    );

-    for (r = 8; r--;) {

-      __asm__ __volatile__ (

-          "lw           %[t1],          0(%[dest])                      \n\t"

-          "lw           %[t2],          4(%[dest])                      \n\t"

-          "subu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"

-          "subu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"

-          "sw           %[vector_1],    0(%[dest])                      \n\t"

-          "sw           %[vector_2],    4(%[dest])                      \n\t"

-          "add          %[dest],        %[dest],        %[dest_stride]  \n\t"

-          : [t1] "=&r" (t1), [t2] "=&r" (t2),

-            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),

-            [dest] "+&r" (dest)

-          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)

-      );

-    }

-  } else {

-    /* use quad-byte

-     * input and output memory are four byte aligned */

-    __asm__ __volatile__ (

-        "replv.qb   %[vector_a1],   %[a1]   \n\t"

-        : [vector_a1] "=r" (vector_a1)

-        : [a1] "r" (a1)

-    );

-    for (r = 8; r--;) {

-      __asm__ __volatile__ (

-          "lw           %[t1],          0(%[dest])                      \n\t"

-          "lw           %[t2],          4(%[dest])                      \n\t"

-          "addu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"

-          "addu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"

-          "sw           %[vector_1],    0(%[dest])                      \n\t"

-          "sw           %[vector_2],    4(%[dest])                      \n\t"

-          "add          %[dest],        %[dest],        %[dest_stride]  \n\t"

-          : [t1] "=&r" (t1), [t2] "=&r" (t2),

-            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),

-            [dest] "+r" (dest)

-          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)

-      );

-    }

 #endif  // #if HAVE_DSPR2

--- a/vp9/vp9_common.mk

+++ b/vp9/vp9_common.mk

@@ -71,15 +71,10 @@

 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm

 endif

-# common (c)

-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_common_dspr2.h

 ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)

 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans4_dspr2.c

 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans8_dspr2.c

 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans16_dspr2.c

-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans32_cols_dspr2.c

-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans32_dspr2.c

 endif

 # common (msa)

--- /dev/null

+++ b/vpx_dsp/mips/inv_txfm_dspr2.h

@@ -1,0 +1,73 @@

+/*

+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#ifndef VPX_DSP_MIPS_INV_TXFM_DSPR2_H_

+#define VPX_DSP_MIPS_INV_TXFM_DSPR2_H_

+#include <assert.h>

+#include "./vpx_config.h"

+#include "vpx/vpx_integer.h"

+#include "vpx_dsp/inv_txfm.h"

+#include "vpx_dsp/mips/common_dspr2.h"

+#ifdef __cplusplus

+extern "C" {

+#endif

+#if HAVE_DSPR2

+#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input)                    ({   \

+  /* Rounded fixed-point multiply of input by cospi_16_64, applied twice. */   \

+  int32_t tmp, out;                                                            \

+  int     dct_cost_rounding = DCT_CONST_ROUNDING;                              \

+  int     in = (input);  /* evaluates the macro argument exactly once */       \

+  /* extp extracts at the DSPControl pos field; set pos before using this. */  \

+  __asm__ __volatile__ (                                                       \

+      /* tmp = dct_const_round_shift(in * cospi_16_64); */                     \

+      "mtlo     %[dct_cost_rounding],   $ac1                              \n\t"\

+      "mthi     $zero,                  $ac1                              \n\t"\

+      "madd     $ac1,                   %[in],            %[cospi_16_64]  \n\t"\

+      "extp     %[tmp],                 $ac1,             31              \n\t"\

+                                                                               \

+      /* out = dct_const_round_shift(tmp * cospi_16_64); */                    \

+      "mtlo     %[dct_cost_rounding],   $ac2                              \n\t"\

+      "mthi     $zero,                  $ac2                              \n\t"\

+      "madd     $ac2,                   %[tmp],           %[cospi_16_64]  \n\t"\

+      "extp     %[out],                 $ac2,             31              \n\t"\

+                                                                               \

+      : [tmp] "=&r" (tmp), [out] "=r" (out)                                    \

+      : [in] "r" (in), [dct_cost_rounding] "r" (dct_cost_rounding),            \

+        [cospi_16_64] "r" (cospi_16_64)                                        \

+      : "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo"  /* accumulators used above */  \

+   );                                                                          \

+  out;                                                                    })

+void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,

+                                   int dest_stride);

+void vp9_idct4_rows_dspr2(const int16_t *input, int16_t *output);

+void vp9_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,

+                                     int dest_stride);

+void iadst4_dspr2(const int16_t *input, int16_t *output);

+void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows);

+void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,

+                                 int dest_stride);

+void iadst8_dspr2(const int16_t *input, int16_t *output);

+void idct16_rows_dspr2(const int16_t *input, int16_t *output,

+                       uint32_t no_rows);

+void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,

+                               int dest_stride);

+void iadst16_dspr2(const int16_t *input, int16_t *output);

+#endif  // #if HAVE_DSPR2

+#ifdef __cplusplus

+}  // extern "C"

+#endif

+#endif  // VPX_DSP_MIPS_INV_TXFM_DSPR2_H_

--- /dev/null

+++ b/vpx_dsp/mips/itrans16_dspr2.c

@@ -1,0 +1,1227 @@

+/*

+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "./vpx_config.h"

+#include "./vpx_dsp_rtcd.h"

+#include "vpx_dsp/mips/inv_txfm_dspr2.h"

+#include "vpx_dsp/txfm_common.h"

+#if HAVE_DSPR2

+void idct16_rows_dspr2(const int16_t *input, int16_t *output,

+                       uint32_t no_rows) {

+  int i;

+  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;

+  int step1_10, step1_11, step1_12, step1_13;

+  int step2_0, step2_1, step2_2, step2_3;

+  int step2_8, step2_9, step2_10, step2_11;

+  int step2_12, step2_13, step2_14, step2_15;

+  int load1, load2, load3, load4, load5, load6, load7, load8;

+  int result1, result2, result3, result4;

+  const int const_2_power_13 = 8192;

+  for (i = no_rows; i--; ) {

+    /* prefetch row */

+    prefetch_load((const uint8_t *)(input + 16));

+    __asm__ __volatile__ (

+        "lh       %[load1],              0(%[input])                    \n\t"

+        "lh       %[load2],             16(%[input])                    \n\t"

+        "lh       %[load3],              8(%[input])                    \n\t"

+        "lh       %[load4],             24(%[input])                    \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

+        "mthi     $zero,                $ac2                            \n\t"

+        "add      %[result1],           %[load1],       %[load2]        \n\t"

+        "sub      %[result2],           %[load1],       %[load2]        \n\t"

+        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"

+        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"

+        "extp     %[step2_0],           $ac1,           31              \n\t"

+        "extp     %[step2_1],           $ac2,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"

+        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"

+        "extp     %[step2_2],           $ac3,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"

+        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"

+        "extp     %[step2_3],           $ac1,           31              \n\t"

+        "add      %[step1_0],           %[step2_0],     %[step2_3]      \n\t"

+        "add      %[step1_1],           %[step2_1],     %[step2_2]      \n\t"

+        "sub      %[step1_2],           %[step2_1],     %[step2_2]      \n\t"

+        "sub      %[step1_3],           %[step2_0],     %[step2_3]      \n\t"

+        : [load1] "=&r" (load1), [load2] "=&r" (load2),

+          [load3] "=&r" (load3), [load4] "=&r" (load4),

+          [result1] "=&r" (result1), [result2] "=&r" (result2),

+          [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),

+          [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),

+          [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),

+          [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)

+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),

+          [cospi_16_64] "r" (cospi_16_64)

+    );

+    __asm__ __volatile__ (

+        "lh       %[load5],             2(%[input])                     \n\t"

+        "lh       %[load6],             30(%[input])                    \n\t"

+        "lh       %[load7],             18(%[input])                    \n\t"

+        "lh       %[load8],             14(%[input])                    \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "madd     $ac1,                 %[load5],       %[cospi_30_64]  \n\t"

+        "msub     $ac1,                 %[load6],       %[cospi_2_64]   \n\t"

+        "extp     %[result1],           $ac1,           31              \n\t"

+        "madd     $ac3,                 %[load7],       %[cospi_14_64]  \n\t"

+        "msub     $ac3,                 %[load8],       %[cospi_18_64]  \n\t"

+        "extp     %[result2],           $ac3,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

+        "mthi     $zero,                $ac2                            \n\t"

+        "madd     $ac1,                 %[load7],       %[cospi_18_64]  \n\t"

+        "madd     $ac1,                 %[load8],       %[cospi_14_64]  \n\t"

+        "extp     %[result3],           $ac1,           31              \n\t"

+        "madd     $ac2,                 %[load5],       %[cospi_2_64]   \n\t"

+        "madd     $ac2,                 %[load6],       %[cospi_30_64]  \n\t"

+        "extp     %[result4],           $ac2,           31              \n\t"

+        "sub      %[load5],             %[result1],     %[result2]      \n\t"

+        "sub      %[load6],             %[result4],     %[result3]      \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "madd     $ac1,                 %[load6],       %[cospi_24_64]  \n\t"

+        "msub     $ac1,                 %[load5],       %[cospi_8_64]   \n\t"

+        "madd     $ac3,                 %[load5],       %[cospi_24_64]  \n\t"

+        "madd     $ac3,                 %[load6],       %[cospi_8_64]   \n\t"

+        "extp     %[step2_9],           $ac1,           31              \n\t"

+        "extp     %[step2_14],          $ac3,           31              \n\t"

+        "add      %[step2_8],           %[result1],     %[result2]      \n\t"

+        "add      %[step2_15],          %[result4],     %[result3]      \n\t"

+        : [load5] "=&r" (load5), [load6] "=&r" (load6),

+          [load7] "=&r" (load7), [load8] "=&r" (load8),

+          [result1] "=&r" (result1), [result2] "=&r" (result2),

+          [result3] "=&r" (result3), [result4] "=&r" (result4),

+          [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),

+          [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)

+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

+          [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),

+          [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),

+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

+    );

+    __asm__ __volatile__ (

+        "lh       %[load1],             10(%[input])                    \n\t"

+        "lh       %[load2],             22(%[input])                    \n\t"

+        "lh       %[load3],             26(%[input])                    \n\t"

+        "lh       %[load4],             6(%[input])                     \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "madd     $ac1,                 %[load1],       %[cospi_22_64]  \n\t"

+        "msub     $ac1,                 %[load2],       %[cospi_10_64]  \n\t"

+        "extp     %[result1],           $ac1,           31              \n\t"

+        "madd     $ac3,                 %[load3],       %[cospi_6_64]   \n\t"

+        "msub     $ac3,                 %[load4],       %[cospi_26_64]  \n\t"

+        "extp     %[result2],           $ac3,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

+        "mthi     $zero,                $ac2                            \n\t"

+        "madd     $ac1,                 %[load1],       %[cospi_10_64]  \n\t"

+        "madd     $ac1,                 %[load2],       %[cospi_22_64]  \n\t"

+        "extp     %[result3],           $ac1,           31              \n\t"

+        "madd     $ac2,                 %[load3],       %[cospi_26_64]  \n\t"

+        "madd     $ac2,                 %[load4],       %[cospi_6_64]   \n\t"

+        "extp     %[result4],           $ac2,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "sub      %[load1],             %[result2],     %[result1]      \n\t"

+        "sub      %[load2],             %[result4],     %[result3]      \n\t"

+        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"

+        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"

+        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"

+        "extp     %[step2_10],          $ac1,           31              \n\t"

+        "extp     %[step2_13],          $ac3,           31              \n\t"

+        "add      %[step2_11],          %[result1],     %[result2]      \n\t"

+        "add      %[step2_12],          %[result4],     %[result3]      \n\t"

+        : [load1] "=&r" (load1), [load2] "=&r" (load2),

+          [load3] "=&r" (load3), [load4] "=&r" (load4),

+          [result1] "=&r" (result1), [result2] "=&r" (result2),

+          [result3] "=&r" (result3), [result4] "=&r" (result4),

+          [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),

+          [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)

+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

+          [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),

+          [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),

+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

+    );

+    __asm__ __volatile__ (

+        "lh       %[load5],             4(%[input])                     \n\t"

+        "lh       %[load6],             28(%[input])                    \n\t"

+        "lh       %[load7],             20(%[input])                    \n\t"

+        "lh       %[load8],             12(%[input])                    \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "madd     $ac1,                 %[load5],       %[cospi_28_64]  \n\t"

+        "msub     $ac1,                 %[load6],       %[cospi_4_64]   \n\t"

+        "extp     %[result1],           $ac1,           31              \n\t"

+        "madd     $ac3,                 %[load7],       %[cospi_12_64]  \n\t"

+        "msub     $ac3,                 %[load8],       %[cospi_20_64]  \n\t"

+        "extp     %[result2],           $ac3,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

+        "mthi     $zero,                $ac2                            \n\t"

+        "madd     $ac1,                 %[load7],       %[cospi_20_64]  \n\t"

+        "madd     $ac1,                 %[load8],       %[cospi_12_64]  \n\t"

+        "extp     %[result3],           $ac1,           31              \n\t"

+        "madd     $ac2,                 %[load5],       %[cospi_4_64]   \n\t"

+        "madd     $ac2,                 %[load6],       %[cospi_28_64]  \n\t"

+        "extp     %[result4],           $ac2,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "sub      %[load5],             %[result4],     %[result3]      \n\t"

+        "sub      %[load5],             %[load5],       %[result1]      \n\t"

+        "add      %[load5],             %[load5],       %[result2]      \n\t"

+        "sub      %[load6],             %[result1],     %[result2]      \n\t"

+        "sub      %[load6],             %[load6],       %[result3]      \n\t"

+        "add      %[load6],             %[load6],       %[result4]      \n\t"

+        "madd     $ac1,                 %[load5],       %[cospi_16_64]  \n\t"

+        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"

+        "extp     %[step1_5],           $ac1,           31              \n\t"

+        "extp     %[step1_6],           $ac3,           31              \n\t"

+        "add      %[step1_4],           %[result1],     %[result2]      \n\t"

+        "add      %[step1_7],           %[result4],     %[result3]      \n\t"

+        : [load5] "=&r" (load5), [load6] "=&r" (load6),

+          [load7] "=&r" (load7), [load8] "=&r" (load8),

+          [result1] "=&r" (result1), [result2] "=&r" (result2),

+          [result3] "=&r" (result3), [result4] "=&r" (result4),

+          [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),

+          [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)

+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

+          [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),

+          [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),

+          [cospi_16_64] "r" (cospi_16_64)

+    );

+    __asm__ __volatile__ (

+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

+        "mthi     $zero,                $ac0                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "sub      %[load5],             %[step2_14],    %[step2_13]     \n\t"

+        "sub      %[load5],             %[load5],       %[step2_9]      \n\t"

+        "add      %[load5],             %[load5],       %[step2_10]     \n\t"

+        "madd     $ac0,                 %[load5],       %[cospi_16_64]  \n\t"

+        "sub      %[load6],             %[step2_14],    %[step2_13]     \n\t"

+        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"

+        "add      %[load6],             %[load6],       %[step2_9]      \n\t"

+        "madd     $ac1,                 %[load6],       %[cospi_16_64]  \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

+        "mthi     $zero,                $ac2                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "sub      %[load5],             %[step2_15],    %[step2_12]     \n\t"

+        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"

+        "add      %[load5],             %[load5],       %[step2_11]     \n\t"

+        "madd     $ac2,                 %[load5],       %[cospi_16_64]  \n\t"

+        "sub      %[load6],             %[step2_15],    %[step2_12]     \n\t"

+        "sub      %[load6],             %[load6],       %[step2_11]     \n\t"

+        "add      %[load6],             %[load6],       %[step2_8]      \n\t"

+        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"

+        "extp     %[step1_10],          $ac0,           31              \n\t"

+        "extp     %[step1_13],          $ac1,           31              \n\t"

+        "extp     %[step1_11],          $ac2,           31              \n\t"

+        "extp     %[step1_12],          $ac3,           31              \n\t"

+        : [load5] "=&r" (load5), [load6] "=&r" (load6),

+          [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),

+          [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)

+        : [const_2_power_13] "r" (const_2_power_13),

+          [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),

+          [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),

+          [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),

+          [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),

+          [cospi_16_64] "r" (cospi_16_64)

+    );

+    __asm__ __volatile__ (

+        "add      %[load5],             %[step1_0],     %[step1_7]      \n\t"

+        "add      %[load5],             %[load5],       %[step2_12]     \n\t"

+        "add      %[load5],             %[load5],       %[step2_15]     \n\t"

+        "add      %[load6],             %[step1_1],     %[step1_6]      \n\t"

+        "add      %[load6],             %[load6],       %[step2_13]     \n\t"

+        "add      %[load6],             %[load6],       %[step2_14]     \n\t"

+        "sh       %[load5],             0(%[output])                    \n\t"

+        "sh       %[load6],             32(%[output])                   \n\t"

+        "sub      %[load5],             %[step1_1],     %[step1_6]      \n\t"

+        "add      %[load5],             %[load5],       %[step2_9]      \n\t"

+        "add      %[load5],             %[load5],       %[step2_10]     \n\t"

+        "sub      %[load6],             %[step1_0],     %[step1_7]      \n\t"

+        "add      %[load6],             %[load6],       %[step2_8]      \n\t"

+        "add      %[load6],             %[load6],       %[step2_11]     \n\t"

+        "sh       %[load5],             192(%[output])                  \n\t"

+        "sh       %[load6],             224(%[output])                  \n\t"

+        "sub      %[load5],             %[step1_0],     %[step1_7]      \n\t"

+        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"

+        "sub      %[load5],             %[load5],       %[step2_11]     \n\t"

+        "sub      %[load6],             %[step1_1],     %[step1_6]      \n\t"

+        "sub      %[load6],             %[load6],       %[step2_9]      \n\t"

+        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"

+        "sh       %[load5],             256(%[output])                  \n\t"

+        "sh       %[load6],             288(%[output])                  \n\t"

+        "add      %[load5],             %[step1_1],     %[step1_6]      \n\t"

+        "sub      %[load5],             %[load5],       %[step2_13]     \n\t"

+        "sub      %[load5],             %[load5],       %[step2_14]     \n\t"

+        "add      %[load6],             %[step1_0],     %[step1_7]      \n\t"

+        "sub      %[load6],             %[load6],       %[step2_12]     \n\t"

+        "sub      %[load6],             %[load6],       %[step2_15]     \n\t"

+        "sh       %[load5],             448(%[output])                  \n\t"

+        "sh       %[load6],             480(%[output])                  \n\t"

+        : [load5] "=&r" (load5), [load6] "=&r" (load6)

+        : [output] "r" (output),

+          [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),

+          [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),

+          [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),

+          [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),

+          [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),

+          [step2_14] "r" (step2_14), [step2_15] "r" (step2_15)

+    );

+    __asm__ __volatile__ (

+        "add      %[load5],             %[step1_2],     %[step1_5]      \n\t"

+        "add      %[load5],             %[load5],       %[step1_13]     \n\t"

+        "add      %[load6],             %[step1_3],     %[step1_4]      \n\t"

+        "add      %[load6],             %[load6],       %[step1_12]     \n\t"

+        "sh       %[load5],             64(%[output])                   \n\t"

+        "sh       %[load6],             96(%[output])                   \n\t"

+        "sub      %[load5],             %[step1_3],     %[step1_4]      \n\t"

+        "add      %[load5],             %[load5],       %[step1_11]     \n\t"

+        "sub      %[load6],             %[step1_2],     %[step1_5]      \n\t"

+        "add      %[load6],             %[load6],       %[step1_10]     \n\t"

+        "sh       %[load5],             128(%[output])                  \n\t"

+        "sh       %[load6],             160(%[output])                  \n\t"

+        "sub      %[load5],             %[step1_2],     %[step1_5]      \n\t"

+        "sub      %[load5],             %[load5],       %[step1_10]     \n\t"

+        "sub      %[load6],             %[step1_3],     %[step1_4]      \n\t"

+        "sub      %[load6],             %[load6],       %[step1_11]     \n\t"

+        "sh       %[load5],             320(%[output])                  \n\t"

+        "sh       %[load6],             352(%[output])                  \n\t"

+        "add      %[load5],             %[step1_3],     %[step1_4]      \n\t"

+        "sub      %[load5],             %[load5],       %[step1_12]     \n\t"

+        "add      %[load6],             %[step1_2],     %[step1_5]      \n\t"

+        "sub      %[load6],             %[load6],       %[step1_13]     \n\t"

+        "sh       %[load5],             384(%[output])                  \n\t"

+        "sh       %[load6],             416(%[output])                  \n\t"

+        : [load5] "=&r" (load5), [load6] "=&r" (load6)

+        : [output] "r" (output),

+          [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),

+          [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),

+          [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),

+          [step1_12] "r" (step1_12), [step1_13] "r" (step1_13)

+    );

+    input += 16;

+    output += 1;

+  }

+}

+void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,

+                               int dest_stride) {

+  int i;

+  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;

+  int step1_8, step1_9, step1_10, step1_11;

+  int step1_12, step1_13, step1_14, step1_15;

+  int step2_0, step2_1, step2_2, step2_3;

+  int step2_8, step2_9, step2_10, step2_11;

+  int step2_12, step2_13, step2_14, step2_15;

+  int load1, load2, load3, load4, load5, load6, load7, load8;

+  int result1, result2, result3, result4;

+  const int const_2_power_13 = 8192;

+  uint8_t *dest_pix;

+  uint8_t *cm = vpx_ff_cropTbl;

+  /* prefetch vpx_ff_cropTbl */

+  prefetch_load(vpx_ff_cropTbl);

+  prefetch_load(vpx_ff_cropTbl +  32);

+  prefetch_load(vpx_ff_cropTbl +  64);

+  prefetch_load(vpx_ff_cropTbl +  96);

+  prefetch_load(vpx_ff_cropTbl + 128);

+  prefetch_load(vpx_ff_cropTbl + 160);

+  prefetch_load(vpx_ff_cropTbl + 192);

+  prefetch_load(vpx_ff_cropTbl + 224);

+  for (i = 0; i < 16; ++i) {

+    dest_pix = (dest + i);

+    __asm__ __volatile__ (

+        "lh       %[load1],              0(%[input])                    \n\t"

+        "lh       %[load2],             16(%[input])                    \n\t"

+        "lh       %[load3],              8(%[input])                    \n\t"

+        "lh       %[load4],             24(%[input])                    \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

+        "mthi     $zero,                $ac2                            \n\t"

+        "add      %[result1],           %[load1],       %[load2]        \n\t"

+        "sub      %[result2],           %[load1],       %[load2]        \n\t"

+        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"

+        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"

+        "extp     %[step2_0],           $ac1,           31              \n\t"

+        "extp     %[step2_1],           $ac2,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"

+        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"

+        "extp     %[step2_2],           $ac3,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"

+        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"

+        "extp     %[step2_3],           $ac1,           31              \n\t"

+        "add      %[step1_0],           %[step2_0],     %[step2_3]      \n\t"

+        "add      %[step1_1],           %[step2_1],     %[step2_2]      \n\t"

+        "sub      %[step1_2],           %[step2_1],     %[step2_2]      \n\t"

+        "sub      %[step1_3],           %[step2_0],     %[step2_3]      \n\t"

+        : [load1] "=&r" (load1), [load2] "=&r" (load2),

+          [load3] "=&r" (load3), [load4] "=&r" (load4),

+          [result1] "=&r" (result1), [result2] "=&r" (result2),

+          [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),

+          [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),

+          [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),

+          [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)

+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),

+          [cospi_16_64] "r" (cospi_16_64)

+    );

+    __asm__ __volatile__ (

+        "lh       %[load5],             2(%[input])                     \n\t"

+        "lh       %[load6],             30(%[input])                    \n\t"

+        "lh       %[load7],             18(%[input])                    \n\t"

+        "lh       %[load8],             14(%[input])                    \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "madd     $ac1,                 %[load5],       %[cospi_30_64]  \n\t"

+        "msub     $ac1,                 %[load6],       %[cospi_2_64]   \n\t"

+        "extp     %[result1],           $ac1,           31              \n\t"

+        "madd     $ac3,                 %[load7],       %[cospi_14_64]  \n\t"

+        "msub     $ac3,                 %[load8],       %[cospi_18_64]  \n\t"

+        "extp     %[result2],           $ac3,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

+        "mthi     $zero,                $ac2                            \n\t"

+        "madd     $ac1,                 %[load7],       %[cospi_18_64]  \n\t"

+        "madd     $ac1,                 %[load8],       %[cospi_14_64]  \n\t"

+        "extp     %[result3],           $ac1,           31              \n\t"

+        "madd     $ac2,                 %[load5],        %[cospi_2_64]  \n\t"

+        "madd     $ac2,                 %[load6],        %[cospi_30_64] \n\t"

+        "extp     %[result4],           $ac2,            31             \n\t"

+        "sub      %[load5],             %[result1],     %[result2]      \n\t"

+        "sub      %[load6],             %[result4],     %[result3]      \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "madd     $ac1,                 %[load6],       %[cospi_24_64]  \n\t"

+        "msub     $ac1,                 %[load5],       %[cospi_8_64]   \n\t"

+        "madd     $ac3,                 %[load5],       %[cospi_24_64]  \n\t"

+        "madd     $ac3,                 %[load6],       %[cospi_8_64]   \n\t"

+        "extp     %[step2_9],           $ac1,           31              \n\t"

+        "extp     %[step2_14],          $ac3,           31              \n\t"

+        "add      %[step2_8],           %[result1],     %[result2]      \n\t"

+        "add      %[step2_15],          %[result4],     %[result3]      \n\t"

+        : [load5] "=&r" (load5), [load6] "=&r" (load6),

+          [load7] "=&r" (load7), [load8] "=&r" (load8),

+          [result1] "=&r" (result1), [result2] "=&r" (result2),

+          [result3] "=&r" (result3), [result4] "=&r" (result4),

+          [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),

+          [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)

+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

+          [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),

+          [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),

+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

+    );

+    __asm__ __volatile__ (

+        "lh       %[load1],             10(%[input])                    \n\t"

+        "lh       %[load2],             22(%[input])                    \n\t"

+        "lh       %[load3],             26(%[input])                    \n\t"

+        "lh       %[load4],             6(%[input])                     \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "madd     $ac1,                 %[load1],    %[cospi_22_64]     \n\t"

+        "msub     $ac1,                 %[load2],    %[cospi_10_64]     \n\t"

+        "extp     %[result1],           $ac1,        31                 \n\t"

+        "madd     $ac3,                 %[load3],    %[cospi_6_64]      \n\t"

+        "msub     $ac3,                 %[load4],    %[cospi_26_64]     \n\t"

+        "extp     %[result2],           $ac3,        31                 \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

+        "mthi     $zero,                $ac2                            \n\t"

+        "madd     $ac1,                 %[load1],    %[cospi_10_64]     \n\t"

+        "madd     $ac1,                 %[load2],    %[cospi_22_64]     \n\t"

+        "extp     %[result3],           $ac1,        31                 \n\t"

+        "madd     $ac2,                 %[load3],    %[cospi_26_64]     \n\t"

+        "madd     $ac2,                 %[load4],    %[cospi_6_64]      \n\t"

+        "extp     %[result4],           $ac2,        31                 \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "sub      %[load1],             %[result2],     %[result1]      \n\t"

+        "sub      %[load2],             %[result4],     %[result3]      \n\t"

+        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"

+        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"

+        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"

+        "extp     %[step2_10],          $ac1,           31              \n\t"

+        "extp     %[step2_13],          $ac3,           31              \n\t"

+        "add      %[step2_11],          %[result1],     %[result2]      \n\t"

+        "add      %[step2_12],          %[result4],     %[result3]      \n\t"

+        : [load1] "=&r" (load1), [load2] "=&r" (load2),

+          [load3] "=&r" (load3), [load4] "=&r" (load4),

+          [result1] "=&r" (result1), [result2] "=&r" (result2),

+          [result3] "=&r" (result3), [result4] "=&r" (result4),

+          [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),

+          [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)

+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

+          [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),

+          [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),

+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

+    );

+    __asm__ __volatile__ (

+        "lh       %[load5],             4(%[input])                   \n\t"

+        "lh       %[load6],             28(%[input])                  \n\t"

+        "lh       %[load7],             20(%[input])                  \n\t"

+        "lh       %[load8],             12(%[input])                  \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                          \n\t"

+        "mthi     $zero,                $ac1                          \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                          \n\t"

+        "mthi     $zero,                $ac3                          \n\t"

+        "madd     $ac1,                 %[load5],    %[cospi_28_64]   \n\t"

+        "msub     $ac1,                 %[load6],    %[cospi_4_64]    \n\t"

+        "extp     %[result1],           $ac1,        31               \n\t"

+        "madd     $ac3,                 %[load7],    %[cospi_12_64]   \n\t"

+        "msub     $ac3,                 %[load8],    %[cospi_20_64]   \n\t"

+        "extp     %[result2],           $ac3,        31               \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                          \n\t"

+        "mthi     $zero,                $ac1                          \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                          \n\t"

+        "mthi     $zero,                $ac2                          \n\t"

+        "madd     $ac1,                 %[load7],    %[cospi_20_64]   \n\t"

+        "madd     $ac1,                 %[load8],    %[cospi_12_64]   \n\t"

+        "extp     %[result3],           $ac1,        31               \n\t"

+        "madd     $ac2,                 %[load5],    %[cospi_4_64]    \n\t"

+        "madd     $ac2,                 %[load6],    %[cospi_28_64]   \n\t"

+        "extp     %[result4],           $ac2,        31               \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "sub      %[load5],             %[result4],     %[result3]      \n\t"

+        "sub      %[load5],             %[load5],       %[result1]      \n\t"

+        "add      %[load5],             %[load5],       %[result2]      \n\t"

+        "sub      %[load6],             %[result1],     %[result2]      \n\t"

+        "sub      %[load6],             %[load6],       %[result3]      \n\t"

+        "add      %[load6],             %[load6],       %[result4]      \n\t"

+        "madd     $ac1,                 %[load5],       %[cospi_16_64]  \n\t"

+        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"

+        "extp     %[step1_5],           $ac1,           31              \n\t"

+        "extp     %[step1_6],           $ac3,           31              \n\t"

+        "add      %[step1_4],           %[result1],     %[result2]      \n\t"

+        "add      %[step1_7],           %[result4],     %[result3]      \n\t"

+        : [load5] "=&r" (load5), [load6] "=&r" (load6),

+          [load7] "=&r" (load7), [load8] "=&r" (load8),

+          [result1] "=&r" (result1), [result2] "=&r" (result2),

+          [result3] "=&r" (result3), [result4] "=&r" (result4),

+          [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),

+          [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)

+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

+          [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),

+          [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),

+          [cospi_16_64] "r" (cospi_16_64)

+    );

+    __asm__ __volatile__ (

+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

+        "mthi     $zero,                $ac0                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "sub      %[load5],             %[step2_14],    %[step2_13]     \n\t"

+        "sub      %[load5],             %[load5],       %[step2_9]      \n\t"

+        "add      %[load5],             %[load5],       %[step2_10]     \n\t"

+        "madd     $ac0,                 %[load5],       %[cospi_16_64]  \n\t"

+        "sub      %[load6],             %[step2_14],    %[step2_13]     \n\t"

+        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"

+        "add      %[load6],             %[load6],       %[step2_9]      \n\t"

+        "madd     $ac1,                 %[load6],       %[cospi_16_64]  \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

+        "mthi     $zero,                $ac2                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "sub      %[load5],             %[step2_15],    %[step2_12]     \n\t"

+        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"

+        "add      %[load5],             %[load5],       %[step2_11]     \n\t"

+        "madd     $ac2,                 %[load5],       %[cospi_16_64]  \n\t"

+        "sub      %[load6],             %[step2_15],    %[step2_12]     \n\t"

+        "sub      %[load6],             %[load6],       %[step2_11]     \n\t"

+        "add      %[load6],             %[load6],       %[step2_8]      \n\t"

+        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"

+        "extp     %[step1_10],          $ac0,           31              \n\t"

+        "extp     %[step1_13],          $ac1,           31              \n\t"

+        "extp     %[step1_11],          $ac2,           31              \n\t"

+        "extp     %[step1_12],          $ac3,           31              \n\t"

+        : [load5] "=&r" (load5), [load6] "=&r" (load6),

+          [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),

+          [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)

+        : [const_2_power_13] "r" (const_2_power_13),

+          [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),

+          [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),

+          [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),

+          [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),

+          [cospi_16_64] "r" (cospi_16_64)

+    );

+    step1_8 = step2_8 + step2_11;

+    step1_9 = step2_9 + step2_10;

+    step1_14 = step2_13 + step2_14;

+    step1_15 = step2_12 + step2_15;

+    __asm__ __volatile__ (

+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"

+        "add      %[load5],         %[step1_0],         %[step1_7]      \n\t"

+        "add      %[load5],         %[load5],           %[step1_15]     \n\t"

+        "addi     %[load5],         %[load5],           32              \n\t"

+        "sra      %[load5],         %[load5],           6               \n\t"

+        "add      %[load7],         %[load7],           %[load5]        \n\t"

+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"

+        "add      %[load6],         %[step1_1],         %[step1_6]      \n\t"

+        "add      %[load6],         %[load6],           %[step1_14]     \n\t"

+        "sb       %[load5],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"

+        "addi     %[load6],         %[load6],           32              \n\t"

+        "sra      %[load6],         %[load6],           6               \n\t"

+        "add      %[load8],         %[load8],           %[load6]        \n\t"

+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"

+        "sb       %[load6],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"

+        "add      %[load5],         %[step1_2],         %[step1_5]      \n\t"

+        "add      %[load5],         %[load5],           %[step1_13]     \n\t"

+        "addi     %[load5],         %[load5],           32              \n\t"

+        "sra      %[load5],         %[load5],           6               \n\t"

+        "add      %[load7],         %[load7],           %[load5]        \n\t"

+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"

+        "add      %[load6],         %[step1_3],         %[step1_4]      \n\t"

+        "add      %[load6],         %[load6],           %[step1_12]     \n\t"

+        "sb       %[load5],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"

+        "addi     %[load6],         %[load6],           32              \n\t"

+        "sra      %[load6],         %[load6],           6               \n\t"

+        "add      %[load8],         %[load8],           %[load6]        \n\t"

+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"

+        "sb       %[load6],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"

+        "sub      %[load5],         %[step1_3],         %[step1_4]      \n\t"

+        "add      %[load5],         %[load5],           %[step1_11]     \n\t"

+        "addi     %[load5],         %[load5],           32              \n\t"

+        "sra      %[load5],         %[load5],           6               \n\t"

+        "add      %[load7],         %[load7],           %[load5]        \n\t"

+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"

+        "sub      %[load6],         %[step1_2],         %[step1_5]      \n\t"

+        "add      %[load6],         %[load6],           %[step1_10]     \n\t"

+        "sb       %[load5],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"

+        "addi     %[load6],         %[load6],           32              \n\t"

+        "sra      %[load6],         %[load6],           6               \n\t"

+        "add      %[load8],         %[load8],           %[load6]        \n\t"

+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"

+        "sb       %[load6],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        "sub      %[load5],         %[step1_1],         %[step1_6]      \n\t"

+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"

+        "add      %[load5],         %[load5],           %[step1_9]      \n\t"

+        "addi     %[load5],         %[load5],           32              \n\t"

+        "sra      %[load5],         %[load5],           6               \n\t"

+        "add      %[load7],         %[load7],           %[load5]        \n\t"

+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"

+        "sub      %[load6],         %[step1_0],         %[step1_7]      \n\t"

+        "add      %[load6],         %[load6],           %[step1_8]      \n\t"

+        "sb       %[load5],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"

+        "addi     %[load6],         %[load6],           32              \n\t"

+        "sra      %[load6],         %[load6],           6               \n\t"

+        "add      %[load8],         %[load8],           %[load6]        \n\t"

+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"

+        "sb       %[load6],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"

+        "sub      %[load5],         %[step1_0],         %[step1_7]      \n\t"

+        "sub      %[load5],         %[load5],           %[step1_8]      \n\t"

+        "addi     %[load5],         %[load5],           32              \n\t"

+        "sra      %[load5],         %[load5],           6               \n\t"

+        "add      %[load7],         %[load7],           %[load5]        \n\t"

+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"

+        "sub      %[load6],         %[step1_1],         %[step1_6]      \n\t"

+        "sub      %[load6],         %[load6],           %[step1_9]      \n\t"

+        "sb       %[load5],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"

+        "addi     %[load6],         %[load6],           32              \n\t"

+        "sra      %[load6],         %[load6],           6               \n\t"

+        "add      %[load8],         %[load8],           %[load6]        \n\t"

+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"

+        "sb       %[load6],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"

+        "sub      %[load5],         %[step1_2],         %[step1_5]      \n\t"

+        "sub      %[load5],         %[load5],           %[step1_10]     \n\t"

+        "addi     %[load5],         %[load5],           32              \n\t"

+        "sra      %[load5],         %[load5],           6               \n\t"

+        "add      %[load7],         %[load7],           %[load5]        \n\t"

+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"

+        "sub      %[load6],         %[step1_3],         %[step1_4]      \n\t"

+        "sub      %[load6],         %[load6],           %[step1_11]     \n\t"

+        "sb       %[load5],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"

+        "addi     %[load6],         %[load6],           32              \n\t"

+        "sra      %[load6],         %[load6],           6               \n\t"

+        "add      %[load8],         %[load8],           %[load6]        \n\t"

+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"

+        "sb       %[load6],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"

+        "add      %[load5],         %[step1_3],         %[step1_4]      \n\t"

+        "sub      %[load5],         %[load5],           %[step1_12]     \n\t"

+        "addi     %[load5],         %[load5],           32              \n\t"

+        "sra      %[load5],         %[load5],           6               \n\t"

+        "add      %[load7],         %[load7],           %[load5]        \n\t"

+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"

+        "add      %[load6],         %[step1_2],         %[step1_5]      \n\t"

+        "sub      %[load6],         %[load6],           %[step1_13]     \n\t"

+        "sb       %[load5],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"

+        "addi     %[load6],         %[load6],           32              \n\t"

+        "sra      %[load6],         %[load6],           6               \n\t"

+        "add      %[load8],         %[load8],           %[load6]        \n\t"

+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"

+        "sb       %[load6],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"

+        "add      %[load5],         %[step1_1],         %[step1_6]      \n\t"

+        "sub      %[load5],         %[load5],           %[step1_14]     \n\t"

+        "addi     %[load5],         %[load5],           32              \n\t"

+        "sra      %[load5],         %[load5],           6               \n\t"

+        "add      %[load7],         %[load7],           %[load5]        \n\t"

+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"

+        "add      %[load6],         %[step1_0],         %[step1_7]      \n\t"

+        "sub      %[load6],         %[load6],           %[step1_15]     \n\t"

+        "sb       %[load5],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"

+        "addi     %[load6],         %[load6],           32              \n\t"

+        "sra      %[load6],         %[load6],           6               \n\t"

+        "add      %[load8],         %[load8],           %[load6]        \n\t"

+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"

+        "sb       %[load6],         0(%[dest_pix])                      \n\t"

+        : [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" (load7),

+          [load8] "=&r" (load8), [dest_pix] "+r" (dest_pix)

+        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),

+          [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),

+          [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),

+          [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),

+          [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),

+          [step1_8] "r" (step1_8), [step1_9] "r" (step1_9),

+          [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),

+          [step1_12] "r" (step1_12), [step1_13] "r" (step1_13),

+          [step1_14] "r" (step1_14), [step1_15] "r" (step1_15)

+    );

+    input += 16;

+  }

+}

+void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,

+                                 int dest_stride) {  /*

+   * Inverse-transforms a full 16x16 coefficient block and adds it to dest.

+   * A row pass over all 16 rows writes the scratch buffer `out`; a column

+   * pass then reads `out` and accumulates into dest, stepping by dest_stride.

+   */

+  DECLARE_ALIGNED(32, int16_t,  out[16 * 16]);

+  uint32_t pos = 45;

+  /* bit position for extract from acc; wrdsp stores it in the DSP control

+   * register, so this must run before either transform pass */

+  __asm__ __volatile__ (

+    "wrdsp    %[pos],    1    \n\t"

+    :

+    : [pos] "r" (pos)

+  );

+  // First transform rows (last argument is the number of rows to process)

+  idct16_rows_dspr2(input, out, 16);

+  // Then transform columns and add to dest

+  idct16_cols_add_blk_dspr2(out, dest, dest_stride);

+}

+/*

+ * Inverse 16x16 transform for blocks whose non-zero coefficients all lie in

+ * the upper-left 4x4 area; the result is added to dest.

+ *

+ * input:       coefficient block.

+ * dest:        destination pixels, updated in place.

+ * dest_stride: distance in bytes between consecutive destination rows.

+ */

+void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,

+                                int dest_stride) {

+  DECLARE_ALIGNED(32, int16_t,  out[16 * 16]);

+  int16_t *outptr = out;

+  uint32_t i;

+  uint32_t pos = 45;

+  /* bit position for extract from acc */

+  __asm__ __volatile__ (

+    "wrdsp    %[pos],    1    \n\t"

+    :

+    : [pos] "r" (pos)

+  );

+  // First transform rows. Since all non-zero dct coefficients are in

+  // upper-left 4x4 area, we only need to calculate first 4 rows here.

+  idct16_rows_dspr2(input, outptr, 4);

+  // Zero the rest of the scratch buffer. Each iteration clears one 32-bit

+  // word (two int16_t entries) at a 32-byte pitch, i.e. once in each of the

+  // 16 rows of out; six iterations starting at entry 4 and advancing by two

+  // entries clear entries 4..15 of every row. Presumably the 4-row pass above

+  // fills only entries 0..3 of each row (verify against idct16_rows_dspr2).

+  outptr += 4;

+  for (i = 0; i < 6; ++i) {

+    __asm__ __volatile__ (

+        "sw     $zero,    0(%[outptr])     \n\t"

+        "sw     $zero,   32(%[outptr])     \n\t"

+        "sw     $zero,   64(%[outptr])     \n\t"

+        "sw     $zero,   96(%[outptr])     \n\t"

+        "sw     $zero,  128(%[outptr])     \n\t"

+        "sw     $zero,  160(%[outptr])     \n\t"

+        "sw     $zero,  192(%[outptr])     \n\t"

+        "sw     $zero,  224(%[outptr])     \n\t"

+        "sw     $zero,  256(%[outptr])     \n\t"

+        "sw     $zero,  288(%[outptr])     \n\t"

+        "sw     $zero,  320(%[outptr])     \n\t"

+        "sw     $zero,  352(%[outptr])     \n\t"

+        "sw     $zero,  384(%[outptr])     \n\t"

+        "sw     $zero,  416(%[outptr])     \n\t"

+        "sw     $zero,  448(%[outptr])     \n\t"

+        "sw     $zero,  480(%[outptr])     \n\t"

+        :

+        : [outptr] "r" (outptr)

+        /* the stores modify out[], which is not listed as an operand */

+        : "memory"

+    );

+    outptr += 2;

+  }

+  // Then transform columns

+  idct16_cols_add_blk_dspr2(out, dest, dest_stride);

+}

+/*

+ * DC-only 16x16 inverse transform: derives a single offset a1 from input[0]

+ * and adds it, with unsigned 8-bit saturation, to every pixel of the 16x16

+ * destination block.

+ *

+ * input:       coefficient block; only input[0] is read.

+ * dest:        destination pixels, updated in place. Rows are accessed with

+ *              word loads/stores, so each row must be four byte aligned.

+ * dest_stride: distance in bytes between consecutive destination rows.

+ */

+void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,

+                               int dest_stride) {

+  uint32_t pos = 45;

+  int32_t out;

+  int32_t r;

+  int32_t a1, absa1;

+  int32_t vector_a1;

+  int32_t t1, t2, t3, t4;

+  int32_t vector_1, vector_2, vector_3, vector_4;

+  /* bit position for extract from acc */

+  __asm__ __volatile__ (

+    "wrdsp      %[pos],     1           \n\t"

+    :

+    : [pos] "r" (pos)

+  );

+  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);

+  /* a1 = (out + 32) >> 6 */

+  __asm__ __volatile__ (

+      "addi     %[out],     %[out],     32      \n\t"

+      "sra      %[a1],      %[out],     6       \n\t"

+      : [out] "+r" (out), [a1] "=r" (a1)

+      :

+  );

+  if (a1 < 0) {

+    __asm__ __volatile__ (

+        "abs        %[absa1],       %[a1]       \n\t"

+        : [absa1] "=r" (absa1)

+        : [a1] "r" (a1)

+    );

+    /* replv.qb replicates only the low 8 bits of its source, so a magnitude

+     * above 255 would wrap (256 -> 0). The subtraction below saturates at 0,

+     * hence subtracting 255 equals subtracting any larger value. */

+    if (absa1 > 255) {

+      absa1 = 255;

+    }

+    /* use quad-byte

+     * input and output memory are four byte aligned */

+    __asm__ __volatile__ (

+        "replv.qb   %[vector_a1],   %[absa1]    \n\t"

+        : [vector_a1] "=r" (vector_a1)

+        : [absa1] "r" (absa1)

+    );

+    for (r = 16; r--;) {

+      __asm__ __volatile__ (

+          "lw             %[t1],          0(%[dest])                      \n\t"

+          "lw             %[t2],          4(%[dest])                      \n\t"

+          "lw             %[t3],          8(%[dest])                      \n\t"

+          "lw             %[t4],          12(%[dest])                     \n\t"

+          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"

+          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"

+          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"

+          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"

+          "sw             %[vector_1],    0(%[dest])                      \n\t"

+          "sw             %[vector_2],    4(%[dest])                      \n\t"

+          "sw             %[vector_3],    8(%[dest])                      \n\t"

+          "sw             %[vector_4],    12(%[dest])                     \n\t"

+          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"

+          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),

+            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),

+            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),

+            [dest] "+&r" (dest)

+          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)

+          /* the row is read and written through dest, not via operands */

+          : "memory"

+      );

+    }

+  } else {

+    /* Same 8-bit limit as above: the addition below saturates at 255, so

+     * adding 255 equals adding any larger value. */

+    absa1 = (a1 > 255) ? 255 : a1;

+    /* use quad-byte

+     * input and output memory are four byte aligned */

+    __asm__ __volatile__ (

+        "replv.qb   %[vector_a1],   %[absa1]    \n\t"

+        : [vector_a1] "=r" (vector_a1)

+        : [absa1] "r" (absa1)

+    );

+    for (r = 16; r--;) {

+      __asm__ __volatile__ (

+          "lw             %[t1],          0(%[dest])                      \n\t"

+          "lw             %[t2],          4(%[dest])                      \n\t"

+          "lw             %[t3],          8(%[dest])                      \n\t"

+          "lw             %[t4],          12(%[dest])                     \n\t"

+          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"

+          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"

+          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"

+          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"

+          "sw             %[vector_1],    0(%[dest])                      \n\t"

+          "sw             %[vector_2],    4(%[dest])                      \n\t"

+          "sw             %[vector_3],    8(%[dest])                      \n\t"

+          "sw             %[vector_4],    12(%[dest])                     \n\t"

+          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"

+          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),

+            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),

+            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),

+            [dest] "+&r" (dest)

+          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)

+          /* the row is read and written through dest, not via operands */

+          : "memory"

+      );

+    }

+  }

+}

+/*

+ * One-dimensional 16-point transform (inverse ADST, going by the name),

+ * computed in plain C as four butterfly stages.

+ *

+ * input:  16 coefficients; all of them are read before output is written.

+ * output: 16 results. An all-zero input yields an all-zero output.

+ */

+void iadst16_dspr2(const int16_t *input, int16_t *output) {

+  int s[16];

+  int x[16];

+  int nonzero_bits = 0;

+  int k;

+  // Load the coefficients in the permuted order stage 1 consumes: even slots

+  // take input[15], input[13], ..., odd slots take input[0], input[2], ...

+  for (k = 0; k < 8; ++k) {

+    x[2 * k] = input[15 - 2 * k];

+    x[2 * k + 1] = input[2 * k];

+    nonzero_bits |= x[2 * k] | x[2 * k + 1];

+  }

+  // Nothing to compute when every coefficient is zero.

+  if (nonzero_bits == 0) {

+    for (k = 0; k < 16; ++k) {

+      output[k] = 0;

+    }

+    return;

+  }

+  // stage 1: eight rotations, then butterflies between slot k and slot k + 8

+  s[0]  = x[0]  * cospi_1_64  + x[1]  * cospi_31_64;

+  s[1]  = x[0]  * cospi_31_64 - x[1]  * cospi_1_64;

+  s[2]  = x[2]  * cospi_5_64  + x[3]  * cospi_27_64;

+  s[3]  = x[2]  * cospi_27_64 - x[3]  * cospi_5_64;

+  s[4]  = x[4]  * cospi_9_64  + x[5]  * cospi_23_64;

+  s[5]  = x[4]  * cospi_23_64 - x[5]  * cospi_9_64;

+  s[6]  = x[6]  * cospi_13_64 + x[7]  * cospi_19_64;

+  s[7]  = x[6]  * cospi_19_64 - x[7]  * cospi_13_64;

+  s[8]  = x[8]  * cospi_17_64 + x[9]  * cospi_15_64;

+  s[9]  = x[8]  * cospi_15_64 - x[9]  * cospi_17_64;

+  s[10] = x[10] * cospi_21_64 + x[11] * cospi_11_64;

+  s[11] = x[10] * cospi_11_64 - x[11] * cospi_21_64;

+  s[12] = x[12] * cospi_25_64 + x[13] * cospi_7_64;

+  s[13] = x[12] * cospi_7_64  - x[13] * cospi_25_64;

+  s[14] = x[14] * cospi_29_64 + x[15] * cospi_3_64;

+  s[15] = x[14] * cospi_3_64  - x[15] * cospi_29_64;

+  for (k = 0; k < 8; ++k) {

+    x[k]     = dct_const_round_shift(s[k] + s[k + 8]);

+    x[k + 8] = dct_const_round_shift(s[k] - s[k + 8]);

+  }

+  // stage 2: lower half passes through, upper half is rotated; butterflies

+  // then pair slot k with slot k + 4 inside each half

+  for (k = 0; k < 8; ++k) {

+    s[k] = x[k];

+  }

+  s[8]  =  x[8]  * cospi_4_64  + x[9]  * cospi_28_64;

+  s[9]  =  x[8]  * cospi_28_64 - x[9]  * cospi_4_64;

+  s[10] =  x[10] * cospi_20_64 + x[11] * cospi_12_64;

+  s[11] =  x[10] * cospi_12_64 - x[11] * cospi_20_64;

+  s[12] = -x[12] * cospi_28_64 + x[13] * cospi_4_64;

+  s[13] =  x[12] * cospi_4_64  + x[13] * cospi_28_64;

+  s[14] = -x[14] * cospi_12_64 + x[15] * cospi_20_64;

+  s[15] =  x[14] * cospi_20_64 + x[15] * cospi_12_64;

+  for (k = 0; k < 4; ++k) {

+    x[k]      = s[k] + s[k + 4];

+    x[k + 4]  = s[k] - s[k + 4];

+    x[k + 8]  = dct_const_round_shift(s[k + 8] + s[k + 12]);

+    x[k + 12] = dct_const_round_shift(s[k + 8] - s[k + 12]);

+  }

+  // stage 3: the two groups of eight slots (k = 0 and k = 8) are processed

+  // identically: four pass-throughs, two rotations, then butterflies

+  for (k = 0; k < 16; k += 8) {

+    s[k]     = x[k];

+    s[k + 1] = x[k + 1];

+    s[k + 2] = x[k + 2];

+    s[k + 3] = x[k + 3];

+    s[k + 4] =  x[k + 4] * cospi_8_64  + x[k + 5] * cospi_24_64;

+    s[k + 5] =  x[k + 4] * cospi_24_64 - x[k + 5] * cospi_8_64;

+    s[k + 6] = -x[k + 6] * cospi_24_64 + x[k + 7] * cospi_8_64;

+    s[k + 7] =  x[k + 6] * cospi_8_64  + x[k + 7] * cospi_24_64;

+  }

+  for (k = 0; k < 16; k += 8) {

+    x[k]     = s[k]     + s[k + 2];

+    x[k + 1] = s[k + 1] + s[k + 3];

+    x[k + 2] = s[k]     - s[k + 2];

+    x[k + 3] = s[k + 1] - s[k + 3];

+    x[k + 4] = dct_const_round_shift(s[k + 4] + s[k + 6]);

+    x[k + 5] = dct_const_round_shift(s[k + 5] + s[k + 7]);

+    x[k + 6] = dct_const_round_shift(s[k + 4] - s[k + 6]);

+    x[k + 7] = dct_const_round_shift(s[k + 5] - s[k + 7]);

+  }

+  // stage 4: only slot pairs (2,3), (6,7), (10,11) and (14,15) change

+  s[2]  = (-cospi_16_64) * (x[2] + x[3]);

+  s[3]  = cospi_16_64 * (x[2] - x[3]);

+  s[6]  = cospi_16_64 * (x[6] + x[7]);

+  s[7]  = cospi_16_64 * (-x[6] + x[7]);

+  s[10] = cospi_16_64 * (x[10] + x[11]);

+  s[11] = cospi_16_64 * (-x[10] + x[11]);

+  s[14] = (-cospi_16_64) * (x[14] + x[15]);

+  s[15] = cospi_16_64 * (x[14] - x[15]);

+  for (k = 2; k < 16; k += 4) {

+    x[k]     = dct_const_round_shift(s[k]);

+    x[k + 1] = dct_const_round_shift(s[k + 1]);

+  }

+  // Scatter the results into output order, negating four of them.

+  output[0]  =  x[0];

+  output[1]  = -x[8];

+  output[2]  =  x[12];

+  output[3]  = -x[4];

+  output[4]  =  x[6];

+  output[5]  =  x[14];

+  output[6]  =  x[10];

+  output[7]  =  x[2];

+  output[8]  =  x[3];

+  output[9]  =  x[11];

+  output[10] =  x[15];

+  output[11] =  x[7];

+  output[12] =  x[5];

+  output[13] = -x[13];

+  output[14] =  x[9];

+  output[15] = -x[1];

+}

+#endif  // HAVE_DSPR2

--- /dev/null

+++ b/vpx_dsp/mips/itrans32_cols_dspr2.c

@@ -1,0 +1,1068 @@

+/*

+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "./vpx_config.h"

+#include "vpx_dsp/mips/inv_txfm_dspr2.h"

+#include "vpx_dsp/txfm_common.h"

+#if HAVE_DSPR2

+void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,

+                                   int dest_stride) {

+  int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;

+  int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;

+  int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19;

+  int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26;

+  int16_t step1_27, step1_28, step1_29, step1_30, step1_31;

+  int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;

+  int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;

+  int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;

+  int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;

+  int16_t step2_28, step2_29, step2_30, step2_31;

+  int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;

+  int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;

+  int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27;

+  int16_t step3_28, step3_29, step3_30, step3_31;

+  int temp0, temp1, temp2, temp3;

+  int load1, load2, load3, load4;

+  int result1, result2;

+  int i, temp21;

+  uint8_t *dest_pix, *dest_pix1;

+  const int const_2_power_13 = 8192;

+  uint8_t *cm = vpx_ff_cropTbl;

+  /* prefetch vpx_ff_cropTbl */

+  prefetch_load(vpx_ff_cropTbl);

+  prefetch_load(vpx_ff_cropTbl +  32);

+  prefetch_load(vpx_ff_cropTbl +  64);

+  prefetch_load(vpx_ff_cropTbl +  96);

+  prefetch_load(vpx_ff_cropTbl + 128);

+  prefetch_load(vpx_ff_cropTbl + 160);

+  prefetch_load(vpx_ff_cropTbl + 192);

+  prefetch_load(vpx_ff_cropTbl + 224);

+  for (i = 0; i < 32; ++i) {

+    dest_pix = dest + i;

+    dest_pix1 = dest + i + 31 * dest_stride;

+    __asm__ __volatile__ (

+        "lh       %[load1],             2(%[input])                     \n\t"

+        "lh       %[load2],             62(%[input])                    \n\t"

+        "lh       %[load3],             34(%[input])                    \n\t"

+        "lh       %[load4],             30(%[input])                    \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "madd     $ac1,                 %[load1],       %[cospi_31_64]  \n\t"

+        "msub     $ac1,                 %[load2],       %[cospi_1_64]   \n\t"

+        "extp     %[temp0],             $ac1,           31              \n\t"

+        "madd     $ac3,                 %[load1],       %[cospi_1_64]   \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_31_64]  \n\t"

+        "extp     %[temp3],             $ac3,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

+        "mthi     $zero,                $ac2                            \n\t"

+        "madd     $ac2,                 %[load3],       %[cospi_15_64]  \n\t"

+        "msub     $ac2,                 %[load4],       %[cospi_17_64]  \n\t"

+        "extp     %[temp1],             $ac2,           31              \n\t"

+        "madd     $ac1,                 %[load3],       %[cospi_17_64]  \n\t"

+        "madd     $ac1,                 %[load4],       %[cospi_15_64]  \n\t"

+        "extp     %[temp2],             $ac1,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "sub      %[load1],             %[temp3],       %[temp2]        \n\t"

+        "sub      %[load2],             %[temp0],       %[temp1]        \n\t"

+        "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"

+        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"

+        "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"

+        "extp     %[step1_17],          $ac1,           31              \n\t"

+        "extp     %[step1_30],          $ac3,           31              \n\t"

+        "add      %[step1_16],          %[temp0],       %[temp1]        \n\t"

+        "add      %[step1_31],          %[temp2],       %[temp3]        \n\t"

+        : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),

+          [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

+          [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17),

+          [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31)

+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

+          [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64),

+          [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64),

+          [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64)

+    );

+    __asm__ __volatile__ (

+        "lh       %[load1],             18(%[input])                    \n\t"

+        "lh       %[load2],             46(%[input])                    \n\t"

+        "lh       %[load3],             50(%[input])                    \n\t"

+        "lh       %[load4],             14(%[input])                    \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "madd     $ac1,                 %[load1],       %[cospi_23_64]  \n\t"

+        "msub     $ac1,                 %[load2],       %[cospi_9_64]   \n\t"

+        "extp     %[temp0],             $ac1,           31              \n\t"

+        "madd     $ac3,                 %[load1],       %[cospi_9_64]   \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_23_64]  \n\t"

+        "extp     %[temp3],             $ac3,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

+        "mthi     $zero,                $ac2                            \n\t"

+        "madd     $ac2,                 %[load3],       %[cospi_7_64]   \n\t"

+        "msub     $ac2,                 %[load4],       %[cospi_25_64]  \n\t"

+        "extp     %[temp1],             $ac2,           31              \n\t"

+        "madd     $ac1,                 %[load3],       %[cospi_25_64]  \n\t"

+        "madd     $ac1,                 %[load4],       %[cospi_7_64]   \n\t"

+        "extp     %[temp2],             $ac1,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"

+        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"

+        "msub     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"

+        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"

+        "msub     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"

+        "extp     %[step1_18],          $ac1,           31              \n\t"

+        "extp     %[step1_29],          $ac3,           31              \n\t"

+        "add      %[step1_19],          %[temp0],       %[temp1]        \n\t"

+        "add      %[step1_28],          %[temp2],       %[temp3]        \n\t"

+        : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),

+          [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

+          [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19),

+          [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29)

+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

+          [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64),

+          [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64),

+          [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64)

+    );

+    __asm__ __volatile__ (

+        "lh       %[load1],             10(%[input])                    \n\t"

+        "lh       %[load2],             54(%[input])                    \n\t"

+        "lh       %[load3],             42(%[input])                    \n\t"

+        "lh       %[load4],             22(%[input])                    \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "madd     $ac1,                 %[load1],       %[cospi_27_64]  \n\t"

+        "msub     $ac1,                 %[load2],       %[cospi_5_64]   \n\t"

+        "extp     %[temp0],             $ac1,           31              \n\t"

+        "madd     $ac3,                 %[load1],       %[cospi_5_64]   \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_27_64]  \n\t"

+        "extp     %[temp3],             $ac3,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

+        "mthi     $zero,                $ac2                            \n\t"

+        "madd     $ac2,                 %[load3],       %[cospi_11_64]  \n\t"

+        "msub     $ac2,                 %[load4],       %[cospi_21_64]  \n\t"

+        "extp     %[temp1],             $ac2,           31              \n\t"

+        "madd     $ac1,                 %[load3],       %[cospi_21_64]  \n\t"

+        "madd     $ac1,                 %[load4],       %[cospi_11_64]  \n\t"

+        "extp     %[temp2],             $ac1,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "sub      %[load1],             %[temp0],       %[temp1]        \n\t"

+        "sub      %[load2],             %[temp3],       %[temp2]        \n\t"

+        "madd     $ac1,                 %[load2],       %[cospi_12_64]  \n\t"

+        "msub     $ac1,                 %[load1],       %[cospi_20_64]  \n\t"

+        "madd     $ac3,                 %[load1],       %[cospi_12_64]  \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_20_64]  \n\t"

+        "extp     %[step1_21],          $ac1,           31              \n\t"

+        "extp     %[step1_26],          $ac3,           31              \n\t"

+        "add      %[step1_20],          %[temp0],       %[temp1]        \n\t"

+        "add      %[step1_27],          %[temp2],       %[temp3]        \n\t"

+        : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),

+          [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

+          [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21),

+          [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27)

+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

+          [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64),

+          [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64),

+          [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)

+    );

+    __asm__ __volatile__ (

+        "lh       %[load1],             26(%[input])                    \n\t"

+        "lh       %[load2],             38(%[input])                    \n\t"

+        "lh       %[load3],             58(%[input])                    \n\t"

+        "lh       %[load4],              6(%[input])                    \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "madd     $ac1,                 %[load1],       %[cospi_19_64]  \n\t"

+        "msub     $ac1,                 %[load2],       %[cospi_13_64]  \n\t"

+        "extp     %[temp0],             $ac1,           31              \n\t"

+        "madd     $ac3,                 %[load1],       %[cospi_13_64]  \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_19_64]  \n\t"

+        "extp     %[temp3],             $ac3,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

+        "mthi     $zero,                $ac2                            \n\t"

+        "madd     $ac2,                 %[load3],       %[cospi_3_64]   \n\t"

+        "msub     $ac2,                 %[load4],       %[cospi_29_64]  \n\t"

+        "extp     %[temp1],             $ac2,           31              \n\t"

+        "madd     $ac1,                 %[load3],       %[cospi_29_64]  \n\t"

+        "madd     $ac1,                 %[load4],       %[cospi_3_64]   \n\t"

+        "extp     %[temp2],             $ac1,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"

+        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"

+        "msub     $ac1,                 %[load1],       %[cospi_12_64]  \n\t"

+        "msub     $ac1,                 %[load2],       %[cospi_20_64]  \n\t"

+        "msub     $ac3,                 %[load1],       %[cospi_20_64]  \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_12_64]  \n\t"

+        "extp     %[step1_22],          $ac1,           31              \n\t"

+        "extp     %[step1_25],          $ac3,           31              \n\t"

+        "add      %[step1_23],          %[temp0],       %[temp1]        \n\t"

+        "add      %[step1_24],          %[temp2],       %[temp3]        \n\t"

+        : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),

+          [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

+          [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23),

+          [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25)

+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

+          [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64),

+          [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64),

+          [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)

+    );

+    __asm__ __volatile__ (

+        "lh       %[load1],              4(%[input])                    \n\t"

+        "lh       %[load2],             60(%[input])                    \n\t"

+        "lh       %[load3],             36(%[input])                    \n\t"

+        "lh       %[load4],             28(%[input])                    \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "madd     $ac1,                 %[load1],       %[cospi_30_64]  \n\t"

+        "msub     $ac1,                 %[load2],       %[cospi_2_64]   \n\t"

+        "extp     %[temp0],             $ac1,           31              \n\t"

+        "madd     $ac3,                 %[load1],       %[cospi_2_64]   \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_30_64]  \n\t"

+        "extp     %[temp3],             $ac3,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

+        "mthi     $zero,                $ac2                            \n\t"

+        "madd     $ac2,                 %[load3],       %[cospi_14_64]  \n\t"

+        "msub     $ac2,                 %[load4],       %[cospi_18_64]  \n\t"

+        "extp     %[temp1],             $ac2,           31              \n\t"

+        "madd     $ac1,                 %[load3],       %[cospi_18_64]  \n\t"

+        "madd     $ac1,                 %[load4],       %[cospi_14_64]  \n\t"

+        "extp     %[temp2],             $ac1,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "sub      %[load1],             %[temp0],       %[temp1]        \n\t"

+        "sub      %[load2],             %[temp3],       %[temp2]        \n\t"

+        "msub     $ac1,                 %[load1],       %[cospi_8_64]   \n\t"

+        "madd     $ac1,                 %[load2],       %[cospi_24_64]  \n\t"

+        "madd     $ac3,                 %[load1],       %[cospi_24_64]  \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_8_64]   \n\t"

+        "extp     %[step2_9],           $ac1,           31              \n\t"

+        "extp     %[step2_14],          $ac3,           31              \n\t"

+        "add      %[step2_8],           %[temp0],       %[temp1]        \n\t"

+        "add      %[step2_15],          %[temp2],       %[temp3]        \n\t"

+        : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),

+          [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

+          [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9),

+          [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15)

+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

+          [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),

+          [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),

+          [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)

+    );

+    __asm__ __volatile__ (

+        "lh       %[load1],             20(%[input])                    \n\t"

+        "lh       %[load2],             44(%[input])                    \n\t"

+        "lh       %[load3],             52(%[input])                    \n\t"

+        "lh       %[load4],             12(%[input])                    \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "madd     $ac1,                 %[load1],       %[cospi_22_64]  \n\t"

+        "msub     $ac1,                 %[load2],       %[cospi_10_64]  \n\t"

+        "extp     %[temp0],             $ac1,           31              \n\t"

+        "madd     $ac3,                 %[load1],       %[cospi_10_64]  \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_22_64]  \n\t"

+        "extp     %[temp3],             $ac3,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

+        "mthi     $zero,                $ac2                            \n\t"

+        "madd     $ac2,                 %[load3],       %[cospi_6_64]   \n\t"

+        "msub     $ac2,                 %[load4],       %[cospi_26_64]  \n\t"

+        "extp     %[temp1],             $ac2,           31              \n\t"

+        "madd     $ac1,                 %[load3],       %[cospi_26_64]  \n\t"

+        "madd     $ac1,                 %[load4],       %[cospi_6_64]   \n\t"

+        "extp     %[temp2],             $ac1,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"

+        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"

+        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"

+        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"

+        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"

+        "extp     %[step2_10],          $ac1,           31              \n\t"

+        "extp     %[step2_13],          $ac3,           31              \n\t"

+        "add      %[step2_11],          %[temp0],       %[temp1]        \n\t"

+        "add      %[step2_12],          %[temp2],       %[temp3]        \n\t"

+        : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),

+          [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

+          [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),

+          [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)

+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

+          [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),

+          [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),

+          [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)

+    );

+    __asm__ __volatile__ (

+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

+        "mthi     $zero,                $ac0                            \n\t"

+        "sub      %[temp0],             %[step2_14],    %[step2_13]     \n\t"

+        "sub      %[temp0],             %[temp0],       %[step2_9]      \n\t"

+        "add      %[temp0],             %[temp0],       %[step2_10]     \n\t"

+        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "sub      %[temp1],             %[step2_14],    %[step2_13]     \n\t"

+        "add      %[temp1],             %[temp1],       %[step2_9]      \n\t"

+        "sub      %[temp1],             %[temp1],       %[step2_10]     \n\t"

+        "madd     $ac1,                 %[temp1],       %[cospi_16_64]  \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

+        "mthi     $zero,                $ac2                            \n\t"

+        "sub      %[temp0],             %[step2_15],    %[step2_12]     \n\t"

+        "sub      %[temp0],             %[temp0],       %[step2_8]      \n\t"

+        "add      %[temp0],             %[temp0],       %[step2_11]     \n\t"

+        "madd     $ac2,                 %[temp0],       %[cospi_16_64]  \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "sub      %[temp1],             %[step2_15],    %[step2_12]     \n\t"

+        "add      %[temp1],             %[temp1],       %[step2_8]      \n\t"

+        "sub      %[temp1],             %[temp1],       %[step2_11]     \n\t"

+        "madd     $ac3,                 %[temp1],       %[cospi_16_64]  \n\t"

+        "add      %[step3_8],           %[step2_8],     %[step2_11]     \n\t"

+        "add      %[step3_9],           %[step2_9],     %[step2_10]     \n\t"

+        "add      %[step3_14],          %[step2_13],    %[step2_14]     \n\t"

+        "add      %[step3_15],          %[step2_12],    %[step2_15]     \n\t"

+        "extp     %[step3_10],          $ac0,           31              \n\t"

+        "extp     %[step3_13],          $ac1,           31              \n\t"

+        "extp     %[step3_11],          $ac2,           31              \n\t"

+        "extp     %[step3_12],          $ac3,           31              \n\t"

+        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

+          [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9),

+          [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11),

+          [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13),

+          [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15)

+        : [const_2_power_13] "r" (const_2_power_13), [step2_8] "r" (step2_8),

+          [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),

+          [step2_11] "r" (step2_11), [step2_12] "r" (step2_12),

+          [step2_13] "r" (step2_13), [step2_14] "r" (step2_14),

+          [step2_15] "r" (step2_15), [cospi_16_64] "r" (cospi_16_64)

+    );

+    step2_18 = step1_17 - step1_18;

+    step2_29 = step1_30 - step1_29;

+    __asm__ __volatile__ (

+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

+        "mthi     $zero,                $ac0                            \n\t"

+        "msub     $ac0,                 %[step2_18],    %[cospi_8_64]   \n\t"

+        "madd     $ac0,                 %[step2_29],    %[cospi_24_64]  \n\t"

+        "extp     %[step3_18],          $ac0,           31              \n\t"

+        : [step3_18] "=r" (step3_18)

+        : [const_2_power_13] "r" (const_2_power_13),

+          [step2_18] "r" (step2_18), [step2_29] "r" (step2_29),

+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

+    );

+    temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;

+    step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

+    step2_19 = step1_16 - step1_19;

+    step2_28 = step1_31 - step1_28;

+    __asm__ __volatile__ (

+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

+        "mthi     $zero,                $ac0                            \n\t"

+        "msub     $ac0,                 %[step2_19],    %[cospi_8_64]   \n\t"

+        "madd     $ac0,                 %[step2_28],    %[cospi_24_64]  \n\t"

+        "extp     %[step3_19],          $ac0,           31              \n\t"

+        : [step3_19] "=r" (step3_19)

+        : [const_2_power_13] "r" (const_2_power_13),

+          [step2_19] "r" (step2_19), [step2_28] "r" (step2_28),

+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

+    );

+    temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;

+    step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

+    step3_16 = step1_16 + step1_19;

+    step3_17 = step1_17 + step1_18;

+    step3_30 = step1_29 + step1_30;

+    step3_31 = step1_28 + step1_31;

+    step2_20 = step1_23 - step1_20;

+    step2_27 = step1_24 - step1_27;

+    __asm__ __volatile__ (

+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

+        "mthi     $zero,                $ac0                            \n\t"

+        "msub     $ac0,                 %[step2_20],    %[cospi_24_64]  \n\t"

+        "msub     $ac0,                 %[step2_27],    %[cospi_8_64]   \n\t"

+        "extp     %[step3_20],          $ac0,           31              \n\t"

+        : [step3_20] "=r" (step3_20)

+        : [const_2_power_13] "r" (const_2_power_13),

+          [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),

+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

+    );

+    temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;

+    step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

+    step2_21 = step1_22 - step1_21;

+    step2_26 = step1_25 - step1_26;

+    __asm__ __volatile__ (

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "msub     $ac1,                 %[step2_21],    %[cospi_24_64]  \n\t"

+        "msub     $ac1,                 %[step2_26],    %[cospi_8_64]   \n\t"

+        "extp     %[step3_21],          $ac1,           31              \n\t"

+        : [step3_21] "=r" (step3_21)

+        : [const_2_power_13] "r" (const_2_power_13),

+          [step2_21] "r" (step2_21), [step2_26] "r" (step2_26),

+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

+    );

+    temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;

+    step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

+    step3_22 = step1_21 + step1_22;

+    step3_23 = step1_20 + step1_23;

+    step3_24 = step1_24 + step1_27;

+    step3_25 = step1_25 + step1_26;

+    step2_16 = step3_16 + step3_23;

+    step2_17 = step3_17 + step3_22;

+    step2_18 = step3_18 + step3_21;

+    step2_19 = step3_19 + step3_20;

+    step2_20 = step3_19 - step3_20;

+    step2_21 = step3_18 - step3_21;

+    step2_22 = step3_17 - step3_22;

+    step2_23 = step3_16 - step3_23;

+    step2_24 = step3_31 - step3_24;

+    step2_25 = step3_30 - step3_25;

+    step2_26 = step3_29 - step3_26;

+    step2_27 = step3_28 - step3_27;

+    step2_28 = step3_28 + step3_27;

+    step2_29 = step3_29 + step3_26;

+    step2_30 = step3_30 + step3_25;

+    step2_31 = step3_31 + step3_24;

+    __asm__ __volatile__ (

+        "lh       %[load1],             0(%[input])                     \n\t"

+        "lh       %[load2],             32(%[input])                    \n\t"

+        "lh       %[load3],             16(%[input])                    \n\t"

+        "lh       %[load4],             48(%[input])                    \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

+        "mthi     $zero,                $ac2                            \n\t"

+        "add      %[result1],           %[load1],       %[load2]        \n\t"

+        "sub      %[result2],           %[load1],       %[load2]        \n\t"

+        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"

+        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"

+        "extp     %[temp0],             $ac1,           31              \n\t"

+        "extp     %[temp1],             $ac2,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"

+        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"

+        "extp     %[temp2],             $ac3,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"

+        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"

+        "extp     %[temp3],             $ac1,           31              \n\t"

+        "add      %[step1_0],           %[temp0],       %[temp3]        \n\t"

+        "add      %[step1_1],           %[temp1],       %[temp2]        \n\t"

+        "sub      %[step1_2],           %[temp1],       %[temp2]        \n\t"

+        "sub      %[step1_3],           %[temp0],       %[temp3]        \n\t"

+        : [load1] "=&r" (load1), [load2] "=&r" (load2),

+          [load3] "=&r" (load3), [load4] "=&r" (load4),

+          [result1] "=&r" (result1), [result2] "=&r" (result2),

+          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

+          [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),

+          [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)

+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),

+          [cospi_16_64] "r" (cospi_16_64)

+    );

+    __asm__ __volatile__ (

+        "lh       %[load1],             8(%[input])                     \n\t"

+        "lh       %[load2],             56(%[input])                    \n\t"

+        "lh       %[load3],             40(%[input])                    \n\t"

+        "lh       %[load4],             24(%[input])                    \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"

+        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"

+        "extp     %[temp0],             $ac1,           31              \n\t"

+        "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"

+        "extp     %[temp3],             $ac3,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

+        "mthi     $zero,                $ac2                            \n\t"

+        "madd     $ac2,                 %[load3],       %[cospi_12_64]  \n\t"

+        "msub     $ac2,                 %[load4],       %[cospi_20_64]  \n\t"

+        "extp     %[temp1],             $ac2,           31              \n\t"

+        "madd     $ac1,                 %[load3],       %[cospi_20_64]  \n\t"

+        "madd     $ac1,                 %[load4],       %[cospi_12_64]  \n\t"

+        "extp     %[temp2],             $ac1,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "sub      %[load1],             %[temp3],       %[temp2]        \n\t"

+        "sub      %[load1],             %[load1],       %[temp0]        \n\t"

+        "add      %[load1],             %[load1],       %[temp1]        \n\t"

+        "sub      %[load2],             %[temp0],       %[temp1]        \n\t"

+        "sub      %[load2],             %[load2],       %[temp2]        \n\t"

+        "add      %[load2],             %[load2],       %[temp3]        \n\t"

+        "madd     $ac1,                 %[load1],       %[cospi_16_64]  \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_16_64]  \n\t"

+        "extp     %[step1_5],           $ac1,           31              \n\t"

+        "extp     %[step1_6],           $ac3,           31              \n\t"

+        "add      %[step1_4],           %[temp0],       %[temp1]        \n\t"

+        "add      %[step1_7],           %[temp3],       %[temp2]        \n\t"

+        : [load1] "=&r" (load1), [load2] "=&r" (load2),

+          [load3] "=&r" (load3), [load4] "=&r" (load4),

+          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

+          [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),

+          [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)

+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

+          [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),

+          [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),

+          [cospi_16_64] "r" (cospi_16_64)

+    );

+    step2_0 = step1_0 + step1_7;

+    step2_1 = step1_1 + step1_6;

+    step2_2 = step1_2 + step1_5;

+    step2_3 = step1_3 + step1_4;

+    step2_4 = step1_3 - step1_4;

+    step2_5 = step1_2 - step1_5;

+    step2_6 = step1_1 - step1_6;

+    step2_7 = step1_0 - step1_7;

+    // stage 7

+    step1_0 = step2_0 + step3_15;

+    step1_1 = step2_1 + step3_14;

+    step1_2 = step2_2 + step3_13;

+    step1_3 = step2_3 + step3_12;

+    step1_4 = step2_4 + step3_11;

+    step1_5 = step2_5 + step3_10;

+    step1_6 = step2_6 + step3_9;

+    step1_7 = step2_7 + step3_8;

+    step1_8 = step2_7 - step3_8;

+    step1_9 = step2_6 - step3_9;

+    step1_10 = step2_5 - step3_10;

+    step1_11 = step2_4 - step3_11;

+    step1_12 = step2_3 - step3_12;

+    step1_13 = step2_2 - step3_13;

+    step1_14 = step2_1 - step3_14;

+    step1_15 = step2_0 - step3_15;

+    __asm__ __volatile__ (

+        "sub      %[temp0],             %[step2_27],    %[step2_20]     \n\t"

+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

+        "mthi     $zero,                $ac0                            \n\t"

+        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"

+        "extp     %[step1_20],          $ac0,           31              \n\t"

+        : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20)

+        : [const_2_power_13] "r" (const_2_power_13), [step2_20] "r" (step2_20),

+          [step2_27] "r" (step2_27), [cospi_16_64] "r" (cospi_16_64)

+    );

+    temp21 = (step2_20 + step2_27) * cospi_16_64;

+    step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

+    __asm__ __volatile__ (

+        "sub      %[temp0],             %[step2_26],    %[step2_21]     \n\t"

+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

+        "mthi     $zero,                $ac0                            \n\t"

+        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"

+        "extp     %[step1_21],          $ac0,           31              \n\t"

+        : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21)

+        : [const_2_power_13] "r" (const_2_power_13), [step2_26] "r" (step2_26),

+          [step2_21] "r" (step2_21), [cospi_16_64] "r" (cospi_16_64)

+    );

+    temp21 = (step2_21 + step2_26) * cospi_16_64;

+    step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

+    __asm__ __volatile__ (

+        "sub      %[temp0],             %[step2_25],    %[step2_22]     \n\t"

+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

+        "mthi     $zero,                $ac0                            \n\t"

+        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"

+        "extp     %[step1_22],          $ac0,           31              \n\t"

+        : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22)

+        : [const_2_power_13] "r" (const_2_power_13), [step2_25] "r" (step2_25),

+          [step2_22] "r" (step2_22), [cospi_16_64] "r" (cospi_16_64)

+    );

+    temp21 = (step2_22 + step2_25) * cospi_16_64;

+    step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

+    __asm__ __volatile__ (

+        "sub      %[temp0],             %[step2_24],    %[step2_23]     \n\t"

+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

+        "mthi     $zero,                $ac0                            \n\t"

+        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"

+        "extp     %[step1_23],          $ac0,           31              \n\t"

+        : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23)

+        : [const_2_power_13] "r" (const_2_power_13), [step2_24] "r" (step2_24),

+          [step2_23] "r" (step2_23), [cospi_16_64] "r" (cospi_16_64)

+    );

+    temp21 = (step2_23 + step2_24) * cospi_16_64;

+    step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

+    __asm__ __volatile__ (

+        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"

+        "add      %[temp0],         %[step1_0],         %[step2_31]     \n\t"

+        "addi     %[temp0],         %[temp0],           32              \n\t"

+        "sra      %[temp0],         %[temp0],           6               \n\t"

+        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"

+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

+        "add      %[temp1],         %[step1_1],         %[step2_30]     \n\t"

+        "sb       %[temp0],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"

+        "addi     %[temp1],         %[temp1],           32              \n\t"

+        "sra      %[temp1],         %[temp1],           6               \n\t"

+        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"

+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

+        "sb       %[temp1],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"

+        "add      %[temp0],         %[step1_2],         %[step2_29]     \n\t"

+        "addi     %[temp0],         %[temp0],           32              \n\t"

+        "sra      %[temp0],         %[temp0],           6               \n\t"

+        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"

+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

+        "add      %[temp1],         %[step1_3],         %[step2_28]     \n\t"

+        "sb       %[temp0],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"

+        "addi     %[temp1],         %[temp1],           32              \n\t"

+        "sra      %[temp1],         %[temp1],           6               \n\t"

+        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"

+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

+        "sb       %[temp1],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),

+          [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)

+        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),

+          [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),

+          [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),

+          [step2_28] "r" (step2_28), [step2_29] "r" (step2_29),

+          [step2_30] "r" (step2_30), [step2_31] "r" (step2_31)

+    );

+    step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6);

+    step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6);

+    step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6);

+    step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6);

+    __asm__ __volatile__ (

+        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"

+        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"

+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

+        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"

+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

+        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"

+        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"

+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

+        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"

+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

+        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"

+        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"

+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

+        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"

+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

+        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"

+        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"

+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

+        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"

+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

+        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),

+          [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)

+        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),

+          [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),

+          [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)

+    );

+    __asm__ __volatile__ (

+        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"

+        "add      %[temp0],         %[step1_4],         %[step1_27]     \n\t"

+        "addi     %[temp0],         %[temp0],           32              \n\t"

+        "sra      %[temp0],         %[temp0],           6               \n\t"

+        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"

+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

+        "add      %[temp1],         %[step1_5],         %[step1_26]     \n\t"

+        "sb       %[temp0],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"

+        "addi     %[temp1],         %[temp1],           32              \n\t"

+        "sra      %[temp1],         %[temp1],           6               \n\t"

+        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"

+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

+        "sb       %[temp1],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"

+        "add      %[temp0],         %[step1_6],         %[step1_25]     \n\t"

+        "addi     %[temp0],         %[temp0],           32              \n\t"

+        "sra      %[temp0],         %[temp0],           6               \n\t"

+        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"

+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

+        "add      %[temp1],         %[step1_7],         %[step1_24]     \n\t"

+        "sb       %[temp0],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"

+        "addi     %[temp1],         %[temp1],           32              \n\t"

+        "sra      %[temp1],         %[temp1],           6               \n\t"

+        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"

+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

+        "sb       %[temp1],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),

+          [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)

+        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),

+          [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),

+          [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),

+          [step1_24] "r" (step1_24), [step1_25] "r" (step1_25),

+          [step1_26] "r" (step1_26), [step1_27] "r" (step1_27)

+    );

+    step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6);

+    step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6);

+    step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6);

+    step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6);

+    __asm__ __volatile__ (

+        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"

+        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"

+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

+        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"

+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

+        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"

+        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"

+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

+        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"

+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

+        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"

+        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"

+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

+        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"

+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

+        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"

+        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"

+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

+        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"

+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

+        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),

+          [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)

+        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),

+          [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),

+          [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)

+    );

+    __asm__ __volatile__ (

+        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"

+        "add      %[temp0],         %[step1_8],         %[step1_23]     \n\t"

+        "addi     %[temp0],         %[temp0],           32              \n\t"

+        "sra      %[temp0],         %[temp0],           6               \n\t"

+        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"

+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

+        "add      %[temp1],         %[step1_9],         %[step1_22]     \n\t"

+        "sb       %[temp0],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"

+        "addi     %[temp1],         %[temp1],           32              \n\t"

+        "sra      %[temp1],         %[temp1],           6               \n\t"

+        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"

+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

+        "sb       %[temp1],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"

+        "add      %[temp0],         %[step1_10],        %[step1_21]     \n\t"

+        "addi     %[temp0],         %[temp0],           32              \n\t"

+        "sra      %[temp0],         %[temp0],           6               \n\t"

+        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"

+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

+        "add      %[temp1],         %[step1_11],        %[step1_20]     \n\t"

+        "sb       %[temp0],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"

+        "addi     %[temp1],         %[temp1],           32              \n\t"

+        "sra      %[temp1],         %[temp1],           6               \n\t"

+        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"

+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

+        "sb       %[temp1],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),

+          [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)

+        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),

+          [step1_8] "r" (step1_8), [step1_9] "r" (step1_9),

+          [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),

+          [step1_20] "r" (step1_20), [step1_21] "r" (step1_21),

+          [step1_22] "r" (step1_22), [step1_23] "r" (step1_23)

+    );

+    step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6);

+    step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6);

+    step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6);

+    step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6);

+    __asm__ __volatile__ (

+        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"

+        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"

+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

+        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"

+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

+        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"

+        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"

+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

+        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"

+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

+        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"

+        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"

+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

+        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"

+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

+        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"

+        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"

+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

+        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"

+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

+        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),

+          [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)

+        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),

+          [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),

+          [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)

+    );

+    __asm__ __volatile__ (

+        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"

+        "add      %[temp0],         %[step1_12],        %[step2_19]     \n\t"

+        "addi     %[temp0],         %[temp0],           32              \n\t"

+        "sra      %[temp0],         %[temp0],           6               \n\t"

+        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"

+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

+        "add      %[temp1],         %[step1_13],        %[step2_18]     \n\t"

+        "sb       %[temp0],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"

+        "addi     %[temp1],         %[temp1],           32              \n\t"

+        "sra      %[temp1],         %[temp1],           6               \n\t"

+        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"

+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

+        "sb       %[temp1],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"

+        "add      %[temp0],         %[step1_14],        %[step2_17]     \n\t"

+        "addi     %[temp0],         %[temp0],           32              \n\t"

+        "sra      %[temp0],         %[temp0],           6               \n\t"

+        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"

+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

+        "add      %[temp1],         %[step1_15],        %[step2_16]     \n\t"

+        "sb       %[temp0],         0(%[dest_pix])                      \n\t"

+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"

+        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"

+        "addi     %[temp1],         %[temp1],           32              \n\t"

+        "sra      %[temp1],         %[temp1],           6               \n\t"

+        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"

+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

+        "sb       %[temp1],         0(%[dest_pix])                      \n\t"

+        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),

+          [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)

+        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),

+          [step1_12] "r" (step1_12), [step1_13] "r" (step1_13),

+          [step1_14] "r" (step1_14), [step1_15] "r" (step1_15),

+          [step2_16] "r" (step2_16), [step2_17] "r" (step2_17),

+          [step2_18] "r" (step2_18), [step2_19] "r" (step2_19)

+    );

+    step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6);

+    step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6);

+    step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6);

+    step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6);

+    __asm__ __volatile__ (

+        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"

+        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"

+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

+        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"

+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

+        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"

+        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"

+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

+        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"

+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

+        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"

+        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"

+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"

+        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"

+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"

+        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"

+        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"

+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"

+        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"

+        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),

+          [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)

+        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),

+          [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),

+          [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)

+    );

+    input += 32;

+  }

+}

+#endif  // #if HAVE_DSPR2

--- /dev/null

+++ b/vpx_dsp/mips/itrans32_dspr2.c

@@ -1,0 +1,1073 @@

+/*

+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include <assert.h>

+#include <stdio.h>

+#include "./vpx_config.h"

+#include "vpx_dsp/mips/inv_txfm_dspr2.h"

+#include "vpx_dsp/txfm_common.h"

+#if HAVE_DSPR2

+static void idct32_rows_dspr2(const int16_t *input, int16_t *output,

+                              uint32_t no_rows) {

+  int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;

+  int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;

+  int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;

+  int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27;

+  int16_t step1_28, step1_29, step1_30, step1_31;

+  int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;

+  int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;

+  int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;

+  int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;

+  int16_t step2_28, step2_29, step2_30, step2_31;

+  int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;

+  int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;

+  int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28;

+  int16_t step3_29, step3_30, step3_31;

+  int temp0, temp1, temp2, temp3;

+  int load1, load2, load3, load4;

+  int result1, result2;

+  int temp21;

+  int i;

+  const int const_2_power_13 = 8192;

+  const int32_t *input_int;

+  for (i = no_rows; i--; ) {

+    input_int = (const int32_t *)input;

+    if (!(input_int[0]  | input_int[1]  | input_int[2]  | input_int[3]  |

+          input_int[4]  | input_int[5]  | input_int[6]  | input_int[7]  |

+          input_int[8]  | input_int[9]  | input_int[10] | input_int[11] |

+          input_int[12] | input_int[13] | input_int[14] | input_int[15])) {

+      input += 32;

+      __asm__ __volatile__ (

+          "sh     $zero,     0(%[output])     \n\t"

+          "sh     $zero,    64(%[output])     \n\t"

+          "sh     $zero,   128(%[output])     \n\t"

+          "sh     $zero,   192(%[output])     \n\t"

+          "sh     $zero,   256(%[output])     \n\t"

+          "sh     $zero,   320(%[output])     \n\t"

+          "sh     $zero,   384(%[output])     \n\t"

+          "sh     $zero,   448(%[output])     \n\t"

+          "sh     $zero,   512(%[output])     \n\t"

+          "sh     $zero,   576(%[output])     \n\t"

+          "sh     $zero,   640(%[output])     \n\t"

+          "sh     $zero,   704(%[output])     \n\t"

+          "sh     $zero,   768(%[output])     \n\t"

+          "sh     $zero,   832(%[output])     \n\t"

+          "sh     $zero,   896(%[output])     \n\t"

+          "sh     $zero,   960(%[output])     \n\t"

+          "sh     $zero,  1024(%[output])     \n\t"

+          "sh     $zero,  1088(%[output])     \n\t"

+          "sh     $zero,  1152(%[output])     \n\t"

+          "sh     $zero,  1216(%[output])     \n\t"

+          "sh     $zero,  1280(%[output])     \n\t"

+          "sh     $zero,  1344(%[output])     \n\t"

+          "sh     $zero,  1408(%[output])     \n\t"

+          "sh     $zero,  1472(%[output])     \n\t"

+          "sh     $zero,  1536(%[output])     \n\t"

+          "sh     $zero,  1600(%[output])     \n\t"

+          "sh     $zero,  1664(%[output])     \n\t"

+          "sh     $zero,  1728(%[output])     \n\t"

+          "sh     $zero,  1792(%[output])     \n\t"

+          "sh     $zero,  1856(%[output])     \n\t"

+          "sh     $zero,  1920(%[output])     \n\t"

+          "sh     $zero,  1984(%[output])     \n\t"

+          :

+          : [output] "r" (output)

+      );

+      output += 1;

+      continue;

+    }

+    /* prefetch the next row of 32 coefficients (both 16-element halves) */

+    prefetch_load((const uint8_t *)(input + 32));

+    prefetch_load((const uint8_t *)(input + 48));

+    __asm__ __volatile__ (

+        "lh       %[load1],             2(%[input])                     \n\t"

+        "lh       %[load2],             62(%[input])                    \n\t"

+        "lh       %[load3],             34(%[input])                    \n\t"

+        "lh       %[load4],             30(%[input])                    \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "madd     $ac1,                 %[load1],       %[cospi_31_64]  \n\t"

+        "msub     $ac1,                 %[load2],       %[cospi_1_64]   \n\t"

+        "extp     %[temp0],             $ac1,           31              \n\t"

+        "madd     $ac3,                 %[load1],       %[cospi_1_64]   \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_31_64]  \n\t"

+        "extp     %[temp3],             $ac3,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

+        "mthi     $zero,                $ac2                            \n\t"

+        "madd     $ac2,                 %[load3],       %[cospi_15_64]  \n\t"

+        "msub     $ac2,                 %[load4],       %[cospi_17_64]  \n\t"

+        "extp     %[temp1],             $ac2,           31              \n\t"

+        "madd     $ac1,                 %[load3],       %[cospi_17_64]  \n\t"

+        "madd     $ac1,                 %[load4],       %[cospi_15_64]  \n\t"

+        "extp     %[temp2],             $ac1,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "sub      %[load1],             %[temp3],       %[temp2]        \n\t"

+        "sub      %[load2],             %[temp0],       %[temp1]        \n\t"

+        "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"

+        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"

+        "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"

+        "extp     %[step1_17],          $ac1,           31              \n\t"

+        "extp     %[step1_30],          $ac3,           31              \n\t"

+        "add      %[step1_16],          %[temp0],       %[temp1]        \n\t"

+        "add      %[step1_31],          %[temp2],       %[temp3]        \n\t"

+        : [load1] "=&r" (load1), [load2] "=&r" (load2),

+          [load3] "=&r" (load3), [load4] "=&r" (load4),

+          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

+          [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17),

+          [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31)

+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

+          [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64),

+          [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64),

+          [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64)

+    );

+    __asm__ __volatile__ (

+        "lh       %[load1],             18(%[input])                    \n\t"

+        "lh       %[load2],             46(%[input])                    \n\t"

+        "lh       %[load3],             50(%[input])                    \n\t"

+        "lh       %[load4],             14(%[input])                    \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "madd     $ac1,                 %[load1],       %[cospi_23_64]  \n\t"

+        "msub     $ac1,                 %[load2],       %[cospi_9_64]   \n\t"

+        "extp     %[temp0],             $ac1,           31              \n\t"

+        "madd     $ac3,                 %[load1],       %[cospi_9_64]   \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_23_64]  \n\t"

+        "extp     %[temp3],             $ac3,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

+        "mthi     $zero,                $ac2                            \n\t"

+        "madd     $ac2,                 %[load3],       %[cospi_7_64]   \n\t"

+        "msub     $ac2,                 %[load4],       %[cospi_25_64]  \n\t"

+        "extp     %[temp1],             $ac2,           31              \n\t"

+        "madd     $ac1,                 %[load3],       %[cospi_25_64]  \n\t"

+        "madd     $ac1,                 %[load4],       %[cospi_7_64]   \n\t"

+        "extp     %[temp2],             $ac1,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"

+        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"

+        "msub     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"

+        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"

+        "msub     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"

+        "extp     %[step1_18],          $ac1,           31              \n\t"

+        "extp     %[step1_29],          $ac3,           31              \n\t"

+        "add      %[step1_19],          %[temp0],       %[temp1]        \n\t"

+        "add      %[step1_28],          %[temp2],       %[temp3]        \n\t"

+        : [load1] "=&r" (load1), [load2] "=&r" (load2),

+          [load3] "=&r" (load3), [load4] "=&r" (load4),

+          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

+          [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19),

+          [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29)

+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

+          [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64),

+          [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64),

+          [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64)

+    );

+    __asm__ __volatile__ (

+        "lh       %[load1],             10(%[input])                    \n\t"

+        "lh       %[load2],             54(%[input])                    \n\t"

+        "lh       %[load3],             42(%[input])                    \n\t"

+        "lh       %[load4],             22(%[input])                    \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "madd     $ac1,                 %[load1],       %[cospi_27_64]  \n\t"

+        "msub     $ac1,                 %[load2],       %[cospi_5_64]   \n\t"

+        "extp     %[temp0],             $ac1,           31              \n\t"

+        "madd     $ac3,                 %[load1],       %[cospi_5_64]   \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_27_64]  \n\t"

+        "extp     %[temp3],             $ac3,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

+        "mthi     $zero,                $ac2                            \n\t"

+        "madd     $ac2,                 %[load3],       %[cospi_11_64]  \n\t"

+        "msub     $ac2,                 %[load4],       %[cospi_21_64]  \n\t"

+        "extp     %[temp1],             $ac2,           31              \n\t"

+        "madd     $ac1,                 %[load3],       %[cospi_21_64]  \n\t"

+        "madd     $ac1,                 %[load4],       %[cospi_11_64]  \n\t"

+        "extp     %[temp2],             $ac1,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "sub      %[load1],             %[temp0],       %[temp1]        \n\t"

+        "sub      %[load2],             %[temp3],       %[temp2]        \n\t"

+        "madd     $ac1,                 %[load2],       %[cospi_12_64]  \n\t"

+        "msub     $ac1,                 %[load1],       %[cospi_20_64]  \n\t"

+        "madd     $ac3,                 %[load1],       %[cospi_12_64]  \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_20_64]  \n\t"

+        "extp     %[step1_21],          $ac1,           31              \n\t"

+        "extp     %[step1_26],          $ac3,           31              \n\t"

+        "add      %[step1_20],          %[temp0],       %[temp1]        \n\t"

+        "add      %[step1_27],          %[temp2],       %[temp3]        \n\t"

+        : [load1] "=&r" (load1), [load2] "=&r" (load2),

+          [load3] "=&r" (load3), [load4] "=&r" (load4),

+          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

+          [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21),

+          [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27)

+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

+          [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64),

+          [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64),

+          [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)

+    );

+    __asm__ __volatile__ (

+        "lh       %[load1],             26(%[input])                    \n\t"

+        "lh       %[load2],             38(%[input])                    \n\t"

+        "lh       %[load3],             58(%[input])                    \n\t"

+        "lh       %[load4],              6(%[input])                    \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "madd     $ac1,                 %[load1],       %[cospi_19_64]  \n\t"

+        "msub     $ac1,                 %[load2],       %[cospi_13_64]  \n\t"

+        "extp     %[temp0],             $ac1,           31              \n\t"

+        "madd     $ac3,                 %[load1],       %[cospi_13_64]  \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_19_64]  \n\t"

+        "extp     %[temp3],             $ac3,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

+        "mthi     $zero,                $ac2                            \n\t"

+        "madd     $ac2,                 %[load3],       %[cospi_3_64]   \n\t"

+        "msub     $ac2,                 %[load4],       %[cospi_29_64]  \n\t"

+        "extp     %[temp1],             $ac2,           31              \n\t"

+        "madd     $ac1,                 %[load3],       %[cospi_29_64]  \n\t"

+        "madd     $ac1,                 %[load4],       %[cospi_3_64]   \n\t"

+        "extp     %[temp2],             $ac1,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"

+        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"

+        "msub     $ac1,                 %[load1],       %[cospi_12_64]  \n\t"

+        "msub     $ac1,                 %[load2],       %[cospi_20_64]  \n\t"

+        "msub     $ac3,                 %[load1],       %[cospi_20_64]  \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_12_64]  \n\t"

+        "extp     %[step1_22],          $ac1,           31              \n\t"

+        "extp     %[step1_25],          $ac3,           31              \n\t"

+        "add      %[step1_23],          %[temp0],       %[temp1]        \n\t"

+        "add      %[step1_24],          %[temp2],       %[temp3]        \n\t"

+        : [load1] "=&r" (load1), [load2] "=&r" (load2),

+          [load3] "=&r" (load3), [load4] "=&r" (load4),

+          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

+          [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23),

+          [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25)

+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

+          [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64),

+          [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64),

+          [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)

+    );

+    __asm__ __volatile__ (

+        "lh       %[load1],              4(%[input])                    \n\t"

+        "lh       %[load2],             60(%[input])                    \n\t"

+        "lh       %[load3],             36(%[input])                    \n\t"

+        "lh       %[load4],             28(%[input])                    \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "madd     $ac1,                 %[load1],       %[cospi_30_64]  \n\t"

+        "msub     $ac1,                 %[load2],       %[cospi_2_64]   \n\t"

+        "extp     %[temp0],             $ac1,           31              \n\t"

+        "madd     $ac3,                 %[load1],       %[cospi_2_64]   \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_30_64]  \n\t"

+        "extp     %[temp3],             $ac3,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

+        "mthi     $zero,                $ac2                            \n\t"

+        "madd     $ac2,                 %[load3],       %[cospi_14_64]  \n\t"

+        "msub     $ac2,                 %[load4],       %[cospi_18_64]  \n\t"

+        "extp     %[temp1],             $ac2,           31              \n\t"

+        "madd     $ac1,                 %[load3],       %[cospi_18_64]  \n\t"

+        "madd     $ac1,                 %[load4],       %[cospi_14_64]  \n\t"

+        "extp     %[temp2],             $ac1,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "sub      %[load1],             %[temp0],       %[temp1]        \n\t"

+        "sub      %[load2],             %[temp3],       %[temp2]        \n\t"

+        "msub     $ac1,                 %[load1],       %[cospi_8_64]   \n\t"

+        "madd     $ac1,                 %[load2],       %[cospi_24_64]  \n\t"

+        "madd     $ac3,                 %[load1],       %[cospi_24_64]  \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_8_64]   \n\t"

+        "extp     %[step2_9],           $ac1,           31              \n\t"

+        "extp     %[step2_14],          $ac3,           31              \n\t"

+        "add      %[step2_8],           %[temp0],       %[temp1]        \n\t"

+        "add      %[step2_15],          %[temp2],       %[temp3]        \n\t"

+        : [load1] "=&r" (load1), [load2] "=&r" (load2),

+          [load3] "=&r" (load3), [load4] "=&r" (load4),

+          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

+          [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9),

+          [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15)

+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

+          [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),

+          [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),

+          [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)

+    );

+    __asm__ __volatile__ (

+        "lh       %[load1],             20(%[input])                    \n\t"

+        "lh       %[load2],             44(%[input])                    \n\t"

+        "lh       %[load3],             52(%[input])                    \n\t"

+        "lh       %[load4],             12(%[input])                    \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "madd     $ac1,                 %[load1],       %[cospi_22_64]  \n\t"

+        "msub     $ac1,                 %[load2],       %[cospi_10_64]  \n\t"

+        "extp     %[temp0],             $ac1,           31              \n\t"

+        "madd     $ac3,                 %[load1],       %[cospi_10_64]  \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_22_64]  \n\t"

+        "extp     %[temp3],             $ac3,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

+        "mthi     $zero,                $ac2                            \n\t"

+        "madd     $ac2,                 %[load3],       %[cospi_6_64]   \n\t"

+        "msub     $ac2,                 %[load4],       %[cospi_26_64]  \n\t"

+        "extp     %[temp1],             $ac2,           31              \n\t"

+        "madd     $ac1,                 %[load3],       %[cospi_26_64]  \n\t"

+        "madd     $ac1,                 %[load4],       %[cospi_6_64]   \n\t"

+        "extp     %[temp2],             $ac1,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"

+        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"

+        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"

+        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"

+        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"

+        "extp     %[step2_10],          $ac1,           31              \n\t"

+        "extp     %[step2_13],          $ac3,           31              \n\t"

+        "add      %[step2_11],          %[temp0],       %[temp1]        \n\t"

+        "add      %[step2_12],          %[temp2],       %[temp3]        \n\t"

+        : [load1] "=&r" (load1), [load2] "=&r" (load2),

+          [load3] "=&r" (load3), [load4] "=&r" (load4),

+          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

+          [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),

+          [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)

+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

+          [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),

+          [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),

+          [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)

+    );

+    __asm__ __volatile__ (

+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

+        "mthi     $zero,                $ac0                            \n\t"

+        "sub      %[temp0],             %[step2_14],    %[step2_13]     \n\t"

+        "sub      %[temp0],             %[temp0],       %[step2_9]      \n\t"

+        "add      %[temp0],             %[temp0],       %[step2_10]     \n\t"

+        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "sub      %[temp1],             %[step2_14],    %[step2_13]     \n\t"

+        "add      %[temp1],             %[temp1],       %[step2_9]      \n\t"

+        "sub      %[temp1],             %[temp1],       %[step2_10]     \n\t"

+        "madd     $ac1,                 %[temp1],       %[cospi_16_64]  \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

+        "mthi     $zero,                $ac2                            \n\t"

+        "sub      %[temp0],             %[step2_15],    %[step2_12]     \n\t"

+        "sub      %[temp0],             %[temp0],       %[step2_8]      \n\t"

+        "add      %[temp0],             %[temp0],       %[step2_11]     \n\t"

+        "madd     $ac2,                 %[temp0],       %[cospi_16_64]  \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "sub      %[temp1],             %[step2_15],    %[step2_12]     \n\t"

+        "add      %[temp1],             %[temp1],       %[step2_8]      \n\t"

+        "sub      %[temp1],             %[temp1],       %[step2_11]     \n\t"

+        "madd     $ac3,                 %[temp1],       %[cospi_16_64]  \n\t"

+        "add      %[step3_8],           %[step2_8],     %[step2_11]     \n\t"

+        "add      %[step3_9],           %[step2_9],     %[step2_10]     \n\t"

+        "add      %[step3_14],          %[step2_13],    %[step2_14]     \n\t"

+        "add      %[step3_15],          %[step2_12],    %[step2_15]     \n\t"

+        "extp     %[step3_10],          $ac0,           31              \n\t"

+        "extp     %[step3_13],          $ac1,           31              \n\t"

+        "extp     %[step3_11],          $ac2,           31              \n\t"

+        "extp     %[step3_12],          $ac3,           31              \n\t"

+        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

+          [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9),

+          [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11),

+          [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13),

+          [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15)

+        : [const_2_power_13] "r" (const_2_power_13),

+          [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),

+          [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),

+          [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),

+          [step2_14] "r" (step2_14), [step2_15] "r" (step2_15),

+          [cospi_16_64] "r" (cospi_16_64)

+    );

+    step2_18 = step1_17 - step1_18;

+    step2_29 = step1_30 - step1_29;

+    __asm__ __volatile__ (

+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

+        "mthi     $zero,                $ac0                            \n\t"

+        "msub     $ac0,                 %[step2_18],    %[cospi_8_64]   \n\t"

+        "madd     $ac0,                 %[step2_29],    %[cospi_24_64]  \n\t"

+        "extp     %[step3_18],          $ac0,           31              \n\t"

+        : [step3_18] "=r" (step3_18)

+        : [const_2_power_13] "r" (const_2_power_13),

+          [step2_18] "r" (step2_18), [step2_29] "r" (step2_29),

+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

+    );

+    temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;

+    step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

+    step2_19 = step1_16 - step1_19;

+    step2_28 = step1_31 - step1_28;

+    __asm__ __volatile__ (

+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

+        "mthi     $zero,                $ac0                            \n\t"

+        "msub     $ac0,                 %[step2_19],    %[cospi_8_64]   \n\t"

+        "madd     $ac0,                 %[step2_28],    %[cospi_24_64]  \n\t"

+        "extp     %[step3_19],          $ac0,           31              \n\t"

+        : [step3_19] "=r" (step3_19)

+        : [const_2_power_13] "r" (const_2_power_13),

+          [step2_19] "r" (step2_19), [step2_28] "r" (step2_28),

+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

+    );

+    temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;

+    step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

+    step3_16 = step1_16 + step1_19;

+    step3_17 = step1_17 + step1_18;

+    step3_30 = step1_29 + step1_30;

+    step3_31 = step1_28 + step1_31;

+    step2_20 = step1_23 - step1_20;

+    step2_27 = step1_24 - step1_27;

+    __asm__ __volatile__ (

+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

+        "mthi     $zero,                $ac0                            \n\t"

+        "msub     $ac0,                 %[step2_20],    %[cospi_24_64]  \n\t"

+        "msub     $ac0,                 %[step2_27],    %[cospi_8_64]   \n\t"

+        "extp     %[step3_20],          $ac0,           31              \n\t"

+        : [step3_20] "=r" (step3_20)

+        : [const_2_power_13] "r" (const_2_power_13),

+          [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),

+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

+    );

+    temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;

+    step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

+    step2_21 = step1_22 - step1_21;

+    step2_26 = step1_25 - step1_26;

+    __asm__ __volatile__ (

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "msub     $ac1,                 %[step2_21],    %[cospi_24_64]  \n\t"

+        "msub     $ac1,                 %[step2_26],    %[cospi_8_64]   \n\t"

+        "extp     %[step3_21],          $ac1,           31              \n\t"

+        : [step3_21] "=r" (step3_21)

+        : [const_2_power_13] "r" (const_2_power_13),

+          [step2_21] "r" (step2_21), [step2_26] "r" (step2_26),

+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

+    );

+    temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;

+    step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

+    step3_22 = step1_21 + step1_22;

+    step3_23 = step1_20 + step1_23;

+    step3_24 = step1_24 + step1_27;

+    step3_25 = step1_25 + step1_26;

+    step2_16 = step3_16 + step3_23;

+    step2_17 = step3_17 + step3_22;

+    step2_18 = step3_18 + step3_21;

+    step2_19 = step3_19 + step3_20;

+    step2_20 = step3_19 - step3_20;

+    step2_21 = step3_18 - step3_21;

+    step2_22 = step3_17 - step3_22;

+    step2_23 = step3_16 - step3_23;

+    step2_24 = step3_31 - step3_24;

+    step2_25 = step3_30 - step3_25;

+    step2_26 = step3_29 - step3_26;

+    step2_27 = step3_28 - step3_27;

+    step2_28 = step3_28 + step3_27;

+    step2_29 = step3_29 + step3_26;

+    step2_30 = step3_30 + step3_25;

+    step2_31 = step3_31 + step3_24;

+    __asm__ __volatile__ (

+        "lh       %[load1],             0(%[input])                     \n\t"

+        "lh       %[load2],             32(%[input])                    \n\t"

+        "lh       %[load3],             16(%[input])                    \n\t"

+        "lh       %[load4],             48(%[input])                    \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

+        "mthi     $zero,                $ac2                            \n\t"

+        "add      %[result1],           %[load1],       %[load2]        \n\t"

+        "sub      %[result2],           %[load1],       %[load2]        \n\t"

+        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"

+        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"

+        "extp     %[temp0],             $ac1,           31              \n\t"

+        "extp     %[temp1],             $ac2,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"

+        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"

+        "extp     %[temp2],             $ac3,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"

+        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"

+        "extp     %[temp3],             $ac1,           31              \n\t"

+        "add      %[step1_0],          %[temp0],        %[temp3]        \n\t"

+        "add      %[step1_1],          %[temp1],        %[temp2]        \n\t"

+        "sub      %[step1_2],          %[temp1],        %[temp2]        \n\t"

+        "sub      %[step1_3],          %[temp0],        %[temp3]        \n\t"

+        : [load1] "=&r" (load1), [load2] "=&r" (load2),

+          [load3] "=&r" (load3), [load4] "=&r" (load4),

+          [result1] "=&r" (result1), [result2] "=&r" (result2),

+          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

+          [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),

+          [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)

+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

+          [cospi_16_64] "r" (cospi_16_64),

+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

+    );

+    __asm__ __volatile__ (

+        "lh       %[load1],             8(%[input])                     \n\t"

+        "lh       %[load2],             56(%[input])                    \n\t"

+        "lh       %[load3],             40(%[input])                    \n\t"

+        "lh       %[load4],             24(%[input])                    \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"

+        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"

+        "extp     %[temp0],             $ac1,           31              \n\t"

+        "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"

+        "extp     %[temp3],             $ac3,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"

+        "mthi     $zero,                $ac2                            \n\t"

+        "madd     $ac2,                 %[load3],       %[cospi_12_64]  \n\t"

+        "msub     $ac2,                 %[load4],       %[cospi_20_64]  \n\t"

+        "extp     %[temp1],             $ac2,           31              \n\t"

+        "madd     $ac1,                 %[load3],       %[cospi_20_64]  \n\t"

+        "madd     $ac1,                 %[load4],       %[cospi_12_64]  \n\t"

+        "extp     %[temp2],             $ac1,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"

+        "mthi     $zero,                $ac3                            \n\t"

+        "sub      %[load1],             %[temp3],       %[temp2]        \n\t"

+        "sub      %[load1],             %[load1],       %[temp0]        \n\t"

+        "add      %[load1],             %[load1],       %[temp1]        \n\t"

+        "sub      %[load2],             %[temp0],       %[temp1]        \n\t"

+        "sub      %[load2],             %[load2],       %[temp2]        \n\t"

+        "add      %[load2],             %[load2],       %[temp3]        \n\t"

+        "madd     $ac1,                 %[load1],       %[cospi_16_64]  \n\t"

+        "madd     $ac3,                 %[load2],       %[cospi_16_64]  \n\t"

+        "extp     %[step1_5],           $ac1,           31              \n\t"

+        "extp     %[step1_6],           $ac3,           31              \n\t"

+        "add      %[step1_4],           %[temp0],       %[temp1]        \n\t"

+        "add      %[step1_7],           %[temp3],       %[temp2]        \n\t"

+        : [load1] "=&r" (load1), [load2] "=&r" (load2),

+          [load3] "=&r" (load3), [load4] "=&r" (load4),

+          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

+          [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),

+          [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)

+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

+          [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),

+          [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),

+          [cospi_16_64] "r" (cospi_16_64)

+    );

+    step2_0 = step1_0 + step1_7;

+    step2_1 = step1_1 + step1_6;

+    step2_2 = step1_2 + step1_5;

+    step2_3 = step1_3 + step1_4;

+    step2_4 = step1_3 - step1_4;

+    step2_5 = step1_2 - step1_5;

+    step2_6 = step1_1 - step1_6;

+    step2_7 = step1_0 - step1_7;

+    step1_0 = step2_0 + step3_15;

+    step1_1 = step2_1 + step3_14;

+    step1_2 = step2_2 + step3_13;

+    step1_3 = step2_3 + step3_12;

+    step1_4 = step2_4 + step3_11;

+    step1_5 = step2_5 + step3_10;

+    step1_6 = step2_6 + step3_9;

+    step1_7 = step2_7 + step3_8;

+    step1_8 = step2_7 - step3_8;

+    step1_9 = step2_6 - step3_9;

+    step1_10 = step2_5 - step3_10;

+    step1_11 = step2_4 - step3_11;

+    step1_12 = step2_3 - step3_12;

+    step1_13 = step2_2 - step3_13;

+    step1_14 = step2_1 - step3_14;

+    step1_15 = step2_0 - step3_15;

+    __asm__ __volatile__ (

+        "sub      %[temp0],             %[step2_27],    %[step2_20]     \n\t"

+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

+        "mthi     $zero,                $ac0                            \n\t"

+        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"

+        "extp     %[step1_20],          $ac0,           31              \n\t"

+        : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20)

+        : [const_2_power_13] "r" (const_2_power_13),

+          [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),

+          [cospi_16_64] "r" (cospi_16_64)

+    );

+    temp21 = (step2_20 + step2_27) * cospi_16_64;

+    step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

+    __asm__ __volatile__ (

+        "sub      %[temp0],             %[step2_26],    %[step2_21]     \n\t"

+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

+        "mthi     $zero,                $ac0                            \n\t"

+        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"

+        "extp     %[step1_21],          $ac0,           31              \n\t"

+        : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21)

+        : [const_2_power_13] "r" (const_2_power_13),

+          [step2_26] "r" (step2_26), [step2_21] "r" (step2_21),

+          [cospi_16_64] "r" (cospi_16_64)

+    );

+    temp21 = (step2_21 + step2_26) * cospi_16_64;

+    step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

+    __asm__ __volatile__ (

+        "sub      %[temp0],             %[step2_25],    %[step2_22]     \n\t"

+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

+        "mthi     $zero,                $ac0                            \n\t"

+        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"

+        "extp     %[step1_22],          $ac0,           31              \n\t"

+        : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22)

+        : [const_2_power_13] "r" (const_2_power_13),

+          [step2_25] "r" (step2_25), [step2_22] "r" (step2_22),

+          [cospi_16_64] "r" (cospi_16_64)

+    );

+    temp21 = (step2_22 + step2_25) * cospi_16_64;

+    step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

+    __asm__ __volatile__ (

+        "sub      %[temp0],             %[step2_24],    %[step2_23]     \n\t"

+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

+        "mthi     $zero,                $ac0                            \n\t"

+        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"

+        "extp     %[step1_23],          $ac0,           31              \n\t"

+        : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23)

+        : [const_2_power_13] "r" (const_2_power_13),

+          [step2_24] "r" (step2_24), [step2_23] "r" (step2_23),

+          [cospi_16_64] "r" (cospi_16_64)

+    );

+    temp21 = (step2_23 + step2_24) * cospi_16_64;

+    step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

+    // final stage

+    output[0 * 32] = step1_0 + step2_31;

+    output[1 * 32] = step1_1 + step2_30;

+    output[2 * 32] = step1_2 + step2_29;

+    output[3 * 32] = step1_3 + step2_28;

+    output[4 * 32] = step1_4 + step1_27;

+    output[5 * 32] = step1_5 + step1_26;

+    output[6 * 32] = step1_6 + step1_25;

+    output[7 * 32] = step1_7 + step1_24;

+    output[8 * 32] = step1_8 + step1_23;

+    output[9 * 32] = step1_9 + step1_22;

+    output[10 * 32] = step1_10 + step1_21;

+    output[11 * 32] = step1_11 + step1_20;

+    output[12 * 32] = step1_12 + step2_19;

+    output[13 * 32] = step1_13 + step2_18;

+    output[14 * 32] = step1_14 + step2_17;

+    output[15 * 32] = step1_15 + step2_16;

+    output[16 * 32] = step1_15 - step2_16;

+    output[17 * 32] = step1_14 - step2_17;

+    output[18 * 32] = step1_13 - step2_18;

+    output[19 * 32] = step1_12 - step2_19;

+    output[20 * 32] = step1_11 - step1_20;

+    output[21 * 32] = step1_10 - step1_21;

+    output[22 * 32] = step1_9 - step1_22;

+    output[23 * 32] = step1_8 - step1_23;

+    output[24 * 32] = step1_7 - step1_24;

+    output[25 * 32] = step1_6 - step1_25;

+    output[26 * 32] = step1_5 - step1_26;

+    output[27 * 32] = step1_4 - step1_27;

+    output[28 * 32] = step1_3 - step2_28;

+    output[29 * 32] = step1_2 - step2_29;

+    output[30 * 32] = step1_1 - step2_30;

+    output[31 * 32] = step1_0 - step2_31;

+    input += 32;

+    output += 1;

+  }

+}

+void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,

+                                  int dest_stride) {  /* 32x32 inverse transform entry point: row pass over all 32 rows into out, then the column pass */

+  DECLARE_ALIGNED(32, int16_t,  out[32 * 32]);  /* intermediate buffer shared by the two passes */

+  int16_t *outptr = out;

+  uint32_t pos = 45;

+  /* bit position for extract from acc: wrdsp with mask 1 writes pos (45) into DSPControl */

+  __asm__ __volatile__ (

+    "wrdsp      %[pos],     1           \n\t"

+    :

+    : [pos] "r" (pos)

+  );

+  // Rows: idct32_rows_dspr2 is asked to process 32 rows of input into out

+  idct32_rows_dspr2(input, outptr, 32);

+  // Columns: out, dest and dest_stride are handed to the column pass

+  vp9_idct32_cols_add_blk_dspr2(out, dest, dest_stride);

+}

+void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,

+                                int stride) {  /* reduced 32x32 inverse transform: only 8 input rows go through the row pass, the rest of out is zero-filled */

+  DECLARE_ALIGNED(32, int16_t,  out[32 * 32]);

+  int16_t *outptr = out;

+  uint32_t i;

+  uint32_t pos = 45;

+  /* bit position for extract from acc: wrdsp with mask 1 writes pos (45) into DSPControl */

+  __asm__ __volatile__ (

+    "wrdsp      %[pos],     1           \n\t"

+    :

+    : [pos] "r" (pos)

+  );

+  // Rows: only 8 rows are requested from the row pass

+  idct32_rows_dspr2(input, outptr, 8);

+  outptr += 8;  /* skip elements 0..7 of the first 32-element line; presumably those were filled by the row pass - TODO confirm */

+  __asm__ __volatile__ (  /* 12 word stores = 48 bytes = 24 int16_t: zero elements 8..31 of the first line */

+      "sw     $zero,      0(%[outptr])     \n\t"

+      "sw     $zero,      4(%[outptr])     \n\t"

+      "sw     $zero,      8(%[outptr])     \n\t"

+      "sw     $zero,     12(%[outptr])     \n\t"

+      "sw     $zero,     16(%[outptr])     \n\t"

+      "sw     $zero,     20(%[outptr])     \n\t"

+      "sw     $zero,     24(%[outptr])     \n\t"

+      "sw     $zero,     28(%[outptr])     \n\t"

+      "sw     $zero,     32(%[outptr])     \n\t"

+      "sw     $zero,     36(%[outptr])     \n\t"

+      "sw     $zero,     40(%[outptr])     \n\t"

+      "sw     $zero,     44(%[outptr])     \n\t"

+      :

+      : [outptr] "r" (outptr)  /* NOTE(review): the stores write out[] but no "memory" clobber or memory operand is declared */

+  );

+  for (i = 0; i < 31; ++i) {  /* same zero-fill for the remaining 31 lines of out */

+    outptr += 32;  /* advance one 32-element line; outptr stays at element 8 of that line */

+    __asm__ __volatile__ (

+        "sw     $zero,      0(%[outptr])     \n\t"

+        "sw     $zero,      4(%[outptr])     \n\t"

+        "sw     $zero,      8(%[outptr])     \n\t"

+        "sw     $zero,     12(%[outptr])     \n\t"

+        "sw     $zero,     16(%[outptr])     \n\t"

+        "sw     $zero,     20(%[outptr])     \n\t"

+        "sw     $zero,     24(%[outptr])     \n\t"

+        "sw     $zero,     28(%[outptr])     \n\t"

+        "sw     $zero,     32(%[outptr])     \n\t"

+        "sw     $zero,     36(%[outptr])     \n\t"

+        "sw     $zero,     40(%[outptr])     \n\t"

+        "sw     $zero,     44(%[outptr])     \n\t"

+        :

+        : [outptr] "r" (outptr)

+    );

+  }

+  // Columns: the partially zeroed out buffer, dest and stride are handed to the column pass

+  vp9_idct32_cols_add_blk_dspr2(out, dest, stride);

+}

+void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,

+                               int stride) {  /* DC-only 32x32 path: derive one value a1 from input[0] and add it, with saturation, to 32 rows of 32 bytes at dest */

+  int       r, out;

+  int32_t   a1, absa1;

+  int32_t   vector_a1;

+  int32_t   t1, t2, t3, t4;

+  int32_t   vector_1, vector_2, vector_3, vector_4;

+  uint32_t  pos = 45;

+  /* bit position for extract from acc: wrdsp with mask 1 writes pos (45) into DSPControl */

+  __asm__ __volatile__ (

+    "wrdsp      %[pos],     1           \n\t"

+    :

+    : [pos] "r" (pos)

+  );

+  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);  /* macro applies the cospi_16_64 multiply + round shift twice to the DC coefficient */

+  __asm__ __volatile__ (

+      "addi     %[out],    %[out],    32      \n\t"  /* out += 32 (rounding term for the shift by 6) */

+      "sra      %[a1],     %[out],    6       \n\t"  /* a1 = out >> 6 (arithmetic) */

+      : [out] "+r" (out), [a1] "=r" (a1)

+      :

+  );

+  if (a1 < 0) {

+    /* use quad-byte: a1 is negative, so |a1| is subtracted from every byte with saturation

+     * input and output memory are four byte aligned */

+    __asm__ __volatile__ (

+        "abs        %[absa1],     %[a1]         \n\t"

+        "replv.qb   %[vector_a1], %[absa1]      \n\t"  /* NOTE(review): replv.qb replicates only the low byte of absa1; confirm |a1| <= 255 for all inputs */

+        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)

+        : [a1] "r" (a1)

+    );

+    for (r = 32; r--;) {  /* 32 rows; each iteration handles 8 words = 32 bytes of one row */

+      __asm__ __volatile__ (

+          "lw             %[t1],          0(%[dest])                      \n\t"

+          "lw             %[t2],          4(%[dest])                      \n\t"

+          "lw             %[t3],          8(%[dest])                      \n\t"

+          "lw             %[t4],          12(%[dest])                     \n\t"

+          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"

+          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"

+          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"

+          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"

+          "sw             %[vector_1],    0(%[dest])                      \n\t"

+          "sw             %[vector_2],    4(%[dest])                      \n\t"

+          "sw             %[vector_3],    8(%[dest])                      \n\t"

+          "sw             %[vector_4],    12(%[dest])                     \n\t"

+          "lw             %[t1],          16(%[dest])                     \n\t"

+          "lw             %[t2],          20(%[dest])                     \n\t"

+          "lw             %[t3],          24(%[dest])                     \n\t"

+          "lw             %[t4],          28(%[dest])                     \n\t"

+          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"

+          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"

+          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"

+          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"

+          "sw             %[vector_1],    16(%[dest])                     \n\t"

+          "sw             %[vector_2],    20(%[dest])                     \n\t"

+          "sw             %[vector_3],    24(%[dest])                     \n\t"

+          "sw             %[vector_4],    28(%[dest])                     \n\t"

+          "add            %[dest],        %[dest],        %[stride]       \n\t"  /* advance dest to the next row */

+          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),

+            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),

+            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),

+            [dest] "+&r" (dest)

+          : [stride] "r" (stride), [vector_a1] "r" (vector_a1)

+      );

+    }

+  } else {

+    /* use quad-byte: a1 is non-negative, so it is added to every byte with saturation

+     * input and output memory are four byte aligned */

+    __asm__ __volatile__ (

+        "replv.qb       %[vector_a1],   %[a1]     \n\t"  /* NOTE(review): only the low byte of a1 is replicated; confirm a1 <= 255 for all inputs */

+        : [vector_a1] "=r" (vector_a1)

+        : [a1] "r" (a1)

+    );

+    for (r = 32; r--;) {  /* 32 rows; each iteration handles 8 words = 32 bytes of one row */

+      __asm__ __volatile__ (

+          "lw             %[t1],          0(%[dest])                      \n\t"

+          "lw             %[t2],          4(%[dest])                      \n\t"

+          "lw             %[t3],          8(%[dest])                      \n\t"

+          "lw             %[t4],          12(%[dest])                     \n\t"

+          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"

+          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"

+          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"

+          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"

+          "sw             %[vector_1],    0(%[dest])                      \n\t"

+          "sw             %[vector_2],    4(%[dest])                      \n\t"

+          "sw             %[vector_3],    8(%[dest])                      \n\t"

+          "sw             %[vector_4],    12(%[dest])                     \n\t"

+          "lw             %[t1],          16(%[dest])                     \n\t"

+          "lw             %[t2],          20(%[dest])                     \n\t"

+          "lw             %[t3],          24(%[dest])                     \n\t"

+          "lw             %[t4],          28(%[dest])                     \n\t"

+          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"

+          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"

+          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"

+          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"

+          "sw             %[vector_1],    16(%[dest])                     \n\t"

+          "sw             %[vector_2],    20(%[dest])                     \n\t"

+          "sw             %[vector_3],    24(%[dest])                     \n\t"

+          "sw             %[vector_4],    28(%[dest])                     \n\t"

+          "add            %[dest],        %[dest],        %[stride]       \n\t"  /* advance dest to the next row */

+          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),

+            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),

+            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),

+            [dest] "+&r" (dest)

+          : [stride] "r" (stride), [vector_a1] "r" (vector_a1)

+      );

+    }

+  }

+}

+#endif  // #if HAVE_DSPR2

--- /dev/null

+++ b/vpx_dsp/mips/itrans4_dspr2.c

@@ -1,0 +1,359 @@

+/*

+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "./vpx_config.h"

+#include "./vpx_dsp_rtcd.h"

+#include "vpx_dsp/mips/inv_txfm_dspr2.h"

+#include "vpx_dsp/txfm_common.h"

+#if HAVE_DSPR2

+void vp9_idct4_rows_dspr2(const int16_t *input, int16_t *output) {  /* row pass of the 4x4 inverse DCT: reads 4 rows of 4 coefficients, stores each row's results transposed into output */

+  int16_t   step_0, step_1, step_2, step_3;  /* NOTE(review): declared int16_t but bound to "r" register outputs written by extp */

+  int       Temp0, Temp1, Temp2, Temp3;

+  const int const_2_power_13 = 8192;  /* rounding term preloaded into the accumulator low word before each madd sequence */

+  int       i;

+  for (i = 4; i--; ) {  /* one iteration per input row */

+    __asm__ __volatile__ (

+        /*

+          temp_1 = (input[0] + input[2]) * cospi_16_64;

+          step_0 = dct_const_round_shift(temp_1);

+          temp_2 = (input[0] - input[2]) * cospi_16_64;

+          step_1 = dct_const_round_shift(temp_2);

+        */

+        "lh       %[Temp0],             0(%[input])                     \n\t"

+        "lh       %[Temp1],             4(%[input])                     \n\t"

+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

+        "mthi     $zero,                $ac0                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"

+        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"

+        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"

+        "lh       %[Temp0],             2(%[input])                     \n\t"

+        "lh       %[Temp1],             6(%[input])                     \n\t"

+        "extp     %[step_0],            $ac0,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

+        "mthi     $zero,                $ac0                            \n\t"

+        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"

+        "extp     %[step_1],            $ac1,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        /*

+          temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;

+          step_2 = dct_const_round_shift(temp1);

+        */

+        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"

+        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"

+        "extp     %[step_2],            $ac0,           31              \n\t"

+        /*

+          temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;

+          step_3 = dct_const_round_shift(temp2);

+        */

+        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"

+        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"

+        "extp     %[step_3],            $ac1,           31              \n\t"

+        /*

+          output[0]  = step_0 + step_3;

+          output[4]  = step_1 + step_2;

+          output[8]  = step_1 - step_2;

+          output[12] = step_0 - step_3;

+        */

+        "add      %[Temp0],             %[step_0],      %[step_3]       \n\t"

+        "sh       %[Temp0],             0(%[output])                    \n\t"

+        "add      %[Temp1],             %[step_1],      %[step_2]       \n\t"

+        "sh       %[Temp1],             8(%[output])                    \n\t"

+        "sub      %[Temp2],             %[step_1],      %[step_2]       \n\t"

+        "sh       %[Temp2],             16(%[output])                   \n\t"

+        "sub      %[Temp3],             %[step_0],      %[step_3]       \n\t"

+        "sh       %[Temp3],             24(%[output])                   \n\t"

+      : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),

+        [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),

+        [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),

+        [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),

+        [output] "+r" (output)  /* NOTE(review): the sh stores write through output, but no "memory" clobber is declared */

+      : [const_2_power_13] "r" (const_2_power_13),

+        [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),

+        [cospi_24_64] "r" (cospi_24_64),

+        [input] "r" (input)

+    );

+    input += 4;  /* next input row */

+    output += 1;  /* next output column: the byte offsets 0/8/16/24 above are 4 int16_t apart */

+  }

+}

+void vp9_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,

+                                     int dest_stride) {  /* second 4x4 pass: transforms 4 groups of 4 coefficients and adds the rounded results into dest through the cm lookup */

+  int16_t   step_0, step_1, step_2, step_3;  /* NOTE(review): declared int16_t but bound to "r" register outputs written by extp */

+  int       Temp0, Temp1, Temp2, Temp3;

+  const int const_2_power_13 = 8192;  /* rounding term preloaded into the accumulator low word before each madd sequence */

+  int       i;

+  uint8_t   *dest_pix;

+  uint8_t   *cm = vpx_ff_cropTbl;  /* lookup table indexed by (pixel + residual); presumably a clamp-to-[0,255] table - TODO confirm */

+  /* prefetch vpx_ff_cropTbl: 8 prefetches at 32-byte steps, offsets 0..224 */

+  prefetch_load(vpx_ff_cropTbl);

+  prefetch_load(vpx_ff_cropTbl +  32);

+  prefetch_load(vpx_ff_cropTbl +  64);

+  prefetch_load(vpx_ff_cropTbl +  96);

+  prefetch_load(vpx_ff_cropTbl + 128);

+  prefetch_load(vpx_ff_cropTbl + 160);

+  prefetch_load(vpx_ff_cropTbl + 192);

+  prefetch_load(vpx_ff_cropTbl + 224);

+  for (i = 0; i < 4; ++i) {  /* one iteration per destination column */

+      dest_pix = (dest + i);  /* top pixel of column i; the asm steps it down by dest_stride */

+    __asm__ __volatile__ (

+        /*

+          temp_1 = (input[0] + input[2]) * cospi_16_64;

+          step_0 = dct_const_round_shift(temp_1);

+          temp_2 = (input[0] - input[2]) * cospi_16_64;

+          step_1 = dct_const_round_shift(temp_2);

+        */

+        "lh       %[Temp0],             0(%[input])                     \n\t"

+        "lh       %[Temp1],             4(%[input])                     \n\t"

+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

+        "mthi     $zero,                $ac0                            \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"

+        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"

+        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"

+        "lh       %[Temp0],             2(%[input])                     \n\t"

+        "lh       %[Temp1],             6(%[input])                     \n\t"

+        "extp     %[step_0],            $ac0,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"

+        "mthi     $zero,                $ac0                            \n\t"

+        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"

+        "extp     %[step_1],            $ac1,           31              \n\t"

+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"

+        "mthi     $zero,                $ac1                            \n\t"

+        /*

+          temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;

+          step_2 = dct_const_round_shift(temp1);

+        */

+        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"

+        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"

+        "extp     %[step_2],            $ac0,           31              \n\t"

+        /*

+          temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;

+          step_3 = dct_const_round_shift(temp2);

+        */

+        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"

+        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"

+        "extp     %[step_3],            $ac1,           31              \n\t"

+        /*

+          row 0: *dest_pix = cm[*dest_pix + ((step_0 + step_3 + 8) >> 4)];

+          row 1: same with (step_1 + step_2), after dest_pix += dest_stride;

+          row 2: same with (step_1 - step_2), after dest_pix += dest_stride;

+          row 3: same with (step_0 - step_3), after dest_pix += dest_stride;

+        */

+        "add      %[Temp0],             %[step_0],      %[step_3]       \n\t"

+        "addi     %[Temp0],             %[Temp0],       8               \n\t"

+        "sra      %[Temp0],             %[Temp0],       4               \n\t"

+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"

+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"

+        "add      %[Temp0],             %[step_1],      %[step_2]       \n\t"

+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"

+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"

+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

+        "addi     %[Temp0],             %[Temp0],       8               \n\t"

+        "sra      %[Temp0],             %[Temp0],       4               \n\t"

+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"

+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"

+        "sub      %[Temp0],             %[step_1],      %[step_2]       \n\t"

+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"

+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"

+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

+        "addi     %[Temp0],             %[Temp0],       8               \n\t"

+        "sra      %[Temp0],             %[Temp0],       4               \n\t"

+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"

+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"

+        "sub      %[Temp0],             %[step_0],      %[step_3]       \n\t"

+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"

+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"

+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

+        "addi     %[Temp0],             %[Temp0],       8               \n\t"

+        "sra      %[Temp0],             %[Temp0],       4               \n\t"

+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"

+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"

+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"

+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"

+      : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),

+        [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),

+        [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),

+        [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),

+        [dest_pix] "+r" (dest_pix)  /* NOTE(review): the sb stores write through dest_pix, but no "memory" clobber is declared */

+      : [const_2_power_13] "r" (const_2_power_13),

+        [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),

+        [cospi_24_64] "r" (cospi_24_64),

+        [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)

+    );

+    input += 4;  /* next group of 4 intermediate coefficients */

+  }

+}

+void vp9_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,

+                              int dest_stride) {  /* 4x4 inverse transform entry point: row pass into out, then the column pass that adds into dest */

+  DECLARE_ALIGNED(32, int16_t, out[4 * 4]);  /* intermediate buffer shared by the two passes */

+  int16_t *outptr = out;

+  uint32_t pos = 45;

+  /* bit position for extract from acc: wrdsp with mask 1 writes pos (45) into DSPControl */

+  __asm__ __volatile__ (

+    "wrdsp      %[pos],     1           \n\t"

+    :

+    : [pos] "r" (pos)

+  );

+  // Rows: vp9_idct4_rows_dspr2 writes its transposed results into out

+  vp9_idct4_rows_dspr2(input, outptr);

+  // Columns: second pass reads out and updates dest using dest_stride

+  vp9_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);

+}

+void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,

+                             int dest_stride) {  /* DC-only 4x4 path: derive one value a1 from input[0] and add it, with saturation, to 4 rows of 4 bytes at dest */

+  int       a1, absa1;

+  int       r;

+  int32_t   out;

+  int       t2, vector_a1, vector_a;

+  uint32_t  pos = 45;

+  int16_t   input_dc = input[0];

+  /* bit position for extract from acc: wrdsp with mask 1 writes pos (45) into DSPControl */

+  __asm__ __volatile__ (

+    "wrdsp      %[pos],     1           \n\t"

+    :

+    : [pos] "r" (pos)

+  );

+  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc);  /* macro applies the cospi_16_64 multiply + round shift twice to the DC coefficient */

+  __asm__ __volatile__ (

+      "addi     %[out],     %[out],    8       \n\t"  /* out += 8 (rounding term for the shift by 4) */

+      "sra      %[a1],      %[out],    4       \n\t"  /* a1 = out >> 4 (arithmetic) */

+      : [out] "+r" (out), [a1] "=r" (a1)

+      :

+  );

+  if (a1 < 0) {

+    /* use quad-byte: a1 is negative, so |a1| is subtracted from every byte with saturation

+     * input and output memory are four byte aligned */

+    __asm__ __volatile__ (

+        "abs        %[absa1],     %[a1]         \n\t"

+        "replv.qb   %[vector_a1], %[absa1]      \n\t"  /* NOTE(review): replv.qb replicates only the low byte of absa1; confirm |a1| <= 255 for all inputs */

+        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)

+        : [a1] "r" (a1)

+    );

+    for (r = 4; r--;) {  /* 4 rows; one word = 4 bytes per row */

+      __asm__ __volatile__ (

+          "lw             %[t2],          0(%[dest])                      \n\t"

+          "subu_s.qb      %[vector_a],    %[t2],          %[vector_a1]    \n\t"

+          "sw             %[vector_a],    0(%[dest])                      \n\t"

+          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"  /* advance dest to the next row */

+          : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),

+            [dest] "+&r" (dest)

+          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)

+      );

+    }

+  } else {

+    /* use quad-byte: a1 is non-negative, so it is added to every byte with saturation

+     * input and output memory are four byte aligned */

+    __asm__ __volatile__ (

+        "replv.qb       %[vector_a1],   %[a1]     \n\t"  /* NOTE(review): only the low byte of a1 is replicated; confirm a1 <= 255 for all inputs */

+        : [vector_a1] "=r" (vector_a1)

+        : [a1] "r" (a1)

+    );

+    for (r = 4; r--;) {  /* 4 rows; one word = 4 bytes per row */

+      __asm__ __volatile__ (

+          "lw           %[t2],          0(%[dest])                        \n\t"

+          "addu_s.qb    %[vector_a],    %[t2],            %[vector_a1]    \n\t"

+          "sw           %[vector_a],    0(%[dest])                        \n\t"

+          "add          %[dest],        %[dest],          %[dest_stride]  \n\t"  /* advance dest to the next row */

+          : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),

+            [dest] "+&r" (dest)

+          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)

+      );

+    }

+  }

+}

+void iadst4_dspr2(const int16_t *input, int16_t *output) {  /* 4-point 1-D transform in plain C (no inline asm): reads input[0..3], writes output[0..3] */

+  int s0, s1, s2, s3, s4, s5, s6, s7;

+  int x0, x1, x2, x3;

+  x0 = input[0];

+  x1 = input[1];

+  x2 = input[2];

+  x3 = input[3];

+  if (!(x0 | x1 | x2 | x3)) {  /* all four inputs are zero: output is all zero, skip the arithmetic */

+    output[0] = output[1] = output[2] = output[3] = 0;

+    return;

+  }

+  s0 = sinpi_1_9 * x0;  /* products of the inputs with the sinpi_k_9 constants */

+  s1 = sinpi_2_9 * x0;

+  s2 = sinpi_3_9 * x1;

+  s3 = sinpi_4_9 * x2;

+  s4 = sinpi_1_9 * x2;

+  s5 = sinpi_2_9 * x3;

+  s6 = sinpi_4_9 * x3;

+  s7 = x0 - x2 + x3;  /* unscaled combination; multiplied by sinpi_3_9 below */

+  x0 = s0 + s3 + s5;  /* x0..x3 are reused here as intermediate sums */

+  x1 = s1 - s4 - s6;

+  x2 = sinpi_3_9 * s7;

+  x3 = s2;

+  s0 = x0 + x3;  /* s0..s3 are reused as the pre-rounding results */

+  s1 = x1 + x3;

+  s2 = x2;

+  s3 = x0 + x1 - x3;

+  // 1-D transform scaling factor is sqrt(2).

+  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)

+  // + 1b (addition) = 29b.

+  // Hence the output bit depth is 15b.

+  output[0] = dct_const_round_shift(s0);

+  output[1] = dct_const_round_shift(s1);

+  output[2] = dct_const_round_shift(s2);

+  output[3] = dct_const_round_shift(s3);

+}

+#endif  // #if HAVE_DSPR2

--- /dev/null

+++ b/vpx_dsp/mips/itrans8_dspr2.c

@@ -1,0 +1,668 @@

+/*

+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "./vpx_config.h"

+#include "./vpx_dsp_rtcd.h"

+#include "vpx_dsp/mips/inv_txfm_dspr2.h"

+#include "vpx_dsp/txfm_common.h"

+#if HAVE_DSPR2

+/* Row pass of the 8x8 inverse DCT (MIPS DSPr2 inline assembly).
+ *
+ * input:   coefficient rows, 8 int16_t values per row; advanced by 8 values
+ *          after every row.
+ * output:  intermediate buffer. The 8 results of one row are stored 16 bytes
+ *          (8 int16_t) apart and the pointer is then advanced by one
+ *          int16_t, i.e. the result is written transposed.
+ * no_rows: number of input rows to process.
+ *
+ * Every product is accumulated on top of 8192 (const_2_power_13) and read
+ * back with "extp ..., 31", which is how the dct_const_round_shift() steps
+ * named in the inline comments are realised. extp depends on the extract
+ * position in DSPControl; the callers in this file program it with
+ * "wrdsp" (pos = 45) before calling.
+ */
+void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) {
+  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+  const int const_2_power_13 = 8192;
+  int Temp0, Temp1, Temp2, Temp3, Temp4;
+  int i;
+  for (i = no_rows; i--; ) {
+    __asm__ __volatile__ (
+        /*
+          temp_1 = (input[0] + input[4]) * cospi_16_64;
+          step2_0 = dct_const_round_shift(temp_1);
+          temp_2 = (input[0] - input[4]) * cospi_16_64;
+          step2_1 = dct_const_round_shift(temp_2);
+        */
+        "lh       %[Temp0],             0(%[input])                     \n\t"
+        "lh       %[Temp1],             8(%[input])                     \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
+        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
+        "extp     %[Temp4],             $ac0,           31              \n\t"
+        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
+        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "extp     %[Temp2],             $ac1,           31              \n\t"
+        /*
+          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
+          step2_2 = dct_const_round_shift(temp_1);
+        */
+        "lh       %[Temp0],             4(%[input])                     \n\t"
+        "lh       %[Temp1],             12(%[input])                    \n\t"
+        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "extp     %[Temp3],             $ac0,           31              \n\t"
+        /*
+          step1_1 = step2_1 + step2_2;
+          step1_2 = step2_1 - step2_2;
+        */
+        "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"
+        "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"
+        /*
+          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
+          step2_3 = dct_const_round_shift(temp_2);
+        */
+        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
+        "extp     %[Temp1],             $ac1,           31              \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        /*
+          step1_0 = step2_0 + step2_3;
+          step1_3 = step2_0 - step2_3;
+        */
+        "add      %[step1_0],           %[Temp4],       %[Temp1]        \n\t"
+        "sub      %[step1_3],           %[Temp4],       %[Temp1]        \n\t"
+        /*
+          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+          step1_4 = dct_const_round_shift(temp_1);
+        */
+        "lh       %[Temp0],             2(%[input])                     \n\t"
+        "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "lh       %[Temp1],             14(%[input])                    \n\t"
+        "lh       %[Temp0],             2(%[input])                     \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"
+        "extp     %[step1_4],           $ac0,           31              \n\t"
+        /*
+          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+          step1_7 = dct_const_round_shift(temp_2);
+        */
+        "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"
+        "extp     %[step1_7],           $ac1,           31              \n\t"
+        /*
+          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+          step1_5 = dct_const_round_shift(temp_1);
+        */
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "lh       %[Temp0],             10(%[input])                    \n\t"
+        "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"
+        "lh       %[Temp1],             6(%[input])                     \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"
+        "extp     %[step1_5],           $ac0,           31              \n\t"
+        /*
+          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+          step1_6 = dct_const_round_shift(temp_2);
+        */
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "lh       %[Temp0],             10(%[input])                    \n\t"
+        "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"
+        "lh       %[Temp1],             6(%[input])                     \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"
+        "extp     %[step1_6],           $ac1,           31              \n\t"
+        /*
+          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
+          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
+        */
+        "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"
+        "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"
+        "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"
+        "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"
+        "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"
+        "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"
+        /*
+          step1_4 = step1_4 + step1_5;
+          step1_7 = step1_6 + step1_7;
+        */
+        "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"
+        "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"
+        "extp     %[step1_5],           $ac0,           31              \n\t"
+        "extp     %[step1_6],           $ac1,           31              \n\t"
+        /* final butterflies, stored one int16_t per output row */
+        "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
+        "sh       %[Temp0],             0(%[output])                    \n\t"
+        "add      %[Temp1],             %[step1_1],     %[step1_6]      \n\t"
+        "sh       %[Temp1],             16(%[output])                   \n\t"
+        "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
+        "sh       %[Temp0],             32(%[output])                   \n\t"
+        "add      %[Temp1],             %[step1_3],     %[step1_4]      \n\t"
+        "sh       %[Temp1],             48(%[output])                   \n\t"
+        "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
+        "sh       %[Temp0],             64(%[output])                   \n\t"
+        "sub      %[Temp1],             %[step1_2],     %[step1_5]      \n\t"
+        "sh       %[Temp1],             80(%[output])                   \n\t"
+        "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
+        "sh       %[Temp0],             96(%[output])                   \n\t"
+        "sub      %[Temp1],             %[step1_0],     %[step1_7]      \n\t"
+        "sh       %[Temp1],             112(%[output])                  \n\t"
+        : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
+          [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
+          [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
+          [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
+          [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+          [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+          [Temp4] "=&r" (Temp4)
+        : [const_2_power_13] "r" (const_2_power_13),
+          [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
+          [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
+          [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
+          [cospi_24_64] "r" (cospi_24_64),
+          [output] "r" (output), [input] "r" (input)
+        /* The asm loads from input, stores to output and overwrites the
+         * $ac0 (HI/LO) and $ac1 accumulators; tell the compiler so. */
+        : "memory", "hi", "lo", "$ac1hi", "$ac1lo"
+    );
+    input += 8;
+    output += 1;
+  }
+}

+/* Column pass of the 8x8 inverse DCT with reconstruction (MIPS DSPr2).
+ *
+ * input:       intermediate buffer; 8 consecutive int16_t values are consumed
+ *              per iteration and the pointer is advanced by 8 afterwards.
+ * dest:        destination pixels. Iteration i updates the 8 bytes at
+ *              dest + i, dest + i + dest_stride, ... (one destination column).
+ * dest_stride: byte distance between successive destination rows.
+ *
+ * Each 1-D result is rounded with (x + 16) >> 5, added to the pixel already
+ * in dest, mapped through the vpx_ff_cropTbl lookup table and stored back.
+ * As in the row pass, "extp ..., 31" depends on the DSPControl extract
+ * position that the callers in this file set with "wrdsp" (pos = 45).
+ */
+void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                                 int dest_stride) {
+  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+  int Temp0, Temp1, Temp2, Temp3;
+  int i;
+  const int const_2_power_13 = 8192;
+  uint8_t *dest_pix;
+  uint8_t *cm = vpx_ff_cropTbl;
+  /* prefetch vpx_ff_cropTbl */
+  prefetch_load(vpx_ff_cropTbl);
+  prefetch_load(vpx_ff_cropTbl +  32);
+  prefetch_load(vpx_ff_cropTbl +  64);
+  prefetch_load(vpx_ff_cropTbl +  96);
+  prefetch_load(vpx_ff_cropTbl + 128);
+  prefetch_load(vpx_ff_cropTbl + 160);
+  prefetch_load(vpx_ff_cropTbl + 192);
+  prefetch_load(vpx_ff_cropTbl + 224);
+  for (i = 0; i < 8; ++i) {
+    /* top pixel of destination column i */
+    dest_pix = (dest + i);
+    __asm__ __volatile__ (
+        /*
+          temp_1 = (input[0] + input[4]) * cospi_16_64;
+          step2_0 = dct_const_round_shift(temp_1);
+          temp_2 = (input[0] - input[4]) * cospi_16_64;
+          step2_1 = dct_const_round_shift(temp_2);
+        */
+        "lh       %[Temp0],             0(%[input])                     \n\t"
+        "lh       %[Temp1],             8(%[input])                     \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
+        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
+        "extp     %[step1_6],           $ac0,           31              \n\t"
+        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
+        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "extp     %[Temp2],             $ac1,           31              \n\t"
+        /*
+          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
+          step2_2 = dct_const_round_shift(temp_1);
+        */
+        "lh       %[Temp0],             4(%[input])                     \n\t"
+        "lh       %[Temp1],             12(%[input])                    \n\t"
+        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "extp     %[Temp3],             $ac0,           31              \n\t"
+        /*
+          step1_1 = step2_1 + step2_2;
+          step1_2 = step2_1 - step2_2;
+        */
+        "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"
+        "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"
+        /*
+          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
+          step2_3 = dct_const_round_shift(temp_2);
+        */
+        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
+        "extp     %[Temp1],             $ac1,           31              \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        /*
+          step1_0 = step2_0 + step2_3;
+          step1_3 = step2_0 - step2_3;
+          (step2_0 is held in the step1_6 register at this point)
+        */
+        "add      %[step1_0],           %[step1_6],     %[Temp1]        \n\t"
+        "sub      %[step1_3],           %[step1_6],     %[Temp1]        \n\t"
+        /*
+          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+          step1_4 = dct_const_round_shift(temp_1);
+        */
+        "lh       %[Temp0],             2(%[input])                     \n\t"
+        "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "lh       %[Temp1],             14(%[input])                    \n\t"
+        "lh       %[Temp0],             2(%[input])                     \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"
+        "extp     %[step1_4],           $ac0,           31              \n\t"
+        /*
+          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+          step1_7 = dct_const_round_shift(temp_2);
+        */
+        "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"
+        "extp     %[step1_7],           $ac1,           31              \n\t"
+        /*
+          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+          step1_5 = dct_const_round_shift(temp_1);
+        */
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "lh       %[Temp0],             10(%[input])                    \n\t"
+        "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"
+        "lh       %[Temp1],             6(%[input])                     \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"
+        "extp     %[step1_5],           $ac0,           31              \n\t"
+        /*
+          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+          step1_6 = dct_const_round_shift(temp_2);
+        */
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "lh       %[Temp0],             10(%[input])                    \n\t"
+        "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"
+        "lh       %[Temp1],             6(%[input])                     \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"
+        "extp     %[step1_6],           $ac1,           31              \n\t"
+        /*
+          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
+          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
+        */
+        "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"
+        "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"
+        "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"
+        "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"
+        "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"
+        "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"
+        /*
+          step1_4 = step1_4 + step1_5;
+          step1_7 = step1_6 + step1_7;
+        */
+        "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"
+        "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"
+        "extp     %[step1_5],           $ac0,           31              \n\t"
+        "extp     %[step1_6],           $ac1,           31              \n\t"
+        /* add block: pixel = cm[pixel + ((result + 16) >> 5)], 8 rows */
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "add      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "add      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "sub      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "sub      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
+          [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
+          [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
+          [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
+          [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+          [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+          [dest_pix] "+r" (dest_pix)
+        : [const_2_power_13] "r" (const_2_power_13),
+          [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
+          [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
+          [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
+          [cospi_24_64] "r" (cospi_24_64),
+          [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
+        /* The asm loads from input/cm/dest, stores to dest and overwrites
+         * the $ac0 (HI/LO) and $ac1 accumulators; tell the compiler so. */
+        : "memory", "hi", "lo", "$ac1hi", "$ac1lo"
+    );
+    input += 8;
+  }
+}

+/* Full 8x8 inverse DCT: runs the row pass over all 8 coefficient rows into a
+ * local 8x8 int16_t buffer, then runs the column pass, which adds the result
+ * to dest (dest_stride bytes between destination rows). */
+void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
+                              int dest_stride) {
+  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+  const uint32_t pos = 45;
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp    %[pos],    1    \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+  /* Pass 1: rows of the coefficient block into the intermediate buffer. */
+  idct8_rows_dspr2(input, out, 8);
+  /* Pass 2: columns of the intermediate buffer, accumulated into dest. */
+  idct8_columns_add_blk_dspr2(out, dest, dest_stride);
+}

+/* Reduced 8x8 inverse DCT: only the first 4 coefficient rows are run through
+ * the row pass. Because the row pass stores its results transposed, those 4
+ * rows fill elements 0..3 of each of the 8 rows of the intermediate buffer;
+ * elements 4..7 of every row are cleared explicitly before the column pass
+ * adds the result to dest (dest_stride bytes between destination rows). */
+void vp9_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
+                              int dest_stride) {
+  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+  int16_t *outptr = out;
+  uint32_t pos = 45;
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp    %[pos],    1    \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+  // First transform rows
+  idct8_rows_dspr2(input, outptr, 4);
+  /* Skip the 4 values the row pass wrote at the start of each buffer row. */
+  outptr += 4;
+  /* Two word stores (4 int16_t) per 16-byte buffer row zero elements 4..7.
+   * The "memory" clobber tells the compiler that out[] is written here,
+   * which it cannot see from the operand list alone. */
+  __asm__ __volatile__ (
+      "sw  $zero,   0(%[outptr])  \n\t"
+      "sw  $zero,   4(%[outptr])  \n\t"
+      "sw  $zero,  16(%[outptr])  \n\t"
+      "sw  $zero,  20(%[outptr])  \n\t"
+      "sw  $zero,  32(%[outptr])  \n\t"
+      "sw  $zero,  36(%[outptr])  \n\t"
+      "sw  $zero,  48(%[outptr])  \n\t"
+      "sw  $zero,  52(%[outptr])  \n\t"
+      "sw  $zero,  64(%[outptr])  \n\t"
+      "sw  $zero,  68(%[outptr])  \n\t"
+      "sw  $zero,  80(%[outptr])  \n\t"
+      "sw  $zero,  84(%[outptr])  \n\t"
+      "sw  $zero,  96(%[outptr])  \n\t"
+      "sw  $zero, 100(%[outptr])  \n\t"
+      "sw  $zero, 112(%[outptr])  \n\t"
+      "sw  $zero, 116(%[outptr])  \n\t"
+      :
+      : [outptr] "r" (outptr)
+      : "memory"
+  );
+  // Then transform columns and add to dest
+  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+}

+/* DC-only 8x8 inverse DCT: derives a single offset from input[0] and adds it,
+ * with unsigned byte saturation, to every pixel of the 8x8 block at dest.
+ *
+ * input:       coefficient block; only input[0] is read.
+ * dest:        destination block; accessed with word loads/stores, so it is
+ *              expected to be four-byte aligned (see the comments below).
+ * dest_stride: byte distance between successive destination rows.
+ */
+void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
+                             int dest_stride) {
+  uint32_t pos = 45;
+  int32_t out;
+  int32_t r;
+  int32_t a1, absa1;
+  int32_t t1, t2, vector_a1, vector_1, vector_2;
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+  /* a1 = (out + 16) >> 5 */
+  __asm__ __volatile__ (
+      "addi     %[out],     %[out],     16      \n\t"
+      "sra      %[a1],      %[out],     5       \n\t"
+      : [out] "+r" (out), [a1] "=r" (a1)
+      :
+  );
+  /* replv.qb replicates only the low 8 bits of its source, so a magnitude
+   * above 255 would wrap (e.g. 256 -> 0). Clamping to 255 is exact: a
+   * saturating add of 255 always yields 255 and a saturating subtract of 255
+   * always yields 0, which is what any larger offset must produce too. */
+  absa1 = (a1 < 0) ? -a1 : a1;
+  if (absa1 > 255) absa1 = 255;
+  __asm__ __volatile__ (
+      "replv.qb   %[vector_a1],   %[absa1]    \n\t"
+      : [vector_a1] "=r" (vector_a1)
+      : [absa1] "r" (absa1)
+  );
+  if (a1 < 0) {
+    /* use quad-byte
+     * input and output memory are four byte aligned */
+    for (r = 8; r--;) {
+      __asm__ __volatile__ (
+          "lw           %[t1],          0(%[dest])                      \n\t"
+          "lw           %[t2],          4(%[dest])                      \n\t"
+          "subu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"
+          "subu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"
+          "sw           %[vector_1],    0(%[dest])                      \n\t"
+          "sw           %[vector_2],    4(%[dest])                      \n\t"
+          "add          %[dest],        %[dest],        %[dest_stride]  \n\t"
+          : [t1] "=&r" (t1), [t2] "=&r" (t2),
+            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+            [dest] "+&r" (dest)
+          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+          /* dest[] is read and written behind the compiler's back */
+          : "memory"
+      );
+    }
+  } else {
+    /* use quad-byte
+     * input and output memory are four byte aligned */
+    for (r = 8; r--;) {
+      __asm__ __volatile__ (
+          "lw           %[t1],          0(%[dest])                      \n\t"
+          "lw           %[t2],          4(%[dest])                      \n\t"
+          "addu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"
+          "addu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"
+          "sw           %[vector_1],    0(%[dest])                      \n\t"
+          "sw           %[vector_2],    4(%[dest])                      \n\t"
+          "add          %[dest],        %[dest],        %[dest_stride]  \n\t"
+          : [t1] "=&r" (t1), [t2] "=&r" (t2),
+            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+            [dest] "+&r" (dest)
+          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+          /* dest[] is read and written behind the compiler's back */
+          : "memory"
+      );
+    }
+  }
+}

+/* 8-point inverse ADST on one vector.
+ *
+ * input:  eight int16_t coefficients.
+ * output: eight int16_t results.
+ *
+ * An all-zero input short-circuits to an all-zero output. Otherwise three
+ * butterfly stages are applied, rounding products with
+ * ROUND_POWER_OF_TWO(..., DCT_CONST_BITS).
+ */
+void iadst8_dspr2(const int16_t *input, int16_t *output) {
+  /* x[k] is loaded from input[load_order[k]]. */
+  static const int load_order[8] = { 7, 0, 5, 2, 3, 4, 1, 6 };
+  int x[8];
+  int s[8];
+  int any_bits = 0;
+  int k;
+  for (k = 0; k < 8; ++k) {
+    x[k] = input[load_order[k]];
+    any_bits |= x[k];
+  }
+  if (any_bits == 0) {
+    for (k = 0; k < 8; ++k) {
+      output[k] = 0;
+    }
+    return;
+  }
+  // stage 1
+  s[0] = cospi_2_64  * x[0] + cospi_30_64 * x[1];
+  s[1] = cospi_30_64 * x[0] - cospi_2_64  * x[1];
+  s[2] = cospi_10_64 * x[2] + cospi_22_64 * x[3];
+  s[3] = cospi_22_64 * x[2] - cospi_10_64 * x[3];
+  s[4] = cospi_18_64 * x[4] + cospi_14_64 * x[5];
+  s[5] = cospi_14_64 * x[4] - cospi_18_64 * x[5];
+  s[6] = cospi_26_64 * x[6] + cospi_6_64  * x[7];
+  s[7] = cospi_6_64  * x[6] - cospi_26_64 * x[7];
+  /* Sums go to x[0..3], differences to x[4..7]. */
+  for (k = 0; k < 4; ++k) {
+    x[k]     = ROUND_POWER_OF_TWO((s[k] + s[k + 4]), DCT_CONST_BITS);
+    x[k + 4] = ROUND_POWER_OF_TWO((s[k] - s[k + 4]), DCT_CONST_BITS);
+  }
+  // stage 2
+  for (k = 0; k < 4; ++k) {
+    s[k] = x[k];
+  }
+  s[4] =  cospi_8_64  * x[4] + cospi_24_64 * x[5];
+  s[5] =  cospi_24_64 * x[4] - cospi_8_64  * x[5];
+  s[6] = -cospi_24_64 * x[6] + cospi_8_64  * x[7];
+  s[7] =  cospi_8_64  * x[6] + cospi_24_64 * x[7];
+  x[0] = s[0] + s[2];
+  x[1] = s[1] + s[3];
+  x[2] = s[0] - s[2];
+  x[3] = s[1] - s[3];
+  x[4] = ROUND_POWER_OF_TWO((s[4] + s[6]), DCT_CONST_BITS);
+  x[5] = ROUND_POWER_OF_TWO((s[5] + s[7]), DCT_CONST_BITS);
+  x[6] = ROUND_POWER_OF_TWO((s[4] - s[6]), DCT_CONST_BITS);
+  x[7] = ROUND_POWER_OF_TWO((s[5] - s[7]), DCT_CONST_BITS);
+  // stage 3
+  s[2] = cospi_16_64 * (x[2] + x[3]);
+  s[3] = cospi_16_64 * (x[2] - x[3]);
+  s[6] = cospi_16_64 * (x[6] + x[7]);
+  s[7] = cospi_16_64 * (x[6] - x[7]);
+  x[2] = ROUND_POWER_OF_TWO((s[2]), DCT_CONST_BITS);
+  x[3] = ROUND_POWER_OF_TWO((s[3]), DCT_CONST_BITS);
+  x[6] = ROUND_POWER_OF_TWO((s[6]), DCT_CONST_BITS);
+  x[7] = ROUND_POWER_OF_TWO((s[7]), DCT_CONST_BITS);
+  /* Output permutation with alternating sign. */
+  output[0] =  x[0];
+  output[1] = -x[4];
+  output[2] =  x[6];
+  output[3] = -x[2];
+  output[4] =  x[3];
+  output[5] = -x[7];
+  output[6] =  x[5];
+  output[7] = -x[1];
+}

+#endif  // HAVE_DSPR2

--- a/vpx_dsp/vpx_dsp.mk

+++ b/vpx_dsp/vpx_dsp.mk

@@ -213,6 +213,13 @@

 DSP_SRCS-$(HAVE_MSA)   += mips/idct8x8_msa.c

 DSP_SRCS-$(HAVE_MSA)   += mips/idct16x16_msa.c

 DSP_SRCS-$(HAVE_MSA)   += mips/idct32x32_msa.c

+DSP_SRCS-$(HAVE_DSPR2) += mips/inv_txfm_dspr2.h

+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans4_dspr2.c

+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c

+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c

+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c

+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c

 endif  # CONFIG_VP9

 # quantization

--

⑨