shithub: libvpx

ref: 005552639b276709868a4a0f86f27a737c7c6917
parent: bd9cd9a1859aa464b3092f2023b3a4040166572d
author: Yaowu Xu <yaowu@google.com>
date: Tue Mar 12 07:24:04 EDT 2013

removed references to "LLM" and "x8"

The commit changed the names of files and functions to remove obsolete
references to LLM and x8.

Change-Id: I973b20fc1a55149ed68b5408b3874768e6f88516

--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -120,7 +120,7 @@
     }
 
     // Because the bitstream is not frozen yet, use the idct in the codebase.
-    vp9_short_idct4x4llm_c(test_temp_block, test_output_block, pitch);
+    vp9_short_idct4x4_c(test_temp_block, test_output_block, pitch);
 
     for (int j = 0; j < 16; ++j) {
       const int diff = test_input_block[j] - test_output_block[j];
--- /dev/null
+++ b/test/idct_test.cc
@@ -1,0 +1,118 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+extern "C" {
+#include "./vpx_config.h"
+#include "./vp8_rtcd.h"
+}
+#include "test/register_state_check.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+typedef void (*idct_fn_t)(short *input, unsigned char *pred_ptr,
+                          int pred_stride, unsigned char *dst_ptr,
+                          int dst_stride);
+namespace {
+class IDCTTest : public ::testing::TestWithParam<idct_fn_t> {
+  protected:
+    virtual void SetUp() {
+        int i;
+
+        UUT = GetParam();
+        memset(input, 0, sizeof(input));
+        /* Set up guard blocks */
+        for (i = 0; i < 256; i++)
+            output[i] = ((i & 0xF) < 4 && (i < 64)) ? 0 : -1;
+    }
+
+    idct_fn_t UUT;
+    short input[16];
+    unsigned char output[256];
+    unsigned char predict[256];
+};
+
+TEST_P(IDCTTest, TestGuardBlocks) {
+    int i;
+
+    for (i = 0; i < 256; i++)
+        if ((i & 0xF) < 4 && i < 64)
+            EXPECT_EQ(0, output[i]) << i;
+        else
+            EXPECT_EQ(255, output[i]);
+}
+
+TEST_P(IDCTTest, TestAllZeros) {
+    int i;
+
+    REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
+
+    for (i = 0; i < 256; i++)
+        if ((i & 0xF) < 4 && i < 64)
+            EXPECT_EQ(0, output[i]) << "i==" << i;
+        else
+            EXPECT_EQ(255, output[i]) << "i==" << i;
+}
+
+TEST_P(IDCTTest, TestAllOnes) {
+    int i;
+
+    input[0] = 4;
+    REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
+
+    for (i = 0; i < 256; i++)
+        if ((i & 0xF) < 4 && i < 64)
+            EXPECT_EQ(1, output[i]) << "i==" << i;
+        else
+            EXPECT_EQ(255, output[i]) << "i==" << i;
+}
+
+TEST_P(IDCTTest, TestAddOne) {
+    int i;
+
+    for (i = 0; i < 256; i++)
+        predict[i] = i;
+    input[0] = 4;
+    REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16));
+
+    for (i = 0; i < 256; i++)
+        if ((i & 0xF) < 4 && i < 64)
+            EXPECT_EQ(i+1, output[i]) << "i==" << i;
+        else
+            EXPECT_EQ(255, output[i]) << "i==" << i;
+}
+
+TEST_P(IDCTTest, TestWithData) {
+    int i;
+
+    for (i = 0; i < 16; i++)
+        input[i] = i;
+
+    REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
+
+    for (i = 0; i < 256; i++)
+        if ((i & 0xF) > 3 || i > 63)
+            EXPECT_EQ(255, output[i]) << "i==" << i;
+        else if (i == 0)
+            EXPECT_EQ(11, output[i]) << "i==" << i;
+        else if (i == 34)
+            EXPECT_EQ(1, output[i]) << "i==" << i;
+        else if (i == 2 || i == 17 || i == 32)
+            EXPECT_EQ(3, output[i]) << "i==" << i;
+        else
+            EXPECT_EQ(0, output[i]) << "i==" << i;
+}
+
+INSTANTIATE_TEST_CASE_P(C, IDCTTest,
+                        ::testing::Values(vp8_short_idct4x4llm_c));
+#if HAVE_MMX
+INSTANTIATE_TEST_CASE_P(MMX, IDCTTest,
+                        ::testing::Values(vp8_short_idct4x4llm_mmx));
+#endif
+}
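
A note on the guard-block scheme above: the tests treat the 256-byte
output array as a 16x16 surface with stride 16, and the transform under
test may write only its top-left 4x4 block; every other byte is a 0xFF
guard, which the EXPECT_EQ(255, ...) checks verify afterwards. A minimal
standalone sketch of the index predicate (the helper name is ours, not
part of the test):

#include <assert.h>

/* True when byte i of a 16x16, stride-16 surface lies inside the
 * top-left 4x4 block -- the only region the IDCT may write. */
static int in_4x4_block(int i) {
    return (i & 0xF) < 4    /* column < 4 */
        && i < 64;          /* row < 4 */
}

int main(void) {
    assert(in_4x4_block(0));    /* row 0, col 0 */
    assert(in_4x4_block(51));   /* row 3, col 3 */
    assert(!in_4x4_block(4));   /* row 0, col 4: guard byte */
    assert(!in_4x4_block(64));  /* row 4, col 0: guard byte */
    return 0;
}
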
--- a/test/idctllm_test.cc
+++ /dev/null
@@ -1,126 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-extern "C" {
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
-}
-#include "test/register_state_check.h"
-#include "third_party/googletest/src/include/gtest/gtest.h"
-
-typedef void (*idct_fn_t)(short *input, unsigned char *pred_ptr,
-                          int pred_stride, unsigned char *dst_ptr,
-                          int dst_stride);
-namespace {
-class IDCTTest : public ::testing::TestWithParam<idct_fn_t>
-{
-  protected:
-    virtual void SetUp()
-    {
-        int i;
-
-        UUT = GetParam();
-        memset(input, 0, sizeof(input));
-        /* Set up guard blocks */
-        for(i=0; i<256; i++)
-            output[i] = ((i&0xF)<4&&(i<64))?0:-1;
-    }
-
-    idct_fn_t UUT;
-    short input[16];
-    unsigned char output[256];
-    unsigned char predict[256];
-};
-
-TEST_P(IDCTTest, TestGuardBlocks)
-{
-    int i;
-
-    for(i=0; i<256; i++)
-        if((i&0xF) < 4 && i<64)
-            EXPECT_EQ(0, output[i]) << i;
-        else
-            EXPECT_EQ(255, output[i]);
-}
-
-TEST_P(IDCTTest, TestAllZeros)
-{
-    int i;
-
-    REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
-
-    for(i=0; i<256; i++)
-        if((i&0xF) < 4 && i<64)
-            EXPECT_EQ(0, output[i]) << "i==" << i;
-        else
-            EXPECT_EQ(255, output[i]) << "i==" << i;
-}
-
-TEST_P(IDCTTest, TestAllOnes)
-{
-    int i;
-
-    input[0] = 4;
-    REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
-
-    for(i=0; i<256; i++)
-        if((i&0xF) < 4 && i<64)
-            EXPECT_EQ(1, output[i]) << "i==" << i;
-        else
-            EXPECT_EQ(255, output[i]) << "i==" << i;
-}
-
-TEST_P(IDCTTest, TestAddOne)
-{
-    int i;
-
-    for(i=0; i<256; i++)
-        predict[i] = i;
-
-    input[0] = 4;
-    REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16));
-
-    for(i=0; i<256; i++)
-        if((i&0xF) < 4 && i<64)
-            EXPECT_EQ(i+1, output[i]) << "i==" << i;
-        else
-            EXPECT_EQ(255, output[i]) << "i==" << i;
-}
-
-TEST_P(IDCTTest, TestWithData)
-{
-    int i;
-
-    for(i=0; i<16; i++)
-        input[i] = i;
-
-    REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
-
-    for(i=0; i<256; i++)
-        if((i&0xF) > 3 || i>63)
-            EXPECT_EQ(255, output[i]) << "i==" << i;
-        else if(i == 0)
-            EXPECT_EQ(11, output[i]) << "i==" << i;
-        else if(i == 34)
-            EXPECT_EQ(1, output[i]) << "i==" << i;
-        else if(i == 2 || i == 17 || i == 32)
-            EXPECT_EQ(3, output[i]) << "i==" << i;
-        else
-            EXPECT_EQ(0, output[i]) << "i==" << i;
-}
-
-INSTANTIATE_TEST_CASE_P(C, IDCTTest,
-                        ::testing::Values(vp8_short_idct4x4llm_c));
-#if HAVE_MMX
-INSTANTIATE_TEST_CASE_P(MMX, IDCTTest,
-                        ::testing::Values(vp8_short_idct4x4llm_mmx));
-#endif
-}
--- a/test/test.mk
+++ b/test/test.mk
@@ -47,7 +47,7 @@
 LIBVPX_TEST_SRCS-yes                   += vp8_boolcoder_test.cc
 endif
 
-LIBVPX_TEST_SRCS-yes                   += idctllm_test.cc
+LIBVPX_TEST_SRCS-yes                   += idct_test.cc
 LIBVPX_TEST_SRCS-yes                   += intrapred_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC)    += pp_filter_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += sad_test.cc
--- /dev/null
+++ b/vp9/common/ppc/vp9_idct_altivec.asm
@@ -1,0 +1,189 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl short_idct4x4_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+    lis     \R0, \LABEL@ha
+    la      \R1, \LABEL@l(\R0)
+    lvx     \V, \OFF, \R1
+.endm
+
+;# r3 short *input
+;# r4 short *output
+;# r5 int pitch
+    .align 2
+short_idct4x4_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xfff8
+    mtspr   256, r12            ;# set VRSAVE
+
+    load_c v8, sinpi8sqrt2, 0, r9, r10
+    load_c v9, cospi8sqrt2minus1, 0, r9, r10
+    load_c v10, hi_hi, 0, r9, r10
+    load_c v11, lo_lo, 0, r9, r10
+    load_c v12, shift_16, 0, r9, r10
+
+    li      r10,  16
+    lvx     v0,   0, r3         ;# input ip[0], ip[ 4]
+    lvx     v1, r10, r3         ;# input ip[8], ip[12]
+
+    ;# first pass
+    vupkhsh v2, v0
+    vupkhsh v3, v1
+    vaddsws v6, v2, v3          ;# a1 = ip[0]+ip[8]
+    vsubsws v7, v2, v3          ;# b1 = ip[0]-ip[8]
+
+    vupklsh v0, v0
+    vmulosh v4, v0, v8
+    vsraw   v4, v4, v12
+    vaddsws v4, v4, v0          ;# ip[ 4] * sin(pi/8) * sqrt(2)
+
+    vupklsh v1, v1
+    vmulosh v5, v1, v9
+    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)
+    vaddsws v5, v5, v1
+
+    vsubsws v4, v4, v5          ;# c1
+
+    vmulosh v3, v1, v8
+    vsraw   v3, v3, v12
+    vaddsws v3, v3, v1          ;# ip[12] * sin(pi/8) * sqrt(2)
+
+    vmulosh v5, v0, v9
+    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)
+    vaddsws v5, v5, v0
+
+    vaddsws v3, v3, v5          ;# d1
+
+    vaddsws v0, v6, v3          ;# a1 + d1
+    vsubsws v3, v6, v3          ;# a1 - d1
+
+    vaddsws v1, v7, v4          ;# b1 + c1
+    vsubsws v2, v7, v4          ;# b1 - c1
+
+    ;# transpose input
+    vmrghw  v4, v0, v1          ;# a0 b0 a1 b1
+    vmrghw  v5, v2, v3          ;# c0 d0 c1 d1
+
+    vmrglw  v6, v0, v1          ;# a2 b2 a3 b3
+    vmrglw  v7, v2, v3          ;# c2 d2 c3 d3
+
+    vperm   v0, v4, v5, v10     ;# a0 b0 c0 d0
+    vperm   v1, v4, v5, v11     ;# a1 b1 c1 d1
+
+    vperm   v2, v6, v7, v10     ;# a2 b2 c2 d2
+    vperm   v3, v6, v7, v11     ;# a3 b3 c3 d3
+
+    ;# second pass
+    vaddsws v6, v0, v2          ;# a1 = ip[0]+ip[8]
+    vsubsws v7, v0, v2          ;# b1 = ip[0]-ip[8]
+
+    vmulosh v4, v1, v8
+    vsraw   v4, v4, v12
+    vaddsws v4, v4, v1          ;# ip[ 4] * sin(pi/8) * sqrt(2)
+
+    vmulosh v5, v3, v9
+    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)
+    vaddsws v5, v5, v3
+
+    vsubsws v4, v4, v5          ;# c1
+
+    vmulosh v2, v3, v8
+    vsraw   v2, v2, v12
+    vaddsws v2, v2, v3          ;# ip[12] * sin(pi/8) * sqrt(2)
+
+    vmulosh v5, v1, v9
+    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)
+    vaddsws v5, v5, v1
+
+    vaddsws v3, v2, v5          ;# d1
+
+    vaddsws v0, v6, v3          ;# a1 + d1
+    vsubsws v3, v6, v3          ;# a1 - d1
+
+    vaddsws v1, v7, v4          ;# b1 + c1
+    vsubsws v2, v7, v4          ;# b1 - c1
+
+    vspltish v6, 4
+    vspltish v7, 3
+
+    vpkswss v0, v0, v1
+    vpkswss v1, v2, v3
+
+    vaddshs v0, v0, v6
+    vaddshs v1, v1, v6
+
+    vsrah   v0, v0, v7
+    vsrah   v1, v1, v7
+
+    ;# transpose output
+    vmrghh  v2, v0, v1          ;# a0 c0 a1 c1 a2 c2 a3 c3
+    vmrglh  v3, v0, v1          ;# b0 d0 b1 d1 b2 d2 b3 d3
+
+    vmrghh  v0, v2, v3          ;# a0 b0 c0 d0 a1 b1 c1 d1
+    vmrglh  v1, v2, v3          ;# a2 b2 c2 d2 a3 b3 c3 d3
+
+    stwu    r1,-416(r1)         ;# create space on the stack
+
+    stvx    v0,  0, r1
+    lwz     r6, 0(r1)
+    stw     r6, 0(r4)
+    lwz     r6, 4(r1)
+    stw     r6, 4(r4)
+
+    add     r4, r4, r5
+
+    lwz     r6,  8(r1)
+    stw     r6,  0(r4)
+    lwz     r6, 12(r1)
+    stw     r6,  4(r4)
+
+    add     r4, r4, r5
+
+    stvx    v1,  0, r1
+    lwz     r6, 0(r1)
+    stw     r6, 0(r4)
+    lwz     r6, 4(r1)
+    stw     r6, 4(r4)
+
+    add     r4, r4, r5
+
+    lwz     r6,  8(r1)
+    stw     r6,  0(r4)
+    lwz     r6, 12(r1)
+    stw     r6,  4(r4)
+
+    addi    r1, r1, 416         ;# recover stack
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .align 4
+sinpi8sqrt2:
+    .short  35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468
+
+    .align 4
+cospi8sqrt2minus1:
+    .short  20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091
+
+    .align 4
+shift_16:
+    .long      16,    16,    16,    16
+
+    .align 4
+hi_hi:
+    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
+
+    .align 4
+lo_lo:
+    .byte     8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
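
For readers who do not speak AltiVec, here is a scalar C sketch of the
transform short_idct4x4_ppc implements: two 4-point passes using the Q16
constants above (35468 ~= sin(pi/8)*sqrt(2)*2^16, and 20091 ~=
(cos(pi/8)*sqrt(2) - 1)*2^16), followed by a (+4 >> 3) rounding that
matches the vspltish/vsrah tail of the vector code. It mirrors the
reference vp8_short_idct4x4llm_c and is a companion sketch, not part of
this commit:

static const int sinpi8sqrt2 = 35468;        /* sin(pi/8)*sqrt(2), Q16 */
static const int cospi8sqrt2minus1 = 20091;  /* cos(pi/8)*sqrt(2)-1, Q16 */

void short_idct4x4_scalar(short *input, short *output, int pitch) {
    short *ip = input;
    short *op = output;
    const int shortpitch = pitch >> 1;   /* pitch is in bytes */
    int i, a1, b1, c1, d1, temp1, temp2;

    /* First pass: down each column (ip[0], ip[4], ip[8], ip[12]). */
    for (i = 0; i < 4; i++) {
        a1 = ip[0] + ip[8];
        b1 = ip[0] - ip[8];

        temp1 = (ip[4] * sinpi8sqrt2) >> 16;
        temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
        c1 = temp1 - temp2;

        temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
        temp2 = (ip[12] * sinpi8sqrt2) >> 16;
        d1 = temp1 + temp2;

        op[shortpitch * 0] = a1 + d1;
        op[shortpitch * 3] = a1 - d1;
        op[shortpitch * 1] = b1 + c1;
        op[shortpitch * 2] = b1 - c1;

        ip++;
        op++;
    }

    /* Second pass: along each row, with final rounding and descale. */
    ip = output;
    op = output;
    for (i = 0; i < 4; i++) {
        a1 = ip[0] + ip[2];
        b1 = ip[0] - ip[2];

        temp1 = (ip[1] * sinpi8sqrt2) >> 16;
        temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
        c1 = temp1 - temp2;

        temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
        temp2 = (ip[3] * sinpi8sqrt2) >> 16;
        d1 = temp1 + temp2;

        op[0] = (a1 + d1 + 4) >> 3;
        op[3] = (a1 - d1 + 4) >> 3;
        op[1] = (b1 + c1 + 4) >> 3;
        op[2] = (b1 - c1 + 4) >> 3;

        ip += shortpitch;
        op += shortpitch;
    }
}
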
--- a/vp9/common/ppc/vp9_idctllm_altivec.asm
+++ /dev/null
@@ -1,189 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl short_idct4x4llm_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
-    .align 2
-short_idct4x4llm_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    load_c v8, sinpi8sqrt2, 0, r9, r10
-    load_c v9, cospi8sqrt2minus1, 0, r9, r10
-    load_c v10, hi_hi, 0, r9, r10
-    load_c v11, lo_lo, 0, r9, r10
-    load_c v12, shift_16, 0, r9, r10
-
-    li      r10,  16
-    lvx     v0,   0, r3         ;# input ip[0], ip[ 4]
-    lvx     v1, r10, r3         ;# input ip[8], ip[12]
-
-    ;# first pass
-    vupkhsh v2, v0
-    vupkhsh v3, v1
-    vaddsws v6, v2, v3          ;# a1 = ip[0]+ip[8]
-    vsubsws v7, v2, v3          ;# b1 = ip[0]-ip[8]
-
-    vupklsh v0, v0
-    vmulosh v4, v0, v8
-    vsraw   v4, v4, v12
-    vaddsws v4, v4, v0          ;# ip[ 4] * sin(pi/8) * sqrt(2)
-
-    vupklsh v1, v1
-    vmulosh v5, v1, v9
-    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v1
-
-    vsubsws v4, v4, v5          ;# c1
-
-    vmulosh v3, v1, v8
-    vsraw   v3, v3, v12
-    vaddsws v3, v3, v1          ;# ip[12] * sin(pi/8) * sqrt(2)
-
-    vmulosh v5, v0, v9
-    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v0
-
-    vaddsws v3, v3, v5          ;# d1
-
-    vaddsws v0, v6, v3          ;# a1 + d1
-    vsubsws v3, v6, v3          ;# a1 - d1
-
-    vaddsws v1, v7, v4          ;# b1 + c1
-    vsubsws v2, v7, v4          ;# b1 - c1
-
-    ;# transpose input
-    vmrghw  v4, v0, v1          ;# a0 b0 a1 b1
-    vmrghw  v5, v2, v3          ;# c0 d0 c1 d1
-
-    vmrglw  v6, v0, v1          ;# a2 b2 a3 b3
-    vmrglw  v7, v2, v3          ;# c2 d2 c3 d3
-
-    vperm   v0, v4, v5, v10     ;# a0 b0 c0 d0
-    vperm   v1, v4, v5, v11     ;# a1 b1 c1 d1
-
-    vperm   v2, v6, v7, v10     ;# a2 b2 c2 d2
-    vperm   v3, v6, v7, v11     ;# a3 b3 c3 d3
-
-    ;# second pass
-    vaddsws v6, v0, v2          ;# a1 = ip[0]+ip[8]
-    vsubsws v7, v0, v2          ;# b1 = ip[0]-ip[8]
-
-    vmulosh v4, v1, v8
-    vsraw   v4, v4, v12
-    vaddsws v4, v4, v1          ;# ip[ 4] * sin(pi/8) * sqrt(2)
-
-    vmulosh v5, v3, v9
-    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v3
-
-    vsubsws v4, v4, v5          ;# c1
-
-    vmulosh v2, v3, v8
-    vsraw   v2, v2, v12
-    vaddsws v2, v2, v3          ;# ip[12] * sin(pi/8) * sqrt(2)
-
-    vmulosh v5, v1, v9
-    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v1
-
-    vaddsws v3, v2, v5          ;# d1
-
-    vaddsws v0, v6, v3          ;# a1 + d1
-    vsubsws v3, v6, v3          ;# a1 - d1
-
-    vaddsws v1, v7, v4          ;# b1 + c1
-    vsubsws v2, v7, v4          ;# b1 - c1
-
-    vspltish v6, 4
-    vspltish v7, 3
-
-    vpkswss v0, v0, v1
-    vpkswss v1, v2, v3
-
-    vaddshs v0, v0, v6
-    vaddshs v1, v1, v6
-
-    vsrah   v0, v0, v7
-    vsrah   v1, v1, v7
-
-    ;# transpose output
-    vmrghh  v2, v0, v1          ;# a0 c0 a1 c1 a2 c2 a3 c3
-    vmrglh  v3, v0, v1          ;# b0 d0 b1 d1 b2 d2 b3 d3
-
-    vmrghh  v0, v2, v3          ;# a0 b0 c0 d0 a1 b1 c1 d1
-    vmrglh  v1, v2, v3          ;# a2 b2 c2 d2 a3 b3 c3 d3
-
-    stwu    r1,-416(r1)         ;# create space on the stack
-
-    stvx    v0,  0, r1
-    lwz     r6, 0(r1)
-    stw     r6, 0(r4)
-    lwz     r6, 4(r1)
-    stw     r6, 4(r4)
-
-    add     r4, r4, r5
-
-    lwz     r6,  8(r1)
-    stw     r6,  0(r4)
-    lwz     r6, 12(r1)
-    stw     r6,  4(r4)
-
-    add     r4, r4, r5
-
-    stvx    v1,  0, r1
-    lwz     r6, 0(r1)
-    stw     r6, 0(r4)
-    lwz     r6, 4(r1)
-    stw     r6, 4(r4)
-
-    add     r4, r4, r5
-
-    lwz     r6,  8(r1)
-    stw     r6,  0(r4)
-    lwz     r6, 12(r1)
-    stw     r6,  4(r4)
-
-    addi    r1, r1, 416         ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 4
-sinpi8sqrt2:
-    .short  35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468
-
-    .align 4
-cospi8sqrt2minus1:
-    .short  20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091
-
-    .align 4
-shift_16:
-    .long      16,    16,    16,    16
-
-    .align 4
-hi_hi:
-    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
-
-    .align 4
-lo_lo:
-    .byte     8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
--- a/vp9/common/ppc/vp9_systemdependent.c
+++ b/vp9/common/ppc/vp9_systemdependent.c
@@ -63,7 +63,7 @@
 void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
 void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
 
-extern void short_idct4x4llm_ppc(short *input, short *output, int pitch);
+extern void short_idct4x4_ppc(short *input, short *output, int pitch);
 
 // Generic C
 extern subpixel_predict_function vp9_sixtap_predict_c;
@@ -83,8 +83,8 @@
 void vp9_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
 void vp9_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
 
-extern void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch);
-extern void vp9_short_idct4x4llm_c(short *input, short *output, int pitch);
+extern void vp9_short_idct4x4_1_c(short *input, short *output, int pitch);
+extern void vp9_short_idct4x4_c(short *input, short *output, int pitch);
 extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
 
 // PPC
@@ -139,8 +139,8 @@
   vp9_sixtap_predict8x4                = sixtap_predict8x4_ppc;
   vp9_sixtap_predict                   = sixtap_predict_ppc;
 
-  vp8_short_idct4x4_1                  = vp9_short_idct4x4llm_1_c;
-  vp8_short_idct4x4                    = short_idct4x4llm_ppc;
+  vp8_short_idct4x4_1                  = vp9_short_idct4x4_1_c;
+  vp8_short_idct4x4                    = short_idct4x4_ppc;
   vp8_dc_only_idct                      = vp8_dc_only_idct_c;
 
   vp8_lf_mbvfull                       = loop_filter_mbv_ppc;
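
The hunk above shows libvpx's runtime-dispatch pattern on PPC: globally
visible function pointers are bound once at initialization, defaulting
to the portable C routines and switching to platform code when the CPU
supports it. A minimal sketch of the idea, with hypothetical names
(idct_rtcd_init and have_altivec are ours, not the real table):

typedef void (*idct_fn)(short *input, short *output, int pitch);

extern void vp9_short_idct4x4_c(short *input, short *output, int pitch);
extern void short_idct4x4_ppc(short *input, short *output, int pitch);

/* Bound once at startup; the portable C routine is the default. */
static idct_fn short_idct4x4 = vp9_short_idct4x4_c;

static void idct_rtcd_init(int have_altivec) {
    if (have_altivec)
        short_idct4x4 = short_idct4x4_ppc;  /* AltiVec path */
}

/* Callers always go through the pointer:
 *     short_idct4x4(coeffs, diff, pitch);
 */
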
--- /dev/null
+++ b/vp9/common/vp9_idct.c
@@ -1,0 +1,1307 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_idct.h"
+
+void vp9_short_iwalsh4x4_c(int16_t *input, int16_t *output, int pitch) {
+  int i;
+  int a1, b1, c1, d1;
+  int16_t *ip = input;
+  int16_t *op = output;
+  const int half_pitch = pitch >> 1;
+
+  for (i = 0; i < 4; i++) {
+    a1 = (ip[0] + ip[3]) >> WHT_UPSCALE_FACTOR;
+    b1 = (ip[1] + ip[2]) >> WHT_UPSCALE_FACTOR;
+    c1 = (ip[1] - ip[2]) >> WHT_UPSCALE_FACTOR;
+    d1 = (ip[0] - ip[3]) >> WHT_UPSCALE_FACTOR;
+
+    op[0] = (a1 + b1 + 1) >> 1;
+    op[1] = (c1 + d1) >> 1;
+    op[2] = (a1 - b1) >> 1;
+    op[3] = (d1 - c1) >> 1;
+
+    ip += 4;
+    op += half_pitch;
+  }
+
+  ip = output;
+  op = output;
+  for (i = 0; i < 4; i++) {
+    a1 = ip[half_pitch * 0] + ip[half_pitch * 3];
+    b1 = ip[half_pitch * 1] + ip[half_pitch * 2];
+    c1 = ip[half_pitch * 1] - ip[half_pitch * 2];
+    d1 = ip[half_pitch * 0] - ip[half_pitch * 3];
+
+
+    op[half_pitch * 0] = (a1 + b1 + 1) >> 1;
+    op[half_pitch * 1] = (c1 + d1) >> 1;
+    op[half_pitch * 2] = (a1 - b1) >> 1;
+    op[half_pitch * 3] = (d1 - c1) >> 1;
+
+    ip++;
+    op++;
+  }
+}
+
+void vp9_short_iwalsh4x4_1_c(int16_t *in, int16_t *out, int pitch) {
+  int i;
+  int16_t tmp[4];
+  int16_t *ip = in;
+  int16_t *op = tmp;
+  const int half_pitch = pitch >> 1;
+
+  op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;
+  op[1] = op[2] = op[3] = (ip[0] >> WHT_UPSCALE_FACTOR) >> 1;
+
+  ip = tmp;
+  op = out;
+  for (i = 0; i < 4; i++) {
+    op[half_pitch * 0] = (ip[0] + 1) >> 1;
+    op[half_pitch * 1] = op[half_pitch * 2] = op[half_pitch * 3] = ip[0] >> 1;
+    ip++;
+    op++;
+  }
+}
+
+void vp9_dc_only_inv_walsh_add_c(int input_dc, uint8_t *pred_ptr,
+                                 uint8_t *dst_ptr,
+                                 int pitch, int stride) {
+  int r, c;
+  int16_t dc = input_dc;
+  int16_t tmp[4 * 4];
+  vp9_short_iwalsh4x4_1_c(&dc, tmp, 4 << 1);
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++)
+      dst_ptr[c] = clip_pixel(tmp[r * 4 + c] + pred_ptr[c]);
+
+    dst_ptr += stride;
+    pred_ptr += pitch;
+  }
+}
+
+void vp9_idct4_1d_c(int16_t *input, int16_t *output) {
+  int16_t step[4];
+  int temp1, temp2;
+  // stage 1
+  temp1 = (input[0] + input[2]) * cospi_16_64;
+  temp2 = (input[0] - input[2]) * cospi_16_64;
+  step[0] = dct_const_round_shift(temp1);
+  step[1] = dct_const_round_shift(temp2);
+  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+  step[2] = dct_const_round_shift(temp1);
+  step[3] = dct_const_round_shift(temp2);
+
+  // stage 2
+  output[0] = step[0] + step[3];
+  output[1] = step[1] + step[2];
+  output[2] = step[1] - step[2];
+  output[3] = step[0] - step[3];
+}
+
+void vp9_short_idct4x4_c(int16_t *input, int16_t *output, int pitch) {
+  int16_t out[4 * 4];
+  int16_t *outptr = out;
+  const int half_pitch = pitch >> 1;
+  int i, j;
+  int16_t temp_in[4], temp_out[4];
+
+  // Rows
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = input[j];
+    vp9_idct4_1d(temp_in, outptr);
+    input += 4;
+    outptr += 4;
+  }
+
+  // Columns
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = out[j * 4 + i];
+    vp9_idct4_1d(temp_in, temp_out);
+    for (j = 0; j < 4; ++j)
+      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
+  }
+}
+
+void vp9_short_idct4x4_1_c(int16_t *input, int16_t *output, int pitch) {
+  int i;
+  int a1;
+  int16_t *op = output;
+  const int half_pitch = pitch >> 1;
+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
+  a1 = ROUND_POWER_OF_TWO(out, 4);
+
+  for (i = 0; i < 4; i++) {
+    op[0] = op[1] = op[2] = op[3] = a1;
+    op += half_pitch;
+  }
+}
+
+void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr,
+                            uint8_t *dst_ptr, int pitch, int stride) {
+  int a1;
+  int r, c;
+  int16_t out = dct_const_round_shift(input_dc * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
+  a1 = ROUND_POWER_OF_TWO(out, 4);
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++)
+      dst_ptr[c] = clip_pixel(a1 + pred_ptr[c]);
+
+    dst_ptr += stride;
+    pred_ptr += pitch;
+  }
+}
+
+static void idct8_1d(int16_t *input, int16_t *output) {
+  int16_t step1[8], step2[8];
+  int temp1, temp2;
+  // stage 1
+  step1[0] = input[0];
+  step1[2] = input[4];
+  step1[1] = input[2];
+  step1[3] = input[6];
+  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+  step1[4] = dct_const_round_shift(temp1);
+  step1[7] = dct_const_round_shift(temp2);
+  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+  step1[5] = dct_const_round_shift(temp1);
+  step1[6] = dct_const_round_shift(temp2);
+
+  // stage 2 & stage 3 - even half
+  vp9_idct4_1d(step1, step1);
+
+  // stage 2 - odd half
+  step2[4] = step1[4] + step1[5];
+  step2[5] = step1[4] - step1[5];
+  step2[6] = -step1[6] + step1[7];
+  step2[7] = step1[6] + step1[7];
+
+  // stage 3 - odd half
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = dct_const_round_shift(temp1);
+  step1[6] = dct_const_round_shift(temp2);
+  step1[7] = step2[7];
+
+  // stage 4
+  output[0] = step1[0] + step1[7];
+  output[1] = step1[1] + step1[6];
+  output[2] = step1[2] + step1[5];
+  output[3] = step1[3] + step1[4];
+  output[4] = step1[3] - step1[4];
+  output[5] = step1[2] - step1[5];
+  output[6] = step1[1] - step1[6];
+  output[7] = step1[0] - step1[7];
+}
+
+void vp9_short_idct8x8_c(int16_t *input, int16_t *output, int pitch) {
+  int16_t out[8 * 8];
+  int16_t *outptr = out;
+  const int half_pitch = pitch >> 1;
+  int i, j;
+  int16_t temp_in[8], temp_out[8];
+
+  // Rows
+  for (i = 0; i < 8; ++i) {
+    idct8_1d(input, outptr);
+    input += 8;
+    outptr += 8;
+  }
+
+  // Columns
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j * 8 + i];
+    idct8_1d(temp_in, temp_out);
+    for (j = 0; j < 8; ++j)
+      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
+  }
+}
+
+static void iadst4_1d(int16_t *input, int16_t *output) {
+  int s0, s1, s2, s3, s4, s5, s6, s7;
+
+  int x0 = input[0];
+  int x1 = input[1];
+  int x2 = input[2];
+  int x3 = input[3];
+
+  if (!(x0 | x1 | x2 | x3)) {
+    output[0] = output[1] = output[2] = output[3] = 0;
+    return;
+  }
+
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_2_9 * x0;
+  s2 = sinpi_3_9 * x1;
+  s3 = sinpi_4_9 * x2;
+  s4 = sinpi_1_9 * x2;
+  s5 = sinpi_2_9 * x3;
+  s6 = sinpi_4_9 * x3;
+  s7 = x0 - x2 + x3;
+
+  x0 = s0 + s3 + s5;
+  x1 = s1 - s4 - s6;
+  x2 = sinpi_3_9 * s7;
+  x3 = s2;
+
+  s0 = x0 + x3;
+  s1 = x1 + x3;
+  s2 = x2;
+  s3 = x0 + x1 - x3;
+
+  // 1-D transform scaling factor is sqrt(2).
+  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+  // + 1b (addition) = 29b.
+  // Hence the output bit depth is 15b.
+  output[0] = dct_const_round_shift(s0);
+  output[1] = dct_const_round_shift(s1);
+  output[2] = dct_const_round_shift(s2);
+  output[3] = dct_const_round_shift(s3);
+}
+
+void vp9_short_iht4x4_c(int16_t *input, int16_t *output,
+                        int pitch, int tx_type) {
+  const transform_2d IHT_4[] = {
+    { vp9_idct4_1d,  vp9_idct4_1d  },  // DCT_DCT  = 0
+    { iadst4_1d, vp9_idct4_1d  },      // ADST_DCT = 1
+    { vp9_idct4_1d,  iadst4_1d },      // DCT_ADST = 2
+    { iadst4_1d, iadst4_1d }           // ADST_ADST = 3
+  };
+
+  int i, j;
+  int16_t out[4 * 4];
+  int16_t *outptr = out;
+  int16_t temp_in[4], temp_out[4];
+
+  // inverse transform row vectors
+  for (i = 0; i < 4; ++i) {
+    IHT_4[tx_type].rows(input, outptr);
+    input  += 4;
+    outptr += 4;
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = out[j * 4 + i];
+    IHT_4[tx_type].cols(temp_in, temp_out);
+    for (j = 0; j < 4; ++j)
+      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
+  }
+}
+
+static void iadst8_1d(int16_t *input, int16_t *output) {
+  int s0, s1, s2, s3, s4, s5, s6, s7;
+
+  int x0 = input[7];
+  int x1 = input[0];
+  int x2 = input[5];
+  int x3 = input[2];
+  int x4 = input[3];
+  int x5 = input[4];
+  int x6 = input[1];
+  int x7 = input[6];
+
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+    output[0] = output[1] = output[2] = output[3] = output[4]
+              = output[5] = output[6] = output[7] = 0;
+    return;
+  }
+
+  // stage 1
+  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
+  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
+  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
+  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
+
+  x0 = dct_const_round_shift(s0 + s4);
+  x1 = dct_const_round_shift(s1 + s5);
+  x2 = dct_const_round_shift(s2 + s6);
+  x3 = dct_const_round_shift(s3 + s7);
+  x4 = dct_const_round_shift(s0 - s4);
+  x5 = dct_const_round_shift(s1 - s5);
+  x6 = dct_const_round_shift(s2 - s6);
+  x7 = dct_const_round_shift(s3 - s7);
+
+  // stage 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
+  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
+  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
+  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
+
+  x0 = s0 + s2;
+  x1 = s1 + s3;
+  x2 = s0 - s2;
+  x3 = s1 - s3;
+  x4 = dct_const_round_shift(s4 + s6);
+  x5 = dct_const_round_shift(s5 + s7);
+  x6 = dct_const_round_shift(s4 - s6);
+  x7 = dct_const_round_shift(s5 - s7);
+
+  // stage 3
+  s2 = cospi_16_64 * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (x6 - x7);
+
+  x2 = dct_const_round_shift(s2);
+  x3 = dct_const_round_shift(s3);
+  x6 = dct_const_round_shift(s6);
+  x7 = dct_const_round_shift(s7);
+
+  output[0] =  x0;
+  output[1] = -x4;
+  output[2] =  x6;
+  output[3] = -x2;
+  output[4] =  x3;
+  output[5] = -x7;
+  output[6] =  x5;
+  output[7] = -x1;
+}
+
+static const transform_2d IHT_8[] = {
+  { idct8_1d,  idct8_1d  },  // DCT_DCT  = 0
+  { iadst8_1d, idct8_1d  },  // ADST_DCT = 1
+  { idct8_1d,  iadst8_1d },  // DCT_ADST = 2
+  { iadst8_1d, iadst8_1d }   // ADST_ADST = 3
+};
+
+void vp9_short_iht8x8_c(int16_t *input, int16_t *output,
+                        int pitch, int tx_type) {
+  int i, j;
+  int16_t out[8 * 8];
+  int16_t *outptr = out;
+  int16_t temp_in[8], temp_out[8];
+  const transform_2d ht = IHT_8[tx_type];
+
+  // inverse transform row vectors
+  for (i = 0; i < 8; ++i) {
+    ht.rows(input, outptr);
+    input += 8;
+    outptr += 8;
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j * 8 + i];
+    ht.cols(temp_in, temp_out);
+    for (j = 0; j < 8; ++j)
+      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
+  }
+}
+
+void vp9_short_idct10_8x8_c(int16_t *input, int16_t *output, int pitch) {
+  int16_t out[8 * 8];
+  int16_t *outptr = out;
+  const int half_pitch = pitch >> 1;
+  int i, j;
+  int16_t temp_in[8], temp_out[8];
+
+  vpx_memset(out, 0, sizeof(out));
+  // First transform rows
+  // only the first 4 rows have non-zero coefs
+  for (i = 0; i < 4; ++i) {
+    idct8_1d(input, outptr);
+    input += 8;
+    outptr += 8;
+  }
+
+  // Then transform columns
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j * 8 + i];
+    idct8_1d(temp_in, temp_out);
+    for (j = 0; j < 8; ++j)
+      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
+  }
+}
+
+void vp9_short_idct1_8x8_c(int16_t *input, int16_t *output) {
+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
+  output[0] = ROUND_POWER_OF_TWO(out, 5);
+}
+
+static void idct16_1d(int16_t *input, int16_t *output) {
+  int16_t step1[16], step2[16];
+  int temp1, temp2;
+
+  // stage 1
+  step1[0] = input[0/2];
+  step1[1] = input[16/2];
+  step1[2] = input[8/2];
+  step1[3] = input[24/2];
+  step1[4] = input[4/2];
+  step1[5] = input[20/2];
+  step1[6] = input[12/2];
+  step1[7] = input[28/2];
+  step1[8] = input[2/2];
+  step1[9] = input[18/2];
+  step1[10] = input[10/2];
+  step1[11] = input[26/2];
+  step1[12] = input[6/2];
+  step1[13] = input[22/2];
+  step1[14] = input[14/2];
+  step1[15] = input[30/2];
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = dct_const_round_shift(temp1);
+  step2[15] = dct_const_round_shift(temp2);
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = dct_const_round_shift(temp1);
+  step2[14] = dct_const_round_shift(temp2);
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = dct_const_round_shift(temp1);
+  step2[13] = dct_const_round_shift(temp2);
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = dct_const_round_shift(temp1);
+  step2[12] = dct_const_round_shift(temp2);
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = dct_const_round_shift(temp1);
+  step1[7] = dct_const_round_shift(temp2);
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = dct_const_round_shift(temp1);
+  step1[6] = dct_const_round_shift(temp2);
+
+  step1[8] = step2[8] + step2[9];
+  step1[9] = step2[8] - step2[9];
+  step1[10] = -step2[10] + step2[11];
+  step1[11] = step2[10] + step2[11];
+  step1[12] = step2[12] + step2[13];
+  step1[13] = step2[12] - step2[13];
+  step1[14] = -step2[14] + step2[15];
+  step1[15] = step2[14] + step2[15];
+
+  // stage 4
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = dct_const_round_shift(temp1);
+  step2[1] = dct_const_round_shift(temp2);
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = dct_const_round_shift(temp1);
+  step2[3] = dct_const_round_shift(temp2);
+  step2[4] = step1[4] + step1[5];
+  step2[5] = step1[4] - step1[5];
+  step2[6] = -step1[6] + step1[7];
+  step2[7] = step1[6] + step1[7];
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = dct_const_round_shift(temp1);
+  step2[14] = dct_const_round_shift(temp2);
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = dct_const_round_shift(temp1);
+  step2[13] = dct_const_round_shift(temp2);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  // stage 5
+  step1[0] = step2[0] + step2[3];
+  step1[1] = step2[1] + step2[2];
+  step1[2] = step2[1] - step2[2];
+  step1[3] = step2[0] - step2[3];
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = dct_const_round_shift(temp1);
+  step1[6] = dct_const_round_shift(temp2);
+  step1[7] = step2[7];
+
+  step1[8] = step2[8] + step2[11];
+  step1[9] = step2[9] + step2[10];
+  step1[10] = step2[9] - step2[10];
+  step1[11] = step2[8] - step2[11];
+  step1[12] = -step2[12] + step2[15];
+  step1[13] = -step2[13] + step2[14];
+  step1[14] = step2[13] + step2[14];
+  step1[15] = step2[12] + step2[15];
+
+  // stage 6
+  step2[0] = step1[0] + step1[7];
+  step2[1] = step1[1] + step1[6];
+  step2[2] = step1[2] + step1[5];
+  step2[3] = step1[3] + step1[4];
+  step2[4] = step1[3] - step1[4];
+  step2[5] = step1[2] - step1[5];
+  step2[6] = step1[1] - step1[6];
+  step2[7] = step1[0] - step1[7];
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = dct_const_round_shift(temp1);
+  step2[13] = dct_const_round_shift(temp2);
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = dct_const_round_shift(temp1);
+  step2[12] = dct_const_round_shift(temp2);
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7
+  output[0] = step2[0] + step2[15];
+  output[1] = step2[1] + step2[14];
+  output[2] = step2[2] + step2[13];
+  output[3] = step2[3] + step2[12];
+  output[4] = step2[4] + step2[11];
+  output[5] = step2[5] + step2[10];
+  output[6] = step2[6] + step2[9];
+  output[7] = step2[7] + step2[8];
+  output[8] = step2[7] - step2[8];
+  output[9] = step2[6] - step2[9];
+  output[10] = step2[5] - step2[10];
+  output[11] = step2[4] - step2[11];
+  output[12] = step2[3] - step2[12];
+  output[13] = step2[2] - step2[13];
+  output[14] = step2[1] - step2[14];
+  output[15] = step2[0] - step2[15];
+}
+
+void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
+  int16_t out[16 * 16];
+  int16_t *outptr = out;
+  const int half_pitch = pitch >> 1;
+  int i, j;
+  int16_t temp_in[16], temp_out[16];
+
+  // First transform rows
+  for (i = 0; i < 16; ++i) {
+    idct16_1d(input, outptr);
+    input += 16;
+    outptr += 16;
+  }
+
+  // Then transform columns
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j * 16 + i];
+    idct16_1d(temp_in, temp_out);
+    for (j = 0; j < 16; ++j)
+      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+  }
+}
+
+void iadst16_1d(int16_t *input, int16_t *output) {
+  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
+
+  int x0 = input[15];
+  int x1 = input[0];
+  int x2 = input[13];
+  int x3 = input[2];
+  int x4 = input[11];
+  int x5 = input[4];
+  int x6 = input[9];
+  int x7 = input[6];
+  int x8 = input[7];
+  int x9 = input[8];
+  int x10 = input[5];
+  int x11 = input[10];
+  int x12 = input[3];
+  int x13 = input[12];
+  int x14 = input[1];
+  int x15 = input[14];
+
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+    output[0] = output[1] = output[2] = output[3] = output[4]
+              = output[5] = output[6] = output[7] = output[8]
+              = output[9] = output[10] = output[11] = output[12]
+              = output[13] = output[14] = output[15] = 0;
+    return;
+  }
+
+  // stage 1
+  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
+  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
+  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
+  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
+  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
+
+  x0 = dct_const_round_shift(s0 + s8);
+  x1 = dct_const_round_shift(s1 + s9);
+  x2 = dct_const_round_shift(s2 + s10);
+  x3 = dct_const_round_shift(s3 + s11);
+  x4 = dct_const_round_shift(s4 + s12);
+  x5 = dct_const_round_shift(s5 + s13);
+  x6 = dct_const_round_shift(s6 + s14);
+  x7 = dct_const_round_shift(s7 + s15);
+  x8  = dct_const_round_shift(s0 - s8);
+  x9  = dct_const_round_shift(s1 - s9);
+  x10 = dct_const_round_shift(s2 - s10);
+  x11 = dct_const_round_shift(s3 - s11);
+  x12 = dct_const_round_shift(s4 - s12);
+  x13 = dct_const_round_shift(s5 - s13);
+  x14 = dct_const_round_shift(s6 - s14);
+  x15 = dct_const_round_shift(s7 - s15);
+
+  // stage 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4;
+  s5 = x5;
+  s6 = x6;
+  s7 = x7;
+  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
+  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
+  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
+  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
+  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
+  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
+  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
+  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
+
+  x0 = s0 + s4;
+  x1 = s1 + s5;
+  x2 = s2 + s6;
+  x3 = s3 + s7;
+  x4 = s0 - s4;
+  x5 = s1 - s5;
+  x6 = s2 - s6;
+  x7 = s3 - s7;
+  x8 = dct_const_round_shift(s8 + s12);
+  x9 = dct_const_round_shift(s9 + s13);
+  x10 = dct_const_round_shift(s10 + s14);
+  x11 = dct_const_round_shift(s11 + s15);
+  x12 = dct_const_round_shift(s8 - s12);
+  x13 = dct_const_round_shift(s9 - s13);
+  x14 = dct_const_round_shift(s10 - s14);
+  x15 = dct_const_round_shift(s11 - s15);
+
+  // stage 3
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
+  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
+  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
+  s8 = x8;
+  s9 = x9;
+  s10 = x10;
+  s11 = x11;
+  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
+  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
+  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
+
+  x0 = s0 + s2;
+  x1 = s1 + s3;
+  x2 = s0 - s2;
+  x3 = s1 - s3;
+  x4 = dct_const_round_shift(s4 + s6);
+  x5 = dct_const_round_shift(s5 + s7);
+  x6 = dct_const_round_shift(s4 - s6);
+  x7 = dct_const_round_shift(s5 - s7);
+  x8 = s8 + s10;
+  x9 = s9 + s11;
+  x10 = s8 - s10;
+  x11 = s9 - s11;
+  x12 = dct_const_round_shift(s12 + s14);
+  x13 = dct_const_round_shift(s13 + s15);
+  x14 = dct_const_round_shift(s12 - s14);
+  x15 = dct_const_round_shift(s13 - s15);
+
+  // stage 4
+  s2 = (- cospi_16_64) * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (- x6 + x7);
+  s10 = cospi_16_64 * (x10 + x11);
+  s11 = cospi_16_64 * (- x10 + x11);
+  s14 = (- cospi_16_64) * (x14 + x15);
+  s15 = cospi_16_64 * (x14 - x15);
+
+  x2 = dct_const_round_shift(s2);
+  x3 = dct_const_round_shift(s3);
+  x6 = dct_const_round_shift(s6);
+  x7 = dct_const_round_shift(s7);
+  x10 = dct_const_round_shift(s10);
+  x11 = dct_const_round_shift(s11);
+  x14 = dct_const_round_shift(s14);
+  x15 = dct_const_round_shift(s15);
+
+  output[0] =  x0;
+  output[1] = -x8;
+  output[2] =  x12;
+  output[3] = -x4;
+  output[4] =  x6;
+  output[5] =  x14;
+  output[6] =  x10;
+  output[7] =  x2;
+  output[8] =  x3;
+  output[9] =  x11;
+  output[10] =  x15;
+  output[11] =  x7;
+  output[12] =  x5;
+  output[13] = -x13;
+  output[14] =  x9;
+  output[15] = -x1;
+}
+
+static const transform_2d IHT_16[] = {
+  { idct16_1d,  idct16_1d  },  // DCT_DCT  = 0
+  { iadst16_1d, idct16_1d  },  // ADST_DCT = 1
+  { idct16_1d,  iadst16_1d },  // DCT_ADST = 2
+  { iadst16_1d, iadst16_1d }   // ADST_ADST = 3
+};
+
+void vp9_short_iht16x16_c(int16_t *input, int16_t *output,
+                          int pitch, int tx_type) {
+  int i, j;
+  int16_t out[16 * 16];
+  int16_t *outptr = out;
+  int16_t temp_in[16], temp_out[16];
+  const transform_2d ht = IHT_16[tx_type];
+
+  // Rows
+  for (i = 0; i < 16; ++i) {
+    ht.rows(input, outptr);
+    input += 16;
+    outptr += 16;
+  }
+
+  // Columns
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j * 16 + i];
+    ht.cols(temp_in, temp_out);
+    for (j = 0; j < 16; ++j)
+      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+  }
+}
+
+void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) {
+    int16_t out[16 * 16];
+    int16_t *outptr = out;
+    const int half_pitch = pitch >> 1;
+    int i, j;
+    int16_t temp_in[16], temp_out[16];
+
+    /* First transform rows. Since all non-zero dct coefficients are in
+     * upper-left 4x4 area, we only need to calculate first 4 rows here.
+     */
+    vpx_memset(out, 0, sizeof(out));
+    for (i = 0; i < 4; ++i) {
+      idct16_1d(input, outptr);
+      input += 16;
+      outptr += 16;
+    }
+
+    // Then transform columns
+    for (i = 0; i < 16; ++i) {
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = out[j*16 + i];
+      idct16_1d(temp_in, temp_out);
+      for (j = 0; j < 16; ++j)
+        output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+    }
+}
+
+
+void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) {
+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
+  output[0] = ROUND_POWER_OF_TWO(out, 6);
+}
+
+static void idct32_1d(int16_t *input, int16_t *output) {
+  int16_t step1[32], step2[32];
+  int temp1, temp2;
+
+  // stage 1
+  step1[0] = input[0];
+  step1[1] = input[16];
+  step1[2] = input[8];
+  step1[3] = input[24];
+  step1[4] = input[4];
+  step1[5] = input[20];
+  step1[6] = input[12];
+  step1[7] = input[28];
+  step1[8] = input[2];
+  step1[9] = input[18];
+  step1[10] = input[10];
+  step1[11] = input[26];
+  step1[12] = input[6];
+  step1[13] = input[22];
+  step1[14] = input[14];
+  step1[15] = input[30];
+
+  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
+  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
+  step1[16] = dct_const_round_shift(temp1);
+  step1[31] = dct_const_round_shift(temp2);
+
+  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
+  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
+  step1[17] = dct_const_round_shift(temp1);
+  step1[30] = dct_const_round_shift(temp2);
+
+  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
+  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
+  step1[18] = dct_const_round_shift(temp1);
+  step1[29] = dct_const_round_shift(temp2);
+
+  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
+  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
+  step1[19] = dct_const_round_shift(temp1);
+  step1[28] = dct_const_round_shift(temp2);
+
+  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
+  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
+  step1[20] = dct_const_round_shift(temp1);
+  step1[27] = dct_const_round_shift(temp2);
+
+  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
+  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
+  step1[21] = dct_const_round_shift(temp1);
+  step1[26] = dct_const_round_shift(temp2);
+
+  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
+  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
+  step1[22] = dct_const_round_shift(temp1);
+  step1[25] = dct_const_round_shift(temp2);
+
+  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
+  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
+  step1[23] = dct_const_round_shift(temp1);
+  step1[24] = dct_const_round_shift(temp2);
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = dct_const_round_shift(temp1);
+  step2[15] = dct_const_round_shift(temp2);
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = dct_const_round_shift(temp1);
+  step2[14] = dct_const_round_shift(temp2);
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = dct_const_round_shift(temp1);
+  step2[13] = dct_const_round_shift(temp2);
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = dct_const_round_shift(temp1);
+  step2[12] = dct_const_round_shift(temp2);
+
+  step2[16] = step1[16] + step1[17];
+  step2[17] = step1[16] - step1[17];
+  step2[18] = -step1[18] + step1[19];
+  step2[19] = step1[18] + step1[19];
+  step2[20] = step1[20] + step1[21];
+  step2[21] = step1[20] - step1[21];
+  step2[22] = -step1[22] + step1[23];
+  step2[23] = step1[22] + step1[23];
+  step2[24] = step1[24] + step1[25];
+  step2[25] = step1[24] - step1[25];
+  step2[26] = -step1[26] + step1[27];
+  step2[27] = step1[26] + step1[27];
+  step2[28] = step1[28] + step1[29];
+  step2[29] = step1[28] - step1[29];
+  step2[30] = -step1[30] + step1[31];
+  step2[31] = step1[30] + step1[31];
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = dct_const_round_shift(temp1);
+  step1[7] = dct_const_round_shift(temp2);
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = dct_const_round_shift(temp1);
+  step1[6] = dct_const_round_shift(temp2);
+
+  step1[8] = step2[8] + step2[9];
+  step1[9] = step2[8] - step2[9];
+  step1[10] = -step2[10] + step2[11];
+  step1[11] = step2[10] + step2[11];
+  step1[12] = step2[12] + step2[13];
+  step1[13] = step2[12] - step2[13];
+  step1[14] = -step2[14] + step2[15];
+  step1[15] = step2[14] + step2[15];
+
+  step1[16] = step2[16];
+  step1[31] = step2[31];
+  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
+  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
+  step1[17] = dct_const_round_shift(temp1);
+  step1[30] = dct_const_round_shift(temp2);
+  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
+  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
+  step1[18] = dct_const_round_shift(temp1);
+  step1[29] = dct_const_round_shift(temp2);
+  step1[19] = step2[19];
+  step1[20] = step2[20];
+  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
+  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
+  step1[21] = dct_const_round_shift(temp1);
+  step1[26] = dct_const_round_shift(temp2);
+  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
+  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
+  step1[22] = dct_const_round_shift(temp1);
+  step1[25] = dct_const_round_shift(temp2);
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[27] = step2[27];
+  step1[28] = step2[28];
+
+  // stage 4
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = dct_const_round_shift(temp1);
+  step2[1] = dct_const_round_shift(temp2);
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = dct_const_round_shift(temp1);
+  step2[3] = dct_const_round_shift(temp2);
+  step2[4] = step1[4] + step1[5];
+  step2[5] = step1[4] - step1[5];
+  step2[6] = -step1[6] + step1[7];
+  step2[7] = step1[6] + step1[7];
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = dct_const_round_shift(temp1);
+  step2[14] = dct_const_round_shift(temp2);
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = dct_const_round_shift(temp1);
+  step2[13] = dct_const_round_shift(temp2);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  step2[16] = step1[16] + step1[19];
+  step2[17] = step1[17] + step1[18];
+  step2[18] = step1[17] - step1[18];
+  step2[19] = step1[16] - step1[19];
+  step2[20] = -step1[20] + step1[23];
+  step2[21] = -step1[21] + step1[22];
+  step2[22] = step1[21] + step1[22];
+  step2[23] = step1[20] + step1[23];
+
+  step2[24] = step1[24] + step1[27];
+  step2[25] = step1[25] + step1[26];
+  step2[26] = step1[25] - step1[26];
+  step2[27] = step1[24] - step1[27];
+  step2[28] = -step1[28] + step1[31];
+  step2[29] = -step1[29] + step1[30];
+  step2[30] = step1[29] + step1[30];
+  step2[31] = step1[28] + step1[31];
+
+  // stage 5
+  step1[0] = step2[0] + step2[3];
+  step1[1] = step2[1] + step2[2];
+  step1[2] = step2[1] - step2[2];
+  step1[3] = step2[0] - step2[3];
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = dct_const_round_shift(temp1);
+  step1[6] = dct_const_round_shift(temp2);
+  step1[7] = step2[7];
+
+  step1[8] = step2[8] + step2[11];
+  step1[9] = step2[9] + step2[10];
+  step1[10] = step2[9] - step2[10];
+  step1[11] = step2[8] - step2[11];
+  step1[12] = -step2[12] + step2[15];
+  step1[13] = -step2[13] + step2[14];
+  step1[14] = step2[13] + step2[14];
+  step1[15] = step2[12] + step2[15];
+
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
+  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
+  step1[18] = dct_const_round_shift(temp1);
+  step1[29] = dct_const_round_shift(temp2);
+  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
+  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
+  step1[19] = dct_const_round_shift(temp1);
+  step1[28] = dct_const_round_shift(temp2);
+  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
+  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
+  step1[20] = dct_const_round_shift(temp1);
+  step1[27] = dct_const_round_shift(temp2);
+  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
+  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
+  step1[21] = dct_const_round_shift(temp1);
+  step1[26] = dct_const_round_shift(temp2);
+  step1[22] = step2[22];
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[25] = step2[25];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // stage 6
+  step2[0] = step1[0] + step1[7];
+  step2[1] = step1[1] + step1[6];
+  step2[2] = step1[2] + step1[5];
+  step2[3] = step1[3] + step1[4];
+  step2[4] = step1[3] - step1[4];
+  step2[5] = step1[2] - step1[5];
+  step2[6] = step1[1] - step1[6];
+  step2[7] = step1[0] - step1[7];
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = dct_const_round_shift(temp1);
+  step2[13] = dct_const_round_shift(temp2);
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = dct_const_round_shift(temp1);
+  step2[12] = dct_const_round_shift(temp2);
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  step2[16] = step1[16] + step1[23];
+  step2[17] = step1[17] + step1[22];
+  step2[18] = step1[18] + step1[21];
+  step2[19] = step1[19] + step1[20];
+  step2[20] = step1[19] - step1[20];
+  step2[21] = step1[18] - step1[21];
+  step2[22] = step1[17] - step1[22];
+  step2[23] = step1[16] - step1[23];
+
+  step2[24] = -step1[24] + step1[31];
+  step2[25] = -step1[25] + step1[30];
+  step2[26] = -step1[26] + step1[29];
+  step2[27] = -step1[27] + step1[28];
+  step2[28] = step1[27] + step1[28];
+  step2[29] = step1[26] + step1[29];
+  step2[30] = step1[25] + step1[30];
+  step2[31] = step1[24] + step1[31];
+
+  // stage 7
+  step1[0] = step2[0] + step2[15];
+  step1[1] = step2[1] + step2[14];
+  step1[2] = step2[2] + step2[13];
+  step1[3] = step2[3] + step2[12];
+  step1[4] = step2[4] + step2[11];
+  step1[5] = step2[5] + step2[10];
+  step1[6] = step2[6] + step2[9];
+  step1[7] = step2[7] + step2[8];
+  step1[8] = step2[7] - step2[8];
+  step1[9] = step2[6] - step2[9];
+  step1[10] = step2[5] - step2[10];
+  step1[11] = step2[4] - step2[11];
+  step1[12] = step2[3] - step2[12];
+  step1[13] = step2[2] - step2[13];
+  step1[14] = step2[1] - step2[14];
+  step1[15] = step2[0] - step2[15];
+
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  step1[18] = step2[18];
+  step1[19] = step2[19];
+  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
+  temp2 = (step2[20] + step2[27]) * cospi_16_64;
+  step1[20] = dct_const_round_shift(temp1);
+  step1[27] = dct_const_round_shift(temp2);
+  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
+  temp2 = (step2[21] + step2[26]) * cospi_16_64;
+  step1[21] = dct_const_round_shift(temp1);
+  step1[26] = dct_const_round_shift(temp2);
+  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
+  temp2 = (step2[22] + step2[25]) * cospi_16_64;
+  step1[22] = dct_const_round_shift(temp1);
+  step1[25] = dct_const_round_shift(temp2);
+  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
+  temp2 = (step2[23] + step2[24]) * cospi_16_64;
+  step1[23] = dct_const_round_shift(temp1);
+  step1[24] = dct_const_round_shift(temp2);
+  step1[28] = step2[28];
+  step1[29] = step2[29];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // final stage
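+  // one last butterfly combines the 16 even-half and 16 odd-half results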
+  output[0] = step1[0] + step1[31];
+  output[1] = step1[1] + step1[30];
+  output[2] = step1[2] + step1[29];
+  output[3] = step1[3] + step1[28];
+  output[4] = step1[4] + step1[27];
+  output[5] = step1[5] + step1[26];
+  output[6] = step1[6] + step1[25];
+  output[7] = step1[7] + step1[24];
+  output[8] = step1[8] + step1[23];
+  output[9] = step1[9] + step1[22];
+  output[10] = step1[10] + step1[21];
+  output[11] = step1[11] + step1[20];
+  output[12] = step1[12] + step1[19];
+  output[13] = step1[13] + step1[18];
+  output[14] = step1[14] + step1[17];
+  output[15] = step1[15] + step1[16];
+  output[16] = step1[15] - step1[16];
+  output[17] = step1[14] - step1[17];
+  output[18] = step1[13] - step1[18];
+  output[19] = step1[12] - step1[19];
+  output[20] = step1[11] - step1[20];
+  output[21] = step1[10] - step1[21];
+  output[22] = step1[9] - step1[22];
+  output[23] = step1[8] - step1[23];
+  output[24] = step1[7] - step1[24];
+  output[25] = step1[6] - step1[25];
+  output[26] = step1[5] - step1[26];
+  output[27] = step1[4] - step1[27];
+  output[28] = step1[3] - step1[28];
+  output[29] = step1[2] - step1[29];
+  output[30] = step1[1] - step1[30];
+  output[31] = step1[0] - step1[31];
+}
+
+void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
+  int16_t out[32 * 32];
+  int16_t *outptr = out;
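+  // pitch is in bytes; output holds int16_t, so half_pitch below is the
+  // stride in elements.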
+  const int half_pitch = pitch >> 1;
+  int i, j;
+  int16_t temp_in[32], temp_out[32];
+
+  // Rows
+  for (i = 0; i < 32; ++i) {
+    idct32_1d(input, outptr);
+    input += 32;
+    outptr += 32;
+  }
+
+  // Columns
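+  // The final rounded shift by 6 compensates for the fixed-point scaling
+  // picked up across the two 1-D passes.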
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = out[j * 32 + i];
+    idct32_1d(temp_in, temp_out);
+    for (j = 0; j < 32; ++j)
+      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+  }
+}
+
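+// DC-only inverse: cospi_16_64 is ~2^14 * cos(pi/4), so the two rounded
+// multiplies halve input[0] -- the same net gain the full row and column
+// passes apply to a DC-only block.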
+void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) {
+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
+  output[0] = ROUND_POWER_OF_TWO(out, 6);
+}
+
+void vp9_short_idct10_32x32_c(int16_t *input, int16_t *output, int pitch) {
+  int16_t out[32 * 32];
+  int16_t *outptr = out;
+  const int half_pitch = pitch >> 1;
+  int i, j;
+  int16_t temp_in[32], temp_out[32];
+
+  /* First transform rows. Since all non-zero dct coefficients are in the
+   * upper-left 4x4 area, we only need to calculate the first 4 rows here.
+   */
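+  // Rows 4..31 are never written by the row pass, so clear the whole
+  // intermediate buffer first.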
+  vpx_memset(out, 0, sizeof(out));
+  for (i = 0; i < 4; ++i) {
+    idct32_1d(input, outptr);
+    input += 32;
+    outptr += 32;
+  }
+
+  // Columns
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = out[j * 32 + i];
+    idct32_1d(temp_in, temp_out);
+    for (j = 0; j < 32; ++j)
+      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+  }
+}
--- a/vp9/common/vp9_idctllm.c
+++ /dev/null
@@ -1,1321 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
- * Notes:
- *
- * This implementation makes use of 16 bit fixed point versions of two
- * multiply constants:
- *         1.   sqrt(2) * cos (pi/8)
- *         2.   sqrt(2) * sin (pi/8)
- * Because the first constant is bigger than 1, to maintain the same 16 bit
- * fixed point precision as the second one, we use a trick of
- *         x * a = x + x*(a-1)
- * so
- *         x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) * cos(pi/8) - 1).
- **************************************************************************/
-#include <assert.h>
-#include <math.h>
-
-#include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_systemdependent.h"
-#include "vp9/common/vp9_blockd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_idct.h"
-
-void vp9_short_inv_walsh4x4_x8_c(int16_t *input, int16_t *output, int pitch) {
-  int i;
-  int a1, b1, c1, d1;
-  int16_t *ip = input;
-  int16_t *op = output;
-  const int half_pitch = pitch >> 1;
-
-  for (i = 0; i < 4; i++) {
-    a1 = (ip[0] + ip[3]) >> WHT_UPSCALE_FACTOR;
-    b1 = (ip[1] + ip[2]) >> WHT_UPSCALE_FACTOR;
-    c1 = (ip[1] - ip[2]) >> WHT_UPSCALE_FACTOR;
-    d1 = (ip[0] - ip[3]) >> WHT_UPSCALE_FACTOR;
-
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[1] = (c1 + d1) >> 1;
-    op[2] = (a1 - b1) >> 1;
-    op[3] = (d1 - c1) >> 1;
-
-    ip += 4;
-    op += half_pitch;
-  }
-
-  ip = output;
-  op = output;
-  for (i = 0; i < 4; i++) {
-    a1 = ip[half_pitch * 0] + ip[half_pitch * 3];
-    b1 = ip[half_pitch * 1] + ip[half_pitch * 2];
-    c1 = ip[half_pitch * 1] - ip[half_pitch * 2];
-    d1 = ip[half_pitch * 0] - ip[half_pitch * 3];
-
-    op[half_pitch * 0] = (a1 + b1 + 1) >> 1;
-    op[half_pitch * 1] = (c1 + d1) >> 1;
-    op[half_pitch * 2] = (a1 - b1) >> 1;
-    op[half_pitch * 3] = (d1 - c1) >> 1;
-
-    ip++;
-    op++;
-  }
-}
-
-void vp9_short_inv_walsh4x4_1_x8_c(int16_t *in, int16_t *out, int pitch) {
-  int i;
-  int16_t tmp[4];
-  int16_t *ip = in;
-  int16_t *op = tmp;
-  const int half_pitch = pitch >> 1;
-
-  op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;
-  op[1] = op[2] = op[3] = (ip[0] >> WHT_UPSCALE_FACTOR) >> 1;
-
-  ip = tmp;
-  op = out;
-  for (i = 0; i < 4; i++) {
-    op[half_pitch * 0] = (ip[0] + 1) >> 1;
-    op[half_pitch * 1] = op[half_pitch * 2] = op[half_pitch * 3] = ip[0] >> 1;
-    ip++;
-    op++;
-  }
-}
-
-void vp9_dc_only_inv_walsh_add_c(int input_dc, uint8_t *pred_ptr,
-                                 uint8_t *dst_ptr,
-                                 int pitch, int stride) {
-  int r, c;
-  int16_t dc = input_dc;
-  int16_t tmp[4 * 4];
-  vp9_short_inv_walsh4x4_1_x8_c(&dc, tmp, 4 << 1);
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++)
-      dst_ptr[c] = clip_pixel(tmp[r * 4 + c] + pred_ptr[c]);
-
-    dst_ptr += stride;
-    pred_ptr += pitch;
-  }
-}
-
-void vp9_idct4_1d_c(int16_t *input, int16_t *output) {
-  int16_t step[4];
-  int temp1, temp2;
-  // stage 1
-  temp1 = (input[0] + input[2]) * cospi_16_64;
-  temp2 = (input[0] - input[2]) * cospi_16_64;
-  step[0] = dct_const_round_shift(temp1);
-  step[1] = dct_const_round_shift(temp2);
-  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
-  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-  step[2] = dct_const_round_shift(temp1);
-  step[3] = dct_const_round_shift(temp2);
-
-  // stage 2
-  output[0] = step[0] + step[3];
-  output[1] = step[1] + step[2];
-  output[2] = step[1] - step[2];
-  output[3] = step[0] - step[3];
-}
-
-void vp9_short_idct4x4llm_c(int16_t *input, int16_t *output, int pitch) {
-  int16_t out[4 * 4];
-  int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
-  int i, j;
-  int16_t temp_in[4], temp_out[4];
-
-  // Rows
-  for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j)
-      temp_in[j] = input[j];
-    vp9_idct4_1d(temp_in, outptr);
-    input += 4;
-    outptr += 4;
-  }
-
-  // Columns
-  for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j)
-      temp_in[j] = out[j * 4 + i];
-    vp9_idct4_1d(temp_in, temp_out);
-    for (j = 0; j < 4; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
-  }
-}
-
-void vp9_short_idct4x4llm_1_c(int16_t *input, int16_t *output, int pitch) {
-  int i;
-  int a1;
-  int16_t *op = output;
-  const int half_pitch = pitch >> 1;
-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
-  out = dct_const_round_shift(out * cospi_16_64);
-  a1 = ROUND_POWER_OF_TWO(out, 4);
-
-  for (i = 0; i < 4; i++) {
-    op[0] = op[1] = op[2] = op[3] = a1;
-    op += half_pitch;
-  }
-}
-
-void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr,
-                            uint8_t *dst_ptr, int pitch, int stride) {
-  int a1;
-  int r, c;
-  int16_t out = dct_const_round_shift(input_dc * cospi_16_64);
-  out = dct_const_round_shift(out * cospi_16_64);
-  a1 = ROUND_POWER_OF_TWO(out, 4);
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++)
-      dst_ptr[c] = clip_pixel(a1 + pred_ptr[c]);
-
-    dst_ptr += stride;
-    pred_ptr += pitch;
-  }
-}
-
-static void idct8_1d(int16_t *input, int16_t *output) {
-  int16_t step1[8], step2[8];
-  int temp1, temp2;
-  // stage 1
-  step1[0] = input[0];
-  step1[2] = input[4];
-  step1[1] = input[2];
-  step1[3] = input[6];
-  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
-  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-  step1[4] = dct_const_round_shift(temp1);
-  step1[7] = dct_const_round_shift(temp2);
-  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
-  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-  step1[5] = dct_const_round_shift(temp1);
-  step1[6] = dct_const_round_shift(temp2);
-
-  // stage 2 & stage 3 - even half
-  vp9_idct4_1d(step1, step1);
-
-  // stage 2 - odd half
-  step2[4] = step1[4] + step1[5];
-  step2[5] = step1[4] - step1[5];
-  step2[6] = -step1[6] + step1[7];
-  step2[7] = step1[6] + step1[7];
-
-  // stage 3 - odd half
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = dct_const_round_shift(temp1);
-  step1[6] = dct_const_round_shift(temp2);
-  step1[7] = step2[7];
-
-  // stage 4
-  output[0] = step1[0] + step1[7];
-  output[1] = step1[1] + step1[6];
-  output[2] = step1[2] + step1[5];
-  output[3] = step1[3] + step1[4];
-  output[4] = step1[3] - step1[4];
-  output[5] = step1[2] - step1[5];
-  output[6] = step1[1] - step1[6];
-  output[7] = step1[0] - step1[7];
-}
-
-void vp9_short_idct8x8_c(int16_t *input, int16_t *output, int pitch) {
-  int16_t out[8 * 8];
-  int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
-  int i, j;
-  int16_t temp_in[8], temp_out[8];
-
-  // Rows
-  for (i = 0; i < 8; ++i) {
-    idct8_1d(input, outptr);
-    input += 8;
-    outptr += 8;
-  }
-
-  // Columns
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j)
-      temp_in[j] = out[j * 8 + i];
-    idct8_1d(temp_in, temp_out);
-    for (j = 0; j < 8; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
-  }
-}
-
-static void iadst4_1d(int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7;
-
-  int x0 = input[0];
-  int x1 = input[1];
-  int x2 = input[2];
-  int x3 = input[3];
-
-  if (!(x0 | x1 | x2 | x3)) {
-    output[0] = output[1] = output[2] = output[3] = 0;
-    return;
-  }
-
-  s0 = sinpi_1_9 * x0;
-  s1 = sinpi_2_9 * x0;
-  s2 = sinpi_3_9 * x1;
-  s3 = sinpi_4_9 * x2;
-  s4 = sinpi_1_9 * x2;
-  s5 = sinpi_2_9 * x3;
-  s6 = sinpi_4_9 * x3;
-  s7 = x0 - x2 + x3;
-
-  x0 = s0 + s3 + s5;
-  x1 = s1 - s4 - s6;
-  x2 = sinpi_3_9 * s7;
-  x3 = s2;
-
-  s0 = x0 + x3;
-  s1 = x1 + x3;
-  s2 = x2;
-  s3 = x0 + x1 - x3;
-
-  // 1-D transform scaling factor is sqrt(2).
-  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
-  // + 1b (addition) = 29b.
-  // Hence the output bit depth is 15b.
-  output[0] = dct_const_round_shift(s0);
-  output[1] = dct_const_round_shift(s1);
-  output[2] = dct_const_round_shift(s2);
-  output[3] = dct_const_round_shift(s3);
-}
-
-void vp9_short_iht4x4_c(int16_t *input, int16_t *output,
-                        int pitch, int tx_type) {
-  const transform_2d IHT_4[] = {
-    { vp9_idct4_1d, vp9_idct4_1d },  // DCT_DCT  = 0
-    { iadst4_1d,    vp9_idct4_1d },  // ADST_DCT = 1
-    { vp9_idct4_1d, iadst4_1d },     // DCT_ADST = 2
-    { iadst4_1d,    iadst4_1d }      // ADST_ADST = 3
-  };
-
-  int i, j;
-  int16_t out[4 * 4];
-  int16_t *outptr = out;
-  int16_t temp_in[4], temp_out[4];
-
-  // inverse transform row vectors
-  for (i = 0; i < 4; ++i) {
-    IHT_4[tx_type].rows(input, outptr);
-    input  += 4;
-    outptr += 4;
-  }
-
-  // inverse transform column vectors
-  for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j)
-      temp_in[j] = out[j * 4 + i];
-    IHT_4[tx_type].cols(temp_in, temp_out);
-    for (j = 0; j < 4; ++j)
-      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
-  }
-}
-
-static void iadst8_1d(int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7;
-
-  int x0 = input[7];
-  int x1 = input[0];
-  int x2 = input[5];
-  int x3 = input[2];
-  int x4 = input[3];
-  int x5 = input[4];
-  int x6 = input[1];
-  int x7 = input[6];
-
-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
-    output[0] = output[1] = output[2] = output[3] = output[4]
-              = output[5] = output[6] = output[7] = 0;
-    return;
-  }
-
-  // stage 1
-  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
-  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
-  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
-  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
-  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
-  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
-  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
-  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
-
-  x0 = dct_const_round_shift(s0 + s4);
-  x1 = dct_const_round_shift(s1 + s5);
-  x2 = dct_const_round_shift(s2 + s6);
-  x3 = dct_const_round_shift(s3 + s7);
-  x4 = dct_const_round_shift(s0 - s4);
-  x5 = dct_const_round_shift(s1 - s5);
-  x6 = dct_const_round_shift(s2 - s6);
-  x7 = dct_const_round_shift(s3 - s7);
-
-  // stage 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
-  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
-  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
-  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
-
-  x0 = s0 + s2;
-  x1 = s1 + s3;
-  x2 = s0 - s2;
-  x3 = s1 - s3;
-  x4 = dct_const_round_shift(s4 + s6);
-  x5 = dct_const_round_shift(s5 + s7);
-  x6 = dct_const_round_shift(s4 - s6);
-  x7 = dct_const_round_shift(s5 - s7);
-
-  // stage 3
-  s2 = cospi_16_64 * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (x6 - x7);
-
-  x2 = dct_const_round_shift(s2);
-  x3 = dct_const_round_shift(s3);
-  x6 = dct_const_round_shift(s6);
-  x7 = dct_const_round_shift(s7);
-
-  output[0] =  x0;
-  output[1] = -x4;
-  output[2] =  x6;
-  output[3] = -x2;
-  output[4] =  x3;
-  output[5] = -x7;
-  output[6] =  x5;
-  output[7] = -x1;
-}
-
-static const transform_2d IHT_8[] = {
-  { idct8_1d,  idct8_1d  },  // DCT_DCT  = 0
-  { iadst8_1d, idct8_1d  },  // ADST_DCT = 1
-  { idct8_1d,  iadst8_1d },  // DCT_ADST = 2
-  { iadst8_1d, iadst8_1d }   // ADST_ADST = 3
-};
-
-void vp9_short_iht8x8_c(int16_t *input, int16_t *output,
-                        int pitch, int tx_type) {
-  int i, j;
-  int16_t out[8 * 8];
-  int16_t *outptr = out;
-  int16_t temp_in[8], temp_out[8];
-  const transform_2d ht = IHT_8[tx_type];
-
-  // inverse transform row vectors
-  for (i = 0; i < 8; ++i) {
-    ht.rows(input, outptr);
-    input += 8;
-    outptr += 8;
-  }
-
-  // inverse transform column vectors
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j)
-      temp_in[j] = out[j * 8 + i];
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < 8; ++j)
-      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
-  }
-}
-
-void vp9_short_idct10_8x8_c(int16_t *input, int16_t *output, int pitch) {
-  int16_t out[8 * 8];
-  int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
-  int i, j;
-  int16_t temp_in[8], temp_out[8];
-
-  vpx_memset(out, 0, sizeof(out));
-  // First transform rows:
-  // only the first 4 rows have non-zero coeffs
-  for (i = 0; i < 4; ++i) {
-    idct8_1d(input, outptr);
-    input += 8;
-    outptr += 8;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j)
-      temp_in[j] = out[j * 8 + i];
-    idct8_1d(temp_in, temp_out);
-    for (j = 0; j < 8; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
-  }
-}
-
-void vp9_short_idct1_8x8_c(int16_t *input, int16_t *output) {
-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
-  out = dct_const_round_shift(out * cospi_16_64);
-  output[0] = ROUND_POWER_OF_TWO(out, 5);
-}
-
-static void idct16_1d(int16_t *input, int16_t *output) {
-  int16_t step1[16], step2[16];
-  int temp1, temp2;
-
-  // stage 1
-  step1[0] = input[0/2];
-  step1[1] = input[16/2];
-  step1[2] = input[8/2];
-  step1[3] = input[24/2];
-  step1[4] = input[4/2];
-  step1[5] = input[20/2];
-  step1[6] = input[12/2];
-  step1[7] = input[28/2];
-  step1[8] = input[2/2];
-  step1[9] = input[18/2];
-  step1[10] = input[10/2];
-  step1[11] = input[26/2];
-  step1[12] = input[6/2];
-  step1[13] = input[22/2];
-  step1[14] = input[14/2];
-  step1[15] = input[30/2];
-
-  // stage 2
-  step2[0] = step1[0];
-  step2[1] = step1[1];
-  step2[2] = step1[2];
-  step2[3] = step1[3];
-  step2[4] = step1[4];
-  step2[5] = step1[5];
-  step2[6] = step1[6];
-  step2[7] = step1[7];
-
-  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
-  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = dct_const_round_shift(temp1);
-  step2[15] = dct_const_round_shift(temp2);
-
-  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
-  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = dct_const_round_shift(temp1);
-  step2[14] = dct_const_round_shift(temp2);
-
-  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
-  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = dct_const_round_shift(temp1);
-  step2[13] = dct_const_round_shift(temp2);
-
-  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
-  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = dct_const_round_shift(temp1);
-  step2[12] = dct_const_round_shift(temp2);
-
-  // stage 3
-  step1[0] = step2[0];
-  step1[1] = step2[1];
-  step1[2] = step2[2];
-  step1[3] = step2[3];
-
-  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
-  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = dct_const_round_shift(temp1);
-  step1[7] = dct_const_round_shift(temp2);
-  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
-  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = dct_const_round_shift(temp1);
-  step1[6] = dct_const_round_shift(temp2);
-
-  step1[8] = step2[8] + step2[9];
-  step1[9] = step2[8] - step2[9];
-  step1[10] = -step2[10] + step2[11];
-  step1[11] = step2[10] + step2[11];
-  step1[12] = step2[12] + step2[13];
-  step1[13] = step2[12] - step2[13];
-  step1[14] = -step2[14] + step2[15];
-  step1[15] = step2[14] + step2[15];
-
-  // stage 4
-  temp1 = (step1[0] + step1[1]) * cospi_16_64;
-  temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = dct_const_round_shift(temp1);
-  step2[1] = dct_const_round_shift(temp2);
-  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
-  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = dct_const_round_shift(temp1);
-  step2[3] = dct_const_round_shift(temp2);
-  step2[4] = step1[4] + step1[5];
-  step2[5] = step1[4] - step1[5];
-  step2[6] = -step1[6] + step1[7];
-  step2[7] = step1[6] + step1[7];
-
-  step2[8] = step1[8];
-  step2[15] = step1[15];
-  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
-  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = dct_const_round_shift(temp1);
-  step2[14] = dct_const_round_shift(temp2);
-  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
-  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = dct_const_round_shift(temp1);
-  step2[13] = dct_const_round_shift(temp2);
-  step2[11] = step1[11];
-  step2[12] = step1[12];
-
-  // stage 5
-  step1[0] = step2[0] + step2[3];
-  step1[1] = step2[1] + step2[2];
-  step1[2] = step2[1] - step2[2];
-  step1[3] = step2[0] - step2[3];
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = dct_const_round_shift(temp1);
-  step1[6] = dct_const_round_shift(temp2);
-  step1[7] = step2[7];
-
-  step1[8] = step2[8] + step2[11];
-  step1[9] = step2[9] + step2[10];
-  step1[10] = step2[9] - step2[10];
-  step1[11] = step2[8] - step2[11];
-  step1[12] = -step2[12] + step2[15];
-  step1[13] = -step2[13] + step2[14];
-  step1[14] = step2[13] + step2[14];
-  step1[15] = step2[12] + step2[15];
-
-  // stage 6
-  step2[0] = step1[0] + step1[7];
-  step2[1] = step1[1] + step1[6];
-  step2[2] = step1[2] + step1[5];
-  step2[3] = step1[3] + step1[4];
-  step2[4] = step1[3] - step1[4];
-  step2[5] = step1[2] - step1[5];
-  step2[6] = step1[1] - step1[6];
-  step2[7] = step1[0] - step1[7];
-  step2[8] = step1[8];
-  step2[9] = step1[9];
-  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
-  temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = dct_const_round_shift(temp1);
-  step2[13] = dct_const_round_shift(temp2);
-  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
-  temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = dct_const_round_shift(temp1);
-  step2[12] = dct_const_round_shift(temp2);
-  step2[14] = step1[14];
-  step2[15] = step1[15];
-
-  // stage 7
-  output[0] = step2[0] + step2[15];
-  output[1] = step2[1] + step2[14];
-  output[2] = step2[2] + step2[13];
-  output[3] = step2[3] + step2[12];
-  output[4] = step2[4] + step2[11];
-  output[5] = step2[5] + step2[10];
-  output[6] = step2[6] + step2[9];
-  output[7] = step2[7] + step2[8];
-  output[8] = step2[7] - step2[8];
-  output[9] = step2[6] - step2[9];
-  output[10] = step2[5] - step2[10];
-  output[11] = step2[4] - step2[11];
-  output[12] = step2[3] - step2[12];
-  output[13] = step2[2] - step2[13];
-  output[14] = step2[1] - step2[14];
-  output[15] = step2[0] - step2[15];
-}
-
-void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
-  int16_t out[16 * 16];
-  int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
-  int i, j;
-  int16_t temp_in[16], temp_out[16];
-
-  // First transform rows
-  for (i = 0; i < 16; ++i) {
-    idct16_1d(input, outptr);
-    input += 16;
-    outptr += 16;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = out[j * 16 + i];
-    idct16_1d(temp_in, temp_out);
-    for (j = 0; j < 16; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
-  }
-}
-
-void iadst16_1d(int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
-
-  int x0 = input[15];
-  int x1 = input[0];
-  int x2 = input[13];
-  int x3 = input[2];
-  int x4 = input[11];
-  int x5 = input[4];
-  int x6 = input[9];
-  int x7 = input[6];
-  int x8 = input[7];
-  int x9 = input[8];
-  int x10 = input[5];
-  int x11 = input[10];
-  int x12 = input[3];
-  int x13 = input[12];
-  int x14 = input[1];
-  int x15 = input[14];
-
-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
-           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
-    output[0] = output[1] = output[2] = output[3] = output[4]
-              = output[5] = output[6] = output[7] = output[8]
-              = output[9] = output[10] = output[11] = output[12]
-              = output[13] = output[14] = output[15] = 0;
-    return;
-  }
-
-  // stage 1
-  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
-  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
-  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
-  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
-  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
-  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
-  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
-  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
-  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
-  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
-  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
-  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
-  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
-  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
-  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
-  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
-
-  x0 = dct_const_round_shift(s0 + s8);
-  x1 = dct_const_round_shift(s1 + s9);
-  x2 = dct_const_round_shift(s2 + s10);
-  x3 = dct_const_round_shift(s3 + s11);
-  x4 = dct_const_round_shift(s4 + s12);
-  x5 = dct_const_round_shift(s5 + s13);
-  x6 = dct_const_round_shift(s6 + s14);
-  x7 = dct_const_round_shift(s7 + s15);
-  x8  = dct_const_round_shift(s0 - s8);
-  x9  = dct_const_round_shift(s1 - s9);
-  x10 = dct_const_round_shift(s2 - s10);
-  x11 = dct_const_round_shift(s3 - s11);
-  x12 = dct_const_round_shift(s4 - s12);
-  x13 = dct_const_round_shift(s5 - s13);
-  x14 = dct_const_round_shift(s6 - s14);
-  x15 = dct_const_round_shift(s7 - s15);
-
-  // stage 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4;
-  s5 = x5;
-  s6 = x6;
-  s7 = x7;
-  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
-  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
-  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
-  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
-  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
-  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
-  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
-  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
-
-  x0 = s0 + s4;
-  x1 = s1 + s5;
-  x2 = s2 + s6;
-  x3 = s3 + s7;
-  x4 = s0 - s4;
-  x5 = s1 - s5;
-  x6 = s2 - s6;
-  x7 = s3 - s7;
-  x8 = dct_const_round_shift(s8 + s12);
-  x9 = dct_const_round_shift(s9 + s13);
-  x10 = dct_const_round_shift(s10 + s14);
-  x11 = dct_const_round_shift(s11 + s15);
-  x12 = dct_const_round_shift(s8 - s12);
-  x13 = dct_const_round_shift(s9 - s13);
-  x14 = dct_const_round_shift(s10 - s14);
-  x15 = dct_const_round_shift(s11 - s15);
-
-  // stage 3
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
-  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
-  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
-  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
-  s8 = x8;
-  s9 = x9;
-  s10 = x10;
-  s11 = x11;
-  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
-  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
-  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
-  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
-
-  x0 = s0 + s2;
-  x1 = s1 + s3;
-  x2 = s0 - s2;
-  x3 = s1 - s3;
-  x4 = dct_const_round_shift(s4 + s6);
-  x5 = dct_const_round_shift(s5 + s7);
-  x6 = dct_const_round_shift(s4 - s6);
-  x7 = dct_const_round_shift(s5 - s7);
-  x8 = s8 + s10;
-  x9 = s9 + s11;
-  x10 = s8 - s10;
-  x11 = s9 - s11;
-  x12 = dct_const_round_shift(s12 + s14);
-  x13 = dct_const_round_shift(s13 + s15);
-  x14 = dct_const_round_shift(s12 - s14);
-  x15 = dct_const_round_shift(s13 - s15);
-
-  // stage 4
-  s2 = (- cospi_16_64) * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (- x6 + x7);
-  s10 = cospi_16_64 * (x10 + x11);
-  s11 = cospi_16_64 * (- x10 + x11);
-  s14 = (- cospi_16_64) * (x14 + x15);
-  s15 = cospi_16_64 * (x14 - x15);
-
-  x2 = dct_const_round_shift(s2);
-  x3 = dct_const_round_shift(s3);
-  x6 = dct_const_round_shift(s6);
-  x7 = dct_const_round_shift(s7);
-  x10 = dct_const_round_shift(s10);
-  x11 = dct_const_round_shift(s11);
-  x14 = dct_const_round_shift(s14);
-  x15 = dct_const_round_shift(s15);
-
-  output[0] =  x0;
-  output[1] = -x8;
-  output[2] =  x12;
-  output[3] = -x4;
-  output[4] =  x6;
-  output[5] =  x14;
-  output[6] =  x10;
-  output[7] =  x2;
-  output[8] =  x3;
-  output[9] =  x11;
-  output[10] =  x15;
-  output[11] =  x7;
-  output[12] =  x5;
-  output[13] = -x13;
-  output[14] =  x9;
-  output[15] = -x1;
-}
-
-static const transform_2d IHT_16[] = {
-  { idct16_1d,  idct16_1d  },  // DCT_DCT  = 0
-  { iadst16_1d, idct16_1d  },  // ADST_DCT = 1
-  { idct16_1d,  iadst16_1d },  // DCT_ADST = 2
-  { iadst16_1d, iadst16_1d }   // ADST_ADST = 3
-};
-
-void vp9_short_iht16x16_c(int16_t *input, int16_t *output,
-                          int pitch, int tx_type) {
-  int i, j;
-  int16_t out[16 * 16];
-  int16_t *outptr = out;
-  int16_t temp_in[16], temp_out[16];
-  const transform_2d ht = IHT_16[tx_type];
-
-  // Rows
-  for (i = 0; i < 16; ++i) {
-    ht.rows(input, outptr);
-    input += 16;
-    outptr += 16;
-  }
-
-  // Columns
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = out[j * 16 + i];
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < 16; ++j)
-      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
-  }
-}
-
-void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) {
-  int16_t out[16 * 16];
-  int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
-  int i, j;
-  int16_t temp_in[16], temp_out[16];
-
-  /* First transform rows. Since all non-zero dct coefficients are in the
-   * upper-left 4x4 area, we only need to calculate the first 4 rows here.
-   */
-  vpx_memset(out, 0, sizeof(out));
-  for (i = 0; i < 4; ++i) {
-    idct16_1d(input, outptr);
-    input += 16;
-    outptr += 16;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = out[j * 16 + i];
-    idct16_1d(temp_in, temp_out);
-    for (j = 0; j < 16; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
-  }
-}
-
-void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) {
-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
-  out = dct_const_round_shift(out * cospi_16_64);
-  output[0] = ROUND_POWER_OF_TWO(out, 6);
-}
-
-static void idct32_1d(int16_t *input, int16_t *output) {
-  int16_t step1[32], step2[32];
-  int temp1, temp2;
-
-  // stage 1
-  step1[0] = input[0];
-  step1[1] = input[16];
-  step1[2] = input[8];
-  step1[3] = input[24];
-  step1[4] = input[4];
-  step1[5] = input[20];
-  step1[6] = input[12];
-  step1[7] = input[28];
-  step1[8] = input[2];
-  step1[9] = input[18];
-  step1[10] = input[10];
-  step1[11] = input[26];
-  step1[12] = input[6];
-  step1[13] = input[22];
-  step1[14] = input[14];
-  step1[15] = input[30];
-
-  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
-  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
-  step1[16] = dct_const_round_shift(temp1);
-  step1[31] = dct_const_round_shift(temp2);
-
-  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
-  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
-  step1[17] = dct_const_round_shift(temp1);
-  step1[30] = dct_const_round_shift(temp2);
-
-  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
-  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
-  step1[18] = dct_const_round_shift(temp1);
-  step1[29] = dct_const_round_shift(temp2);
-
-  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
-  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
-  step1[19] = dct_const_round_shift(temp1);
-  step1[28] = dct_const_round_shift(temp2);
-
-  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
-  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
-  step1[20] = dct_const_round_shift(temp1);
-  step1[27] = dct_const_round_shift(temp2);
-
-  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
-  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
-  step1[21] = dct_const_round_shift(temp1);
-  step1[26] = dct_const_round_shift(temp2);
-
-  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
-  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
-  step1[22] = dct_const_round_shift(temp1);
-  step1[25] = dct_const_round_shift(temp2);
-
-  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
-  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
-  step1[23] = dct_const_round_shift(temp1);
-  step1[24] = dct_const_round_shift(temp2);
-
-  // stage 2
-  step2[0] = step1[0];
-  step2[1] = step1[1];
-  step2[2] = step1[2];
-  step2[3] = step1[3];
-  step2[4] = step1[4];
-  step2[5] = step1[5];
-  step2[6] = step1[6];
-  step2[7] = step1[7];
-
-  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
-  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = dct_const_round_shift(temp1);
-  step2[15] = dct_const_round_shift(temp2);
-
-  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
-  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = dct_const_round_shift(temp1);
-  step2[14] = dct_const_round_shift(temp2);
-
-  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
-  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = dct_const_round_shift(temp1);
-  step2[13] = dct_const_round_shift(temp2);
-
-  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
-  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = dct_const_round_shift(temp1);
-  step2[12] = dct_const_round_shift(temp2);
-
-  step2[16] = step1[16] + step1[17];
-  step2[17] = step1[16] - step1[17];
-  step2[18] = -step1[18] + step1[19];
-  step2[19] = step1[18] + step1[19];
-  step2[20] = step1[20] + step1[21];
-  step2[21] = step1[20] - step1[21];
-  step2[22] = -step1[22] + step1[23];
-  step2[23] = step1[22] + step1[23];
-  step2[24] = step1[24] + step1[25];
-  step2[25] = step1[24] - step1[25];
-  step2[26] = -step1[26] + step1[27];
-  step2[27] = step1[26] + step1[27];
-  step2[28] = step1[28] + step1[29];
-  step2[29] = step1[28] - step1[29];
-  step2[30] = -step1[30] + step1[31];
-  step2[31] = step1[30] + step1[31];
-
-  // stage 3
-  step1[0] = step2[0];
-  step1[1] = step2[1];
-  step1[2] = step2[2];
-  step1[3] = step2[3];
-
-  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
-  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = dct_const_round_shift(temp1);
-  step1[7] = dct_const_round_shift(temp2);
-  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
-  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = dct_const_round_shift(temp1);
-  step1[6] = dct_const_round_shift(temp2);
-
-  step1[8] = step2[8] + step2[9];
-  step1[9] = step2[8] - step2[9];
-  step1[10] = -step2[10] + step2[11];
-  step1[11] = step2[10] + step2[11];
-  step1[12] = step2[12] + step2[13];
-  step1[13] = step2[12] - step2[13];
-  step1[14] = -step2[14] + step2[15];
-  step1[15] = step2[14] + step2[15];
-
-  step1[16] = step2[16];
-  step1[31] = step2[31];
-  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
-  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
-  step1[17] = dct_const_round_shift(temp1);
-  step1[30] = dct_const_round_shift(temp2);
-  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
-  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
-  step1[18] = dct_const_round_shift(temp1);
-  step1[29] = dct_const_round_shift(temp2);
-  step1[19] = step2[19];
-  step1[20] = step2[20];
-  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
-  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
-  step1[21] = dct_const_round_shift(temp1);
-  step1[26] = dct_const_round_shift(temp2);
-  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
-  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
-  step1[22] = dct_const_round_shift(temp1);
-  step1[25] = dct_const_round_shift(temp2);
-  step1[23] = step2[23];
-  step1[24] = step2[24];
-  step1[27] = step2[27];
-  step1[28] = step2[28];
-
-  // stage 4
-  temp1 = (step1[0] + step1[1]) * cospi_16_64;
-  temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = dct_const_round_shift(temp1);
-  step2[1] = dct_const_round_shift(temp2);
-  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
-  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = dct_const_round_shift(temp1);
-  step2[3] = dct_const_round_shift(temp2);
-  step2[4] = step1[4] + step1[5];
-  step2[5] = step1[4] - step1[5];
-  step2[6] = -step1[6] + step1[7];
-  step2[7] = step1[6] + step1[7];
-
-  step2[8] = step1[8];
-  step2[15] = step1[15];
-  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
-  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = dct_const_round_shift(temp1);
-  step2[14] = dct_const_round_shift(temp2);
-  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
-  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = dct_const_round_shift(temp1);
-  step2[13] = dct_const_round_shift(temp2);
-  step2[11] = step1[11];
-  step2[12] = step1[12];
-
-  step2[16] = step1[16] + step1[19];
-  step2[17] = step1[17] + step1[18];
-  step2[18] = step1[17] - step1[18];
-  step2[19] = step1[16] - step1[19];
-  step2[20] = -step1[20] + step1[23];
-  step2[21] = -step1[21] + step1[22];
-  step2[22] = step1[21] + step1[22];
-  step2[23] = step1[20] + step1[23];
-
-  step2[24] = step1[24] + step1[27];
-  step2[25] = step1[25] + step1[26];
-  step2[26] = step1[25] - step1[26];
-  step2[27] = step1[24] - step1[27];
-  step2[28] = -step1[28] + step1[31];
-  step2[29] = -step1[29] + step1[30];
-  step2[30] = step1[29] + step1[30];
-  step2[31] = step1[28] + step1[31];
-
-  // stage 5
-  step1[0] = step2[0] + step2[3];
-  step1[1] = step2[1] + step2[2];
-  step1[2] = step2[1] - step2[2];
-  step1[3] = step2[0] - step2[3];
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = dct_const_round_shift(temp1);
-  step1[6] = dct_const_round_shift(temp2);
-  step1[7] = step2[7];
-
-  step1[8] = step2[8] + step2[11];
-  step1[9] = step2[9] + step2[10];
-  step1[10] = step2[9] - step2[10];
-  step1[11] = step2[8] - step2[11];
-  step1[12] = -step2[12] + step2[15];
-  step1[13] = -step2[13] + step2[14];
-  step1[14] = step2[13] + step2[14];
-  step1[15] = step2[12] + step2[15];
-
-  step1[16] = step2[16];
-  step1[17] = step2[17];
-  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
-  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
-  step1[18] = dct_const_round_shift(temp1);
-  step1[29] = dct_const_round_shift(temp2);
-  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
-  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
-  step1[19] = dct_const_round_shift(temp1);
-  step1[28] = dct_const_round_shift(temp2);
-  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
-  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
-  step1[20] = dct_const_round_shift(temp1);
-  step1[27] = dct_const_round_shift(temp2);
-  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
-  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
-  step1[21] = dct_const_round_shift(temp1);
-  step1[26] = dct_const_round_shift(temp2);
-  step1[22] = step2[22];
-  step1[23] = step2[23];
-  step1[24] = step2[24];
-  step1[25] = step2[25];
-  step1[30] = step2[30];
-  step1[31] = step2[31];
-
-  // stage 6
-  step2[0] = step1[0] + step1[7];
-  step2[1] = step1[1] + step1[6];
-  step2[2] = step1[2] + step1[5];
-  step2[3] = step1[3] + step1[4];
-  step2[4] = step1[3] - step1[4];
-  step2[5] = step1[2] - step1[5];
-  step2[6] = step1[1] - step1[6];
-  step2[7] = step1[0] - step1[7];
-  step2[8] = step1[8];
-  step2[9] = step1[9];
-  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
-  temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = dct_const_round_shift(temp1);
-  step2[13] = dct_const_round_shift(temp2);
-  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
-  temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = dct_const_round_shift(temp1);
-  step2[12] = dct_const_round_shift(temp2);
-  step2[14] = step1[14];
-  step2[15] = step1[15];
-
-  step2[16] = step1[16] + step1[23];
-  step2[17] = step1[17] + step1[22];
-  step2[18] = step1[18] + step1[21];
-  step2[19] = step1[19] + step1[20];
-  step2[20] = step1[19] - step1[20];
-  step2[21] = step1[18] - step1[21];
-  step2[22] = step1[17] - step1[22];
-  step2[23] = step1[16] - step1[23];
-
-  step2[24] = -step1[24] + step1[31];
-  step2[25] = -step1[25] + step1[30];
-  step2[26] = -step1[26] + step1[29];
-  step2[27] = -step1[27] + step1[28];
-  step2[28] = step1[27] + step1[28];
-  step2[29] = step1[26] + step1[29];
-  step2[30] = step1[25] + step1[30];
-  step2[31] = step1[24] + step1[31];
-
-  // stage 7
-  step1[0] = step2[0] + step2[15];
-  step1[1] = step2[1] + step2[14];
-  step1[2] = step2[2] + step2[13];
-  step1[3] = step2[3] + step2[12];
-  step1[4] = step2[4] + step2[11];
-  step1[5] = step2[5] + step2[10];
-  step1[6] = step2[6] + step2[9];
-  step1[7] = step2[7] + step2[8];
-  step1[8] = step2[7] - step2[8];
-  step1[9] = step2[6] - step2[9];
-  step1[10] = step2[5] - step2[10];
-  step1[11] = step2[4] - step2[11];
-  step1[12] = step2[3] - step2[12];
-  step1[13] = step2[2] - step2[13];
-  step1[14] = step2[1] - step2[14];
-  step1[15] = step2[0] - step2[15];
-
-  step1[16] = step2[16];
-  step1[17] = step2[17];
-  step1[18] = step2[18];
-  step1[19] = step2[19];
-  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
-  temp2 = (step2[20] + step2[27]) * cospi_16_64;
-  step1[20] = dct_const_round_shift(temp1);
-  step1[27] = dct_const_round_shift(temp2);
-  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
-  temp2 = (step2[21] + step2[26]) * cospi_16_64;
-  step1[21] = dct_const_round_shift(temp1);
-  step1[26] = dct_const_round_shift(temp2);
-  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
-  temp2 = (step2[22] + step2[25]) * cospi_16_64;
-  step1[22] = dct_const_round_shift(temp1);
-  step1[25] = dct_const_round_shift(temp2);
-  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
-  temp2 = (step2[23] + step2[24]) * cospi_16_64;
-  step1[23] = dct_const_round_shift(temp1);
-  step1[24] = dct_const_round_shift(temp2);
-  step1[28] = step2[28];
-  step1[29] = step2[29];
-  step1[30] = step2[30];
-  step1[31] = step2[31];
-
-  // final stage
-  output[0] = step1[0] + step1[31];
-  output[1] = step1[1] + step1[30];
-  output[2] = step1[2] + step1[29];
-  output[3] = step1[3] + step1[28];
-  output[4] = step1[4] + step1[27];
-  output[5] = step1[5] + step1[26];
-  output[6] = step1[6] + step1[25];
-  output[7] = step1[7] + step1[24];
-  output[8] = step1[8] + step1[23];
-  output[9] = step1[9] + step1[22];
-  output[10] = step1[10] + step1[21];
-  output[11] = step1[11] + step1[20];
-  output[12] = step1[12] + step1[19];
-  output[13] = step1[13] + step1[18];
-  output[14] = step1[14] + step1[17];
-  output[15] = step1[15] + step1[16];
-  output[16] = step1[15] - step1[16];
-  output[17] = step1[14] - step1[17];
-  output[18] = step1[13] - step1[18];
-  output[19] = step1[12] - step1[19];
-  output[20] = step1[11] - step1[20];
-  output[21] = step1[10] - step1[21];
-  output[22] = step1[9] - step1[22];
-  output[23] = step1[8] - step1[23];
-  output[24] = step1[7] - step1[24];
-  output[25] = step1[6] - step1[25];
-  output[26] = step1[5] - step1[26];
-  output[27] = step1[4] - step1[27];
-  output[28] = step1[3] - step1[28];
-  output[29] = step1[2] - step1[29];
-  output[30] = step1[1] - step1[30];
-  output[31] = step1[0] - step1[31];
-}
-
-void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
-  int16_t out[32 * 32];
-  int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
-  int i, j;
-  int16_t temp_in[32], temp_out[32];
-
-  // Rows
-  for (i = 0; i < 32; ++i) {
-    idct32_1d(input, outptr);
-    input += 32;
-    outptr += 32;
-  }
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j)
-      temp_in[j] = out[j * 32 + i];
-    idct32_1d(temp_in, temp_out);
-    for (j = 0; j < 32; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
-  }
-}
-
-void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) {
-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
-  out = dct_const_round_shift(out * cospi_16_64);
-  output[0] = ROUND_POWER_OF_TWO(out, 6);
-}
-
-void vp9_short_idct10_32x32_c(int16_t *input, int16_t *output, int pitch) {
-  int16_t out[32 * 32];
-  int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
-  int i, j;
-  int16_t temp_in[32], temp_out[32];
-
-  /* First transform rows. Since all non-zero dct coefficients are in the
-   * upper-left 4x4 area, we only need to calculate the first 4 rows here.
-   */
-  vpx_memset(out, 0, sizeof(out));
-  for (i = 0; i < 4; ++i) {
-    idct32_1d(input, outptr);
-    input += 32;
-    outptr += 32;
-  }
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j)
-      temp_in[j] = out[j * 32 + i];
-    idct32_1d(temp_in, temp_out);
-    for (j = 0; j < 32; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
-  }
-}
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -279,11 +279,11 @@
 #
 # dct
 #
-prototype void vp9_short_idct4x4llm_1 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct4x4llm_1
+prototype void vp9_short_idct4x4_1 "int16_t *input, int16_t *output, int pitch"
+specialize vp9_short_idct4x4_1
 
-prototype void vp9_short_idct4x4llm "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct4x4llm sse2
+prototype void vp9_short_idct4x4 "int16_t *input, int16_t *output, int pitch"
+specialize vp9_short_idct4x4 sse2
 
 prototype void vp9_short_idct8x8 "int16_t *input, int16_t *output, int pitch"
 specialize vp9_short_idct8x8
@@ -330,10 +330,10 @@
 prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
 specialize vp9_dc_only_idct_add sse2
 
-prototype void vp9_short_inv_walsh4x4_1_x8 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_inv_walsh4x4_1_x8
-prototype void vp9_short_inv_walsh4x4_x8 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_inv_walsh4x4_x8
+prototype void vp9_short_iwalsh4x4_1 "int16_t *input, int16_t *output, int pitch"
+specialize vp9_short_iwalsh4x4_1
+prototype void vp9_short_iwalsh4x4 "int16_t *input, int16_t *output, int pitch"
+specialize vp9_short_iwalsh4x4
 prototype void vp9_dc_only_inv_walsh_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
 specialize vp9_dc_only_inv_walsh_add
 
@@ -600,11 +600,11 @@
 prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int pitch"
 specialize vp9_short_fdct16x16
 
-prototype void vp9_short_walsh4x4_x8 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_walsh4x4_x8
+prototype void vp9_short_walsh4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
+specialize vp9_short_walsh4x4
 
-prototype void vp9_short_walsh8x4_x8 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_walsh8x4_x8
+prototype void vp9_short_walsh8x4 "int16_t *InputData, int16_t *OutputData, int pitch"
+specialize vp9_short_walsh8x4
 
 #
 # Motion search
--- /dev/null
+++ b/vp9/common/x86/vp9_idct_sse2.asm
@@ -1,0 +1,712 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_idct_dequant_0_2x_sse2
+; (
+;   short *qcoeff       - 0
+;   short *dequant      - 1
+;   unsigned char *pre  - 2
+;   unsigned char *dst  - 3
+;   int dst_stride      - 4
+;   int blk_stride      - 5
+; )
+
+global sym(vp9_idct_dequant_0_2x_sse2) PRIVATE
+sym(vp9_idct_dequant_0_2x_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    ; end prolog
+
+        mov         rdx,            arg(1) ; dequant
+        mov         rax,            arg(0) ; qcoeff
+
+        movd        xmm4,           [rax]
+        movd        xmm5,           [rdx]
+
+        pinsrw      xmm4,           [rax+32],   4
+        pinsrw      xmm5,           [rdx],      4
+
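+    ; dequantize: multiply each DC by its dequant factor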
+        pmullw      xmm4,           xmm5
+
+    ; Zero out xmm5 for use in unpacking
+        pxor        xmm5,           xmm5
+
+    ; clear coeffs
+        movd        [rax],          xmm5
+        movd        [rax+32],       xmm5
+    ; broadcast each DC across its half of the register (word 0 to the
+    ;   low four words, word 4 to the high four; pshufb could do this
+    ;   in one shuffle on SSSE3)
+        pshuflw     xmm4,           xmm4,       00000000b
+        pshufhw     xmm4,           xmm4,       00000000b
+
+        mov         rax,            arg(2) ; pre
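+    ; DC-only reconstruction value: (dc * dq + 4) >> 3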
+        paddw       xmm4,           [GLOBAL(fours)]
+
+        movsxd      rcx,            dword ptr arg(5) ; blk_stride
+        psraw       xmm4,           3
+
+        movq        xmm0,           [rax]
+        movq        xmm1,           [rax+rcx]
+        movq        xmm2,           [rax+2*rcx]
+        lea         rcx,            [3*rcx]
+        movq        xmm3,           [rax+rcx]
+
+        punpcklbw   xmm0,           xmm5
+        punpcklbw   xmm1,           xmm5
+        punpcklbw   xmm2,           xmm5
+        punpcklbw   xmm3,           xmm5
+
+        mov         rax,            arg(3) ; dst
+        movsxd      rdx,            dword ptr arg(4) ; dst_stride
+
+    ; Add to predict buffer
+        paddw       xmm0,           xmm4
+        paddw       xmm1,           xmm4
+        paddw       xmm2,           xmm4
+        paddw       xmm3,           xmm4
+
+    ; pack up before storing
+        packuswb    xmm0,           xmm5
+        packuswb    xmm1,           xmm5
+        packuswb    xmm2,           xmm5
+        packuswb    xmm3,           xmm5
+
+    ; store blocks back out
+        movq        [rax],          xmm0
+        movq        [rax + rdx],    xmm1
+
+        lea         rax,            [rax + 2*rdx]
+
+        movq        [rax],          xmm2
+        movq        [rax + rdx],    xmm3
+
+    ; begin epilog
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_idct_dequant_full_2x_sse2) PRIVATE
+sym(vp9_idct_dequant_full_2x_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; full path: both blocks carry more than a DC coefficient, so all
+    ;   16 coeffs per block are loaded and dequantized below
+        mov         rax,            arg(0) ; qcoeff
+        mov         rsi,            arg(2) ; pre
+        mov         rdi,            arg(3) ; dst
+        movsxd      rcx,            dword ptr arg(5) ; blk_stride
+
+    ; Zero out xmm7 for use in unpacking
+        pxor        xmm7,           xmm7
+
+        mov         rdx,            arg(1)  ; dequant
+
+    ; note the transpose of xmm1 and xmm2, necessary for the shuffles
+    ;   to produce sensibly ordered data
+        movdqa      xmm0,           [rax]
+        movdqa      xmm2,           [rax+16]
+        movdqa      xmm1,           [rax+32]
+        movdqa      xmm3,           [rax+48]
+
+    ; Clear out coeffs
+        movdqa      [rax],          xmm7
+        movdqa      [rax+16],       xmm7
+        movdqa      [rax+32],       xmm7
+        movdqa      [rax+48],       xmm7
+
+    ; dequantize qcoeff buffer
+        pmullw      xmm0,           [rdx]
+        pmullw      xmm2,           [rdx+16]
+        pmullw      xmm1,           [rdx]
+        pmullw      xmm3,           [rdx+16]
+
+    ; repack so block 0 row x and block 1 row x are together
+        movdqa      xmm4,           xmm0
+        punpckldq   xmm0,           xmm1
+        punpckhdq   xmm4,           xmm1
+
+        pshufd      xmm0,           xmm0,       11011000b
+        pshufd      xmm1,           xmm4,       11011000b
+
+        movdqa      xmm4,           xmm2
+        punpckldq   xmm2,           xmm3
+        punpckhdq   xmm4,           xmm3
+
+        pshufd      xmm2,           xmm2,       11011000b
+        pshufd      xmm3,           xmm4,       11011000b
+
+    ; first pass
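+    ;   4x4 IDCT on two blocks at once; pmulhw computes (x*k)>>16, and the
+    ;   following paddw applies the x + x*(a-1) trick from the old
+    ;   vp9_idctllm.c notes for the sqrt(2)*cos(pi/8) and
+    ;   sqrt(2)*sin(pi/8) constants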
+        psubw       xmm0,           xmm2        ; b1 = 0-2
+        paddw       xmm2,           xmm2        ;
+
+        movdqa      xmm5,           xmm1
+        paddw       xmm2,           xmm0        ; a1 = 0+2
+
+        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
+
+        movdqa      xmm7,           xmm3
+        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
+
+        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       xmm7,           xmm5        ; c1
+
+        movdqa      xmm5,           xmm1
+        movdqa      xmm4,           xmm3
+
+        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
+        paddw       xmm5,           xmm1
+
+        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm3,           xmm4
+
+        paddw       xmm3,           xmm5        ; d1
+        movdqa      xmm6,           xmm2        ; a1
+
+        movdqa      xmm4,           xmm0        ; b1
+        paddw       xmm2,           xmm3        ;0
+
+        paddw       xmm4,           xmm7        ;1
+        psubw       xmm0,           xmm7        ;2
+
+        psubw       xmm6,           xmm3        ;3
+
+    ; transpose for the second pass
+        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
+        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
+        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
+
+        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
+        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
+        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
+
+
+        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
+        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
+        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
+
+        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
+        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
+        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
+
+
+        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
+        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
+        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
+
+        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
+        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
+        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
+
+        pshufd      xmm0,           xmm2,       11011000b
+        pshufd      xmm2,           xmm1,       11011000b
+
+        pshufd      xmm1,           xmm5,       11011000b
+        pshufd      xmm3,           xmm7,       11011000b
+
+    ; second pass
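+    ;   same butterflies on the transposed data; the fours added below
+    ;   bias the final arithmetic shift right by 3 for rounding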
+        psubw       xmm0,           xmm2            ; b1 = 0-2
+        paddw       xmm2,           xmm2
+
+        movdqa      xmm5,           xmm1
+        paddw       xmm2,           xmm0            ; a1 = 0+2
+
+        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
+
+        movdqa      xmm7,           xmm3
+        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
+
+        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       xmm7,           xmm5            ; c1
+
+        movdqa      xmm5,           xmm1
+        movdqa      xmm4,           xmm3
+
+        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
+        paddw       xmm5,           xmm1
+
+        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm3,           xmm4
+
+        paddw       xmm3,           xmm5            ; d1
+        paddw       xmm0,           [GLOBAL(fours)]
+
+        paddw       xmm2,           [GLOBAL(fours)]
+        movdqa      xmm6,           xmm2            ; a1
+
+        movdqa      xmm4,           xmm0            ; b1
+        paddw       xmm2,           xmm3            ;0
+
+        paddw       xmm4,           xmm7            ;1
+        psubw       xmm0,           xmm7            ;2
+
+        psubw       xmm6,           xmm3            ;3
+        psraw       xmm2,           3
+
+        psraw       xmm0,           3
+        psraw       xmm4,           3
+
+        psraw       xmm6,           3
+
+    ; transpose to save
+        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
+        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
+        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
+
+        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
+        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
+        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
+
+
+        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
+        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
+        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
+
+        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
+        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
+        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
+
+
+        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
+        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
+        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
+
+        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
+        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
+        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
+
+        pshufd      xmm0,           xmm2,       11011000b
+        pshufd      xmm2,           xmm1,       11011000b
+
+        pshufd      xmm1,           xmm5,       11011000b
+        pshufd      xmm3,           xmm7,       11011000b
+
+        pxor        xmm7,           xmm7
+
+    ; Load up predict blocks
+        movq        xmm4,           [rsi]
+        movq        xmm5,           [rsi+rcx]
+
+        punpcklbw   xmm4,           xmm7
+        punpcklbw   xmm5,           xmm7
+
+        paddw       xmm0,           xmm4
+        paddw       xmm1,           xmm5
+
+        movq        xmm4,           [rsi+2*rcx]
+        lea         rcx,            [3*rcx]
+        movq        xmm5,           [rsi+rcx]
+
+        punpcklbw   xmm4,           xmm7
+        punpcklbw   xmm5,           xmm7
+
+        paddw       xmm2,           xmm4
+        paddw       xmm3,           xmm5
+
+.finish:
+
+    ; pack up before storing
+        packuswb    xmm0,           xmm7
+        packuswb    xmm1,           xmm7
+        packuswb    xmm2,           xmm7
+        packuswb    xmm3,           xmm7
+
+    ; Load destination stride before writing out,
+    ;   doesn't need to persist
+        movsxd      rdx,            dword ptr arg(4) ; dst_stride
+
+    ; store blocks back out
+        movq        [rdi],          xmm0
+        movq        [rdi + rdx],    xmm1
+
+        lea         rdi,            [rdi + 2*rdx]
+
+        movq        [rdi],          xmm2
+        movq        [rdi + rdx],    xmm3
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
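For orientation, each pass above is the familiar LLM-style 4x4 butterfly. A scalar sketch of one pass follows, assuming ip0..ip3 are the four inputs of one column and op0..op3 its outputs (hypothetical names), and using the Q16 constants defined at the end of this file (x_s1sqr2 encodes 35468, x_c1sqr2less1 encodes 20091). Row ordering is handled by the surrounding transposes, and the second pass additionally rounds with +4 and shifts right by 3; this is a sketch, not the shipped code:

    /* scalar sketch of one butterfly pass (one column) */
    int a1 = ip0 + ip2;
    int b1 = ip0 - ip2;
    int c1 = ((ip1 * 35468) >> 16)            /* ip1 * sin(pi/8)*sqrt(2) */
           - (ip3 + ((ip3 * 20091) >> 16));   /* ip3 * cos(pi/8)*sqrt(2) */
    int d1 = (ip1 + ((ip1 * 20091) >> 16))    /* ip1 * cos(pi/8)*sqrt(2) */
           + ((ip3 * 35468) >> 16);           /* ip3 * sin(pi/8)*sqrt(2) */
    op0 = a1 + d1;
    op1 = b1 + c1;
    op2 = b1 - c1;
    op3 = a1 - d1;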
+;void vp9_idct_dequant_dc_0_2x_sse2
+; (
+;   short *qcoeff       - 0
+;   short *dequant      - 1
+;   unsigned char *pre  - 2
+;   unsigned char *dst  - 3
+;   int dst_stride      - 4
+;   short *dc           - 5
+; )
+global sym(vp9_idct_dequant_dc_0_2x_sse2) PRIVATE
+sym(vp9_idct_dequant_dc_0_2x_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; special case when 2 blocks have 0 or 1 coeffs
+    ; dc is set as first coeff, so no need to load qcoeff
+        mov         rax,            arg(0) ; qcoeff
+        mov         rsi,            arg(2) ; pre
+        mov         rdi,            arg(3) ; dst
+        mov         rdx,            arg(5) ; dc
+
+    ; Zero out xmm5, for use in unpacking
+        pxor        xmm5,           xmm5
+
+    ; load the 2 dc words here (2 * 16 bits == one doubleword)
+        movd        xmm4,           [rdx]
+
+    ; Load up predict blocks
+        movq        xmm0,           [rsi]
+        movq        xmm1,           [rsi+16]
+        movq        xmm2,           [rsi+32]
+        movq        xmm3,           [rsi+48]
+
+    ; Duplicate and expand dc across
+        punpcklwd   xmm4,           xmm4
+        punpckldq   xmm4,           xmm4
+
+    ; Round (+4) and downshift (>>3) the dc values
+        paddw       xmm4,           [GLOBAL(fours)]
+        psraw       xmm4,           3
+
+    ; Predict buffer needs to be expanded from bytes to words
+        punpcklbw   xmm0,           xmm5
+        punpcklbw   xmm1,           xmm5
+        punpcklbw   xmm2,           xmm5
+        punpcklbw   xmm3,           xmm5
+
+    ; Add to predict buffer
+        paddw       xmm0,           xmm4
+        paddw       xmm1,           xmm4
+        paddw       xmm2,           xmm4
+        paddw       xmm3,           xmm4
+
+    ; pack up before storing
+        packuswb    xmm0,           xmm5
+        packuswb    xmm1,           xmm5
+        packuswb    xmm2,           xmm5
+        packuswb    xmm3,           xmm5
+
+    ; Load destination stride before writing out,
+    ;   doesn't need to persist
+        movsxd      rdx,            dword ptr arg(4) ; dst_stride
+
+    ; store blocks back out
+        movq        [rdi],          xmm0
+        movq        [rdi + rdx],    xmm1
+
+        lea         rdi,            [rdi + 2*rdx]
+
+        movq        [rdi],          xmm2
+        movq        [rdi + rdx],    xmm3
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
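In scalar terms the dc-only routine above adds one rounded dc value to every predictor pixel and saturates to 8 bits. A sketch for one 4x4 block, assuming the 16-byte predictor stride visible in the loads; dc_only_add_ref and its parameter names are hypothetical:

    /* scalar sketch of the dc-only path for one 4x4 block */
    static void dc_only_add_ref(int dc_in, const unsigned char *pre,
                                unsigned char *dst, int dst_stride) {
        int r, c, dc = (dc_in + 4) >> 3;    /* paddw fours; psraw 3 */
        for (r = 0; r < 4; r++)
            for (c = 0; c < 4; c++) {
                int v = pre[r * 16 + c] + dc;
                dst[r * dst_stride + c] = v < 0 ? 0 : v > 255 ? 255 : v;
            }
    }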
+global sym(vp9_idct_dequant_dc_full_2x_sse2) PRIVATE
+sym(vp9_idct_dequant_dc_full_2x_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; the dc for each block arrives separately (arg 5) and is
+    ;   inserted below, overwriting the first coeff of each block
+        mov         rax,            arg(0) ; qcoeff
+        mov         rsi,            arg(2) ; pre
+        mov         rdi,            arg(3) ; dst
+
+    ; Zero out xmm7, for use unpacking
+        pxor        xmm7,           xmm7
+
+        mov         rdx,            arg(1)  ; dequant
+
+    ; note the transpose of xmm1 and xmm2, necessary for the shuffle
+    ;   to produce sensible data
+        movdqa      xmm0,           [rax]
+        movdqa      xmm2,           [rax+16]
+        movdqa      xmm1,           [rax+32]
+        movdqa      xmm3,           [rax+48]
+
+    ; Clear out coeffs
+        movdqa      [rax],          xmm7
+        movdqa      [rax+16],       xmm7
+        movdqa      [rax+32],       xmm7
+        movdqa      [rax+48],       xmm7
+
+    ; dequantize qcoeff buffer
+        pmullw      xmm0,           [rdx]
+        pmullw      xmm2,           [rdx+16]
+        pmullw      xmm1,           [rdx]
+        pmullw      xmm3,           [rdx+16]
+
+    ; DC component
+        mov         rdx,            arg(5)
+
+    ; repack so block 0 row x and block 1 row x are together
+        movdqa      xmm4,           xmm0
+        punpckldq   xmm0,           xmm1
+        punpckhdq   xmm4,           xmm1
+
+        pshufd      xmm0,           xmm0,       11011000b
+        pshufd      xmm1,           xmm4,       11011000b
+
+        movdqa      xmm4,           xmm2
+        punpckldq   xmm2,           xmm3
+        punpckhdq   xmm4,           xmm3
+
+        pshufd      xmm2,           xmm2,       11011000b
+        pshufd      xmm3,           xmm4,       11011000b
+
+    ; insert DC component
+        pinsrw      xmm0,           [rdx],      0
+        pinsrw      xmm0,           [rdx+2],    4
+
+    ; first pass
+        psubw       xmm0,           xmm2        ; b1 = 0-2
+        paddw       xmm2,           xmm2        ;
+
+        movdqa      xmm5,           xmm1
+        paddw       xmm2,           xmm0        ; a1 = 0+2
+
+        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
+
+        movdqa      xmm7,           xmm3
+        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
+
+        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       xmm7,           xmm5        ; c1
+
+        movdqa      xmm5,           xmm1
+        movdqa      xmm4,           xmm3
+
+        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
+        paddw       xmm5,           xmm1
+
+        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm3,           xmm4
+
+        paddw       xmm3,           xmm5        ; d1
+        movdqa      xmm6,           xmm2        ; a1
+
+        movdqa      xmm4,           xmm0        ; b1
+        paddw       xmm2,           xmm3        ;0
+
+        paddw       xmm4,           xmm7        ;1
+        psubw       xmm0,           xmm7        ;2
+
+        psubw       xmm6,           xmm3        ;3
+
+    ; transpose for the second pass
+        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
+        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
+        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
+
+        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
+        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
+        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
+
+
+        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
+        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
+        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
+
+        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
+        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
+        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
+
+
+        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
+        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
+        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
+
+        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
+        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
+        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
+
+        pshufd      xmm0,           xmm2,       11011000b
+        pshufd      xmm2,           xmm1,       11011000b
+
+        pshufd      xmm1,           xmm5,       11011000b
+        pshufd      xmm3,           xmm7,       11011000b
+
+    ; second pass
+        psubw       xmm0,           xmm2            ; b1 = 0-2
+        paddw       xmm2,           xmm2
+
+        movdqa      xmm5,           xmm1
+        paddw       xmm2,           xmm0            ; a1 = 0+2
+
+        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
+
+        movdqa      xmm7,           xmm3
+        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
+
+        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       xmm7,           xmm5            ; c1
+
+        movdqa      xmm5,           xmm1
+        movdqa      xmm4,           xmm3
+
+        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
+        paddw       xmm5,           xmm1
+
+        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm3,           xmm4
+
+        paddw       xmm3,           xmm5            ; d1
+        paddw       xmm0,           [GLOBAL(fours)]
+
+        paddw       xmm2,           [GLOBAL(fours)]
+        movdqa      xmm6,           xmm2            ; a1
+
+        movdqa      xmm4,           xmm0            ; b1
+        paddw       xmm2,           xmm3            ;0
+
+        paddw       xmm4,           xmm7            ;1
+        psubw       xmm0,           xmm7            ;2
+
+        psubw       xmm6,           xmm3            ;3
+        psraw       xmm2,           3
+
+        psraw       xmm0,           3
+        psraw       xmm4,           3
+
+        psraw       xmm6,           3
+
+    ; transpose to save
+        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
+        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
+        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
+
+        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
+        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
+        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
+
+
+        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
+        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
+        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
+
+        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
+        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
+        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
+
+
+        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
+        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
+        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
+
+        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
+        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
+        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
+
+        pshufd      xmm0,           xmm2,       11011000b
+        pshufd      xmm2,           xmm1,       11011000b
+
+        pshufd      xmm1,           xmm5,       11011000b
+        pshufd      xmm3,           xmm7,       11011000b
+
+        pxor        xmm7,           xmm7
+
+    ; Load up predict blocks
+        movq        xmm4,           [rsi]
+        movq        xmm5,           [rsi+16]
+
+        punpcklbw   xmm4,           xmm7
+        punpcklbw   xmm5,           xmm7
+
+        paddw       xmm0,           xmm4
+        paddw       xmm1,           xmm5
+
+        movq        xmm4,           [rsi+32]
+        movq        xmm5,           [rsi+48]
+
+        punpcklbw   xmm4,           xmm7
+        punpcklbw   xmm5,           xmm7
+
+        paddw       xmm2,           xmm4
+        paddw       xmm3,           xmm5
+
+.finish:
+
+    ; pack up before storing
+        packuswb    xmm0,           xmm7
+        packuswb    xmm1,           xmm7
+        packuswb    xmm2,           xmm7
+        packuswb    xmm3,           xmm7
+
+    ; Load destination stride before writing out,
+    ;   doesn't need to persist
+        movsxd      rdx,            dword ptr arg(4) ; dst_stride
+
+    ; store blocks back out
+        movq        [rdi],          xmm0
+        movq        [rdi + rdx],    xmm1
+
+        lea         rdi,            [rdi + 2*rdx]
+
+        movq        [rdi],          xmm2
+        movq        [rdi + rdx],    xmm3
+
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+fours:
+    times 8 dw 0x0004
+align 16
+x_s1sqr2:
+    times 8 dw 0x8A8C
+align 16
+x_c1sqr2less1:
+    times 8 dw 0x4E7B
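+
A note on the two multiplier constants: pmulhw returns the high word of a signed product, i.e. (x * k) >> 16, and every use above is paired with a paddw that adds x back in. 0x8A8C reads as -30068 when signed, so the pair computes x + ((x * -30068) >> 16) == (x * 35468) >> 16, a Q16 multiply by sin(pi/8)*sqrt(2); 0x4E7B == 20091 gives x + ((x * 20091) >> 16) == (x * 85627) >> 16, a Q16 multiply by cos(pi/8)*sqrt(2). A scalar sketch, assuming arithmetic right shift on signed ints; the helper names are hypothetical:

    /* scalar equivalents of the pmulhw + paddw pairs */
    static int mul_s1sqr2(int x) {        /* x * sin(pi/8)*sqrt(2) */
        return x + ((x * -30068) >> 16);  /* 0x8A8C as signed int16 */
    }
    static int mul_c1sqr2(int x) {        /* x * cos(pi/8)*sqrt(2) */
        return x + ((x * 20091) >> 16);   /* 0x4E7B */
    }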
--- /dev/null
+++ b/vp9/common/x86/vp9_idct_x86.c
@@ -1,0 +1,237 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_idct.h"
+
+#if HAVE_SSE2
+// To improve performance, clip the absolute diff value to [0, 255],
+// which keeps the additions/subtractions in 8 bits.
+void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr,
+                               uint8_t *dst_ptr, int pitch, int stride) {
+  int a1;
+  int16_t out;
+  uint8_t abs_diff;
+  __m128i p0, p1, p2, p3;
+  unsigned int extended_diff;
+  __m128i diff;
+
+  out = dct_const_round_shift(input_dc * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
+  a1 = ROUND_POWER_OF_TWO(out, 4);
+
+  // Read prediction data.
+  p0 = _mm_cvtsi32_si128(*(const int *)(pred_ptr + 0 * pitch));
+  p1 = _mm_cvtsi32_si128(*(const int *)(pred_ptr + 1 * pitch));
+  p2 = _mm_cvtsi32_si128(*(const int *)(pred_ptr + 2 * pitch));
+  p3 = _mm_cvtsi32_si128(*(const int *)(pred_ptr + 3 * pitch));
+
+  // Unpack prediction data, and store 4x4 array in 1 XMM register.
+  p0 = _mm_unpacklo_epi32(p0, p1);
+  p2 = _mm_unpacklo_epi32(p2, p3);
+  p0 = _mm_unpacklo_epi64(p0, p2);
+
+  // Clip the absolute dc value to [0, 255], then add or subtract it
+  // according to the sign of a1.
+  if (a1 >= 0) {
+    abs_diff = (a1 > 255) ? 255 : a1;
+    extended_diff = abs_diff * 0x01010101u;
+    diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0);
+
+    p1 = _mm_adds_epu8(p0, diff);
+  } else {
+    abs_diff = (a1 < -255) ? 255 : -a1;
+    extended_diff = abs_diff * 0x01010101u;
+    diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0);
+
+    p1 = _mm_subs_epu8(p0, diff);
+  }
+
+  // Store results to dst.
+  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
+  dst_ptr += stride;
+
+  p1 = _mm_srli_si128(p1, 4);
+  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
+  dst_ptr += stride;
+
+  p1 = _mm_srli_si128(p1, 4);
+  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
+  dst_ptr += stride;
+
+  p1 = _mm_srli_si128(p1, 4);
+  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
+}
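+
Clamping |a1| to 255 before splatting is safe because _mm_adds_epu8/_mm_subs_epu8 saturate at 0 and 255 anyway, so any diff of magnitude 255 or more pins the pixel to one end of the range either way. Per pixel, the routine reduces to this scalar sketch:

    /* per-pixel scalar sketch of the saturating dc add */
    int v = pred + a1;                       /* a1 clipped to [-255, 255] */
    *dst = v < 0 ? 0 : v > 255 ? 255 : v;    /* unsigned saturation */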
+
+void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i eight = _mm_set1_epi16(8);
+  const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
+                                    (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
+                                    (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
+                                    (int16_t)cospi_8_64, (int16_t)cospi_24_64);
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const int half_pitch = pitch >> 1;
+  __m128i input0, input1, input2, input3;
+
+  // Rows
+  input0 = _mm_loadl_epi64((__m128i *)input);
+  input1 = _mm_loadl_epi64((__m128i *)(input + 4));
+  input2 = _mm_loadl_epi64((__m128i *)(input + 8));
+  input3 = _mm_loadl_epi64((__m128i *)(input + 12));
+
+  // Construct i3, i1, i3, i1, i2, i0, i2, i0
+  input0 = _mm_shufflelo_epi16(input0, 0xd8);
+  input1 = _mm_shufflelo_epi16(input1, 0xd8);
+  input2 = _mm_shufflelo_epi16(input2, 0xd8);
+  input3 = _mm_shufflelo_epi16(input3, 0xd8);
+
+  input0 = _mm_unpacklo_epi32(input0, input0);
+  input1 = _mm_unpacklo_epi32(input1, input1);
+  input2 = _mm_unpacklo_epi32(input2, input2);
+  input3 = _mm_unpacklo_epi32(input3, input3);
+
+  // Stage 1
+  input0 = _mm_madd_epi16(input0, cst);
+  input1 = _mm_madd_epi16(input1, cst);
+  input2 = _mm_madd_epi16(input2, cst);
+  input3 = _mm_madd_epi16(input3, cst);
+
+  input0 = _mm_add_epi32(input0, rounding);
+  input1 = _mm_add_epi32(input1, rounding);
+  input2 = _mm_add_epi32(input2, rounding);
+  input3 = _mm_add_epi32(input3, rounding);
+
+  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
+  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
+  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
+  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
+
+  // Stage 2
+  input0 = _mm_packs_epi32(input0, zero);
+  input1 = _mm_packs_epi32(input1, zero);
+  input2 = _mm_packs_epi32(input2, zero);
+  input3 = _mm_packs_epi32(input3, zero);
+
+  // Transpose
+  input1 = _mm_unpacklo_epi16(input0, input1);
+  input3 = _mm_unpacklo_epi16(input2, input3);
+  input0 = _mm_unpacklo_epi32(input1, input3);
+  input1 = _mm_unpackhi_epi32(input1, input3);
+
+  // Switch columns 2 and 3; we then get:
+  // input2: column 1, column 0;  input3: column 2, column 3.
+  input1 = _mm_shuffle_epi32(input1, 0x4e);
+  input2 = _mm_add_epi16(input0, input1);
+  input3 = _mm_sub_epi16(input0, input1);
+
+  // Columns
+  // Construct i3, i1, i3, i1, i2, i0, i2, i0
+  input0 = _mm_shufflelo_epi16(input2, 0xd8);
+  input1 = _mm_shufflehi_epi16(input2, 0xd8);
+  input2 = _mm_shufflehi_epi16(input3, 0xd8);
+  input3 = _mm_shufflelo_epi16(input3, 0xd8);
+
+  input0 = _mm_unpacklo_epi32(input0, input0);
+  input1 = _mm_unpackhi_epi32(input1, input1);
+  input2 = _mm_unpackhi_epi32(input2, input2);
+  input3 = _mm_unpacklo_epi32(input3, input3);
+
+  // Stage 1
+  input0 = _mm_madd_epi16(input0, cst);
+  input1 = _mm_madd_epi16(input1, cst);
+  input2 = _mm_madd_epi16(input2, cst);
+  input3 = _mm_madd_epi16(input3, cst);
+
+  input0 = _mm_add_epi32(input0, rounding);
+  input1 = _mm_add_epi32(input1, rounding);
+  input2 = _mm_add_epi32(input2, rounding);
+  input3 = _mm_add_epi32(input3, rounding);
+
+  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
+  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
+  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
+  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
+
+  // Stage 2
+  input0 = _mm_packs_epi32(input0, zero);
+  input1 = _mm_packs_epi32(input1, zero);
+  input2 = _mm_packs_epi32(input2, zero);
+  input3 = _mm_packs_epi32(input3, zero);
+
+  // Transpose
+  input1 = _mm_unpacklo_epi16(input0, input1);
+  input3 = _mm_unpacklo_epi16(input2, input3);
+  input0 = _mm_unpacklo_epi32(input1, input3);
+  input1 = _mm_unpackhi_epi32(input1, input3);
+
+  // Switch columns 2 and 3; we then get:
+  // input2: column 1, column 0;  input3: column 2, column 3.
+  input1 = _mm_shuffle_epi32(input1, 0x4e);
+  input2 = _mm_add_epi16(input0, input1);
+  input3 = _mm_sub_epi16(input0, input1);
+
+  // Final round and shift
+  input2 = _mm_add_epi16(input2, eight);
+  input3 = _mm_add_epi16(input3, eight);
+
+  input2 = _mm_srai_epi16(input2, 4);
+  input3 = _mm_srai_epi16(input3, 4);
+
+  // Store results
+  _mm_storel_epi64((__m128i *)output, input2);
+  input2 = _mm_srli_si128(input2, 8);
+  _mm_storel_epi64((__m128i *)(output + half_pitch), input2);
+
+  _mm_storel_epi64((__m128i *)(output + 3 * half_pitch), input3);
+  input3 = _mm_srli_si128(input3, 8);
+  _mm_storel_epi64((__m128i *)(output + 2 * half_pitch), input3);
+}
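+
The interleaved (i3, i1, i3, i1, i2, i0, i2, i0) layout lets a single _mm_madd_epi16 against cst produce all four stage-1 terms of the 1-D transform at once. A scalar reference for what one 4-point pass computes, assuming the cospi_* constants and dct_const_round_shift from vp9_idct.h; idct4_1d_ref is a hypothetical name for this sketch:

    /* scalar reference for one 4-point pass (sketch) */
    static void idct4_1d_ref(const int16_t *in, int16_t *out) {
      const int s0 = dct_const_round_shift((in[0] + in[2]) * cospi_16_64);
      const int s1 = dct_const_round_shift((in[0] - in[2]) * cospi_16_64);
      const int s2 = dct_const_round_shift(in[1] * cospi_24_64 -
                                           in[3] * cospi_8_64);
      const int s3 = dct_const_round_shift(in[1] * cospi_8_64 +
                                           in[3] * cospi_24_64);
      out[0] = s0 + s3;
      out[1] = s1 + s2;
      out[2] = s1 - s2;
      out[3] = s0 - s3;
    }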
+
+void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i c1 = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
+                                    (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
+                                    (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
+                                    (int16_t)cospi_8_64, (int16_t)cospi_24_64);
+  const __m128i c2 = _mm_setr_epi16(1, 1, 1, 1, 1, -1, 1, -1);
+
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i in, temp;
+
+  // Load input data.
+  in = _mm_loadl_epi64((__m128i *)input);
+
+  // Construct i3, i1, i3, i1, i2, i0, i2, i0
+  in = _mm_shufflelo_epi16(in, 0xd8);
+  in = _mm_unpacklo_epi32(in, in);
+
+  // Stage 1
+  in = _mm_madd_epi16(in, c1);
+  in = _mm_add_epi32(in, rounding);
+  in = _mm_srai_epi32(in, DCT_CONST_BITS);
+  in = _mm_packs_epi32(in, zero);
+
+  // Stage 2
+  temp = _mm_shufflelo_epi16(in, 0x9c);
+  in = _mm_shufflelo_epi16(in, 0xc9);
+  in = _mm_unpacklo_epi64(temp, in);
+  in = _mm_madd_epi16(in, c2);
+  in = _mm_packs_epi32(in, zero);
+
+  // Store results
+  _mm_storel_epi64((__m128i *)output, in);
+}
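+
Stage 2 here is the final butterfly done as a multiply-accumulate: the two shufflelo ops rearrange the stage-1 results so the register holds (s0, s3, s1, s2, s1, s2, s0, s3), and madd against c2 = (1, 1, 1, 1, 1, -1, 1, -1) yields s0+s3, s1+s2, s1-s2, s0-s3, matching the out[] lines of the idct4_1d_ref() sketch above.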
+
+#endif
--- a/vp9/common/x86/vp9_idctllm_sse2.asm
+++ /dev/null
@@ -1,712 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_idct_dequant_0_2x_sse2
-; (
-;   short *qcoeff       - 0
-;   short *dequant      - 1
-;   unsigned char *pre  - 2
-;   unsigned char *dst  - 3
-;   int dst_stride      - 4
-;   int blk_stride      - 5
-; )
-
-global sym(vp9_idct_dequant_0_2x_sse2) PRIVATE
-sym(vp9_idct_dequant_0_2x_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    ; end prolog
-
-        mov         rdx,            arg(1) ; dequant
-        mov         rax,            arg(0) ; qcoeff
-
-        movd        xmm4,           [rax]
-        movd        xmm5,           [rdx]
-
-        pinsrw      xmm4,           [rax+32],   4
-        pinsrw      xmm5,           [rdx],      4
-
-        pmullw      xmm4,           xmm5
-
-    ; Zero out xmm5, for use unpacking
-        pxor        xmm5,           xmm5
-
-    ; clear coeffs
-        movd        [rax],          xmm5
-        movd        [rax+32],       xmm5
-;pshufb
-        pshuflw     xmm4,           xmm4,       00000000b
-        pshufhw     xmm4,           xmm4,       00000000b
-
-        mov         rax,            arg(2) ; pre
-        paddw       xmm4,           [GLOBAL(fours)]
-
-        movsxd      rcx,            dword ptr arg(5) ; blk_stride
-        psraw       xmm4,           3
-
-        movq        xmm0,           [rax]
-        movq        xmm1,           [rax+rcx]
-        movq        xmm2,           [rax+2*rcx]
-        lea         rcx,            [3*rcx]
-        movq        xmm3,           [rax+rcx]
-
-        punpcklbw   xmm0,           xmm5
-        punpcklbw   xmm1,           xmm5
-        punpcklbw   xmm2,           xmm5
-        punpcklbw   xmm3,           xmm5
-
-        mov         rax,            arg(3) ; dst
-        movsxd      rdx,            dword ptr arg(4) ; dst_stride
-
-    ; Add to predict buffer
-        paddw       xmm0,           xmm4
-        paddw       xmm1,           xmm4
-        paddw       xmm2,           xmm4
-        paddw       xmm3,           xmm4
-
-    ; pack up before storing
-        packuswb    xmm0,           xmm5
-        packuswb    xmm1,           xmm5
-        packuswb    xmm2,           xmm5
-        packuswb    xmm3,           xmm5
-
-    ; store blocks back out
-        movq        [rax],          xmm0
-        movq        [rax + rdx],    xmm1
-
-        lea         rax,            [rax + 2*rdx]
-
-        movq        [rax],          xmm2
-        movq        [rax + rdx],    xmm3
-
-    ; begin epilog
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(vp9_idct_dequant_full_2x_sse2) PRIVATE
-sym(vp9_idct_dequant_full_2x_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ; special case when 2 blocks have 0 or 1 coeffs
-    ; dc is set as first coeff, so no need to load qcoeff
-        mov         rax,            arg(0) ; qcoeff
-        mov         rsi,            arg(2) ; pre
-        mov         rdi,            arg(3) ; dst
-        movsxd      rcx,            dword ptr arg(5) ; blk_stride
-
-    ; Zero out xmm7, for use unpacking
-        pxor        xmm7,           xmm7
-
-        mov         rdx,            arg(1)  ; dequant
-
-    ; note the transpose of xmm1 and xmm2, necessary for the shuffle
-    ;   to produce sensible data
-        movdqa      xmm0,           [rax]
-        movdqa      xmm2,           [rax+16]
-        movdqa      xmm1,           [rax+32]
-        movdqa      xmm3,           [rax+48]
-
-    ; Clear out coeffs
-        movdqa      [rax],          xmm7
-        movdqa      [rax+16],       xmm7
-        movdqa      [rax+32],       xmm7
-        movdqa      [rax+48],       xmm7
-
-    ; dequantize qcoeff buffer
-        pmullw      xmm0,           [rdx]
-        pmullw      xmm2,           [rdx+16]
-        pmullw      xmm1,           [rdx]
-        pmullw      xmm3,           [rdx+16]
-
-    ; repack so block 0 row x and block 1 row x are together
-        movdqa      xmm4,           xmm0
-        punpckldq   xmm0,           xmm1
-        punpckhdq   xmm4,           xmm1
-
-        pshufd      xmm0,           xmm0,       11011000b
-        pshufd      xmm1,           xmm4,       11011000b
-
-        movdqa      xmm4,           xmm2
-        punpckldq   xmm2,           xmm3
-        punpckhdq   xmm4,           xmm3
-
-        pshufd      xmm2,           xmm2,       11011000b
-        pshufd      xmm3,           xmm4,       11011000b
-
-    ; first pass
-        psubw       xmm0,           xmm2        ; b1 = 0-2
-        paddw       xmm2,           xmm2        ;
-
-        movdqa      xmm5,           xmm1
-        paddw       xmm2,           xmm0        ; a1 = 0+2
-
-        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
-
-        movdqa      xmm7,           xmm3
-        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
-
-        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
-        psubw       xmm7,           xmm5        ; c1
-
-        movdqa      xmm5,           xmm1
-        movdqa      xmm4,           xmm3
-
-        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
-        paddw       xmm5,           xmm1
-
-        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm3,           xmm4
-
-        paddw       xmm3,           xmm5        ; d1
-        movdqa      xmm6,           xmm2        ; a1
-
-        movdqa      xmm4,           xmm0        ; b1
-        paddw       xmm2,           xmm3        ;0
-
-        paddw       xmm4,           xmm7        ;1
-        psubw       xmm0,           xmm7        ;2
-
-        psubw       xmm6,           xmm3        ;3
-
-    ; transpose for the second pass
-        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
-        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
-        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
-
-        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
-        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
-        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
-
-
-        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
-        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
-        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
-
-        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
-        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
-        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
-
-
-        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
-        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
-        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
-
-        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
-        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
-        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
-
-        pshufd      xmm0,           xmm2,       11011000b
-        pshufd      xmm2,           xmm1,       11011000b
-
-        pshufd      xmm1,           xmm5,       11011000b
-        pshufd      xmm3,           xmm7,       11011000b
-
-    ; second pass
-        psubw       xmm0,           xmm2            ; b1 = 0-2
-        paddw       xmm2,           xmm2
-
-        movdqa      xmm5,           xmm1
-        paddw       xmm2,           xmm0            ; a1 = 0+2
-
-        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
-
-        movdqa      xmm7,           xmm3
-        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
-
-        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
-        psubw       xmm7,           xmm5            ; c1
-
-        movdqa      xmm5,           xmm1
-        movdqa      xmm4,           xmm3
-
-        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
-        paddw       xmm5,           xmm1
-
-        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm3,           xmm4
-
-        paddw       xmm3,           xmm5            ; d1
-        paddw       xmm0,           [GLOBAL(fours)]
-
-        paddw       xmm2,           [GLOBAL(fours)]
-        movdqa      xmm6,           xmm2            ; a1
-
-        movdqa      xmm4,           xmm0            ; b1
-        paddw       xmm2,           xmm3            ;0
-
-        paddw       xmm4,           xmm7            ;1
-        psubw       xmm0,           xmm7            ;2
-
-        psubw       xmm6,           xmm3            ;3
-        psraw       xmm2,           3
-
-        psraw       xmm0,           3
-        psraw       xmm4,           3
-
-        psraw       xmm6,           3
-
-    ; transpose to save
-        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
-        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
-        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
-
-        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
-        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
-        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
-
-
-        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
-        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
-        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
-
-        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
-        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
-        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
-
-
-        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
-        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
-        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
-
-        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
-        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
-        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
-
-        pshufd      xmm0,           xmm2,       11011000b
-        pshufd      xmm2,           xmm1,       11011000b
-
-        pshufd      xmm1,           xmm5,       11011000b
-        pshufd      xmm3,           xmm7,       11011000b
-
-        pxor        xmm7,           xmm7
-
-    ; Load up predict blocks
-        movq        xmm4,           [rsi]
-        movq        xmm5,           [rsi+rcx]
-
-        punpcklbw   xmm4,           xmm7
-        punpcklbw   xmm5,           xmm7
-
-        paddw       xmm0,           xmm4
-        paddw       xmm1,           xmm5
-
-        movq        xmm4,           [rsi+2*rcx]
-        lea         rcx,            [3*rcx]
-        movq        xmm5,           [rsi+rcx]
-
-        punpcklbw   xmm4,           xmm7
-        punpcklbw   xmm5,           xmm7
-
-        paddw       xmm2,           xmm4
-        paddw       xmm3,           xmm5
-
-.finish:
-
-    ; pack up before storing
-        packuswb    xmm0,           xmm7
-        packuswb    xmm1,           xmm7
-        packuswb    xmm2,           xmm7
-        packuswb    xmm3,           xmm7
-
-    ; Load destination stride before writing out,
-    ;   doesn't need to persist
-        movsxd      rdx,            dword ptr arg(4) ; dst_stride
-
-    ; store blocks back out
-        movq        [rdi],          xmm0
-        movq        [rdi + rdx],    xmm1
-
-        lea         rdi,            [rdi + 2*rdx]
-
-        movq        [rdi],          xmm2
-        movq        [rdi + rdx],    xmm3
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_idct_dequant_dc_0_2x_sse2
-; (
-;   short *qcoeff       - 0
-;   short *dequant      - 1
-;   unsigned char *pre  - 2
-;   unsigned char *dst  - 3
-;   int dst_stride      - 4
-;   short *dc           - 5
-; )
-global sym(vp9_idct_dequant_dc_0_2x_sse2) PRIVATE
-sym(vp9_idct_dequant_dc_0_2x_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ; special case when 2 blocks have 0 or 1 coeffs
-    ; dc is set as first coeff, so no need to load qcoeff
-        mov         rax,            arg(0) ; qcoeff
-        mov         rsi,            arg(2) ; pre
-        mov         rdi,            arg(3) ; dst
-        mov         rdx,            arg(5) ; dc
-
-    ; Zero out xmm5, for use unpacking
-        pxor        xmm5,           xmm5
-
-    ; load up 2 dc words here == 2*16 = doubleword
-        movd        xmm4,           [rdx]
-
-    ; Load up predict blocks
-        movq        xmm0,           [rsi]
-        movq        xmm1,           [rsi+16]
-        movq        xmm2,           [rsi+32]
-        movq        xmm3,           [rsi+48]
-
-    ; Duplicate and expand dc across
-        punpcklwd   xmm4,           xmm4
-        punpckldq   xmm4,           xmm4
-
-    ; Rounding to dequant and downshift
-        paddw       xmm4,           [GLOBAL(fours)]
-        psraw       xmm4,           3
-
-    ; Predict buffer needs to be expanded from bytes to words
-        punpcklbw   xmm0,           xmm5
-        punpcklbw   xmm1,           xmm5
-        punpcklbw   xmm2,           xmm5
-        punpcklbw   xmm3,           xmm5
-
-    ; Add to predict buffer
-        paddw       xmm0,           xmm4
-        paddw       xmm1,           xmm4
-        paddw       xmm2,           xmm4
-        paddw       xmm3,           xmm4
-
-    ; pack up before storing
-        packuswb    xmm0,           xmm5
-        packuswb    xmm1,           xmm5
-        packuswb    xmm2,           xmm5
-        packuswb    xmm3,           xmm5
-
-    ; Load destination stride before writing out,
-    ;   doesn't need to persist
-        movsxd      rdx,            dword ptr arg(4) ; dst_stride
-
-    ; store blocks back out
-        movq        [rdi],          xmm0
-        movq        [rdi + rdx],    xmm1
-
-        lea         rdi,            [rdi + 2*rdx]
-
-        movq        [rdi],          xmm2
-        movq        [rdi + rdx],    xmm3
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(vp9_idct_dequant_dc_full_2x_sse2) PRIVATE
-sym(vp9_idct_dequant_dc_full_2x_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ; special case when 2 blocks have 0 or 1 coeffs
-    ; dc is set as first coeff, so no need to load qcoeff
-        mov         rax,            arg(0) ; qcoeff
-        mov         rsi,            arg(2) ; pre
-        mov         rdi,            arg(3) ; dst
-
-    ; Zero out xmm7, for use unpacking
-        pxor        xmm7,           xmm7
-
-        mov         rdx,            arg(1)  ; dequant
-
-    ; note the transpose of xmm1 and xmm2, necessary for the shuffle
-    ;   to produce sensible data
-        movdqa      xmm0,           [rax]
-        movdqa      xmm2,           [rax+16]
-        movdqa      xmm1,           [rax+32]
-        movdqa      xmm3,           [rax+48]
-
-    ; Clear out coeffs
-        movdqa      [rax],          xmm7
-        movdqa      [rax+16],       xmm7
-        movdqa      [rax+32],       xmm7
-        movdqa      [rax+48],       xmm7
-
-    ; dequantize qcoeff buffer
-        pmullw      xmm0,           [rdx]
-        pmullw      xmm2,           [rdx+16]
-        pmullw      xmm1,           [rdx]
-        pmullw      xmm3,           [rdx+16]
-
-    ; DC component
-        mov         rdx,            arg(5)
-
-    ; repack so block 0 row x and block 1 row x are together
-        movdqa      xmm4,           xmm0
-        punpckldq   xmm0,           xmm1
-        punpckhdq   xmm4,           xmm1
-
-        pshufd      xmm0,           xmm0,       11011000b
-        pshufd      xmm1,           xmm4,       11011000b
-
-        movdqa      xmm4,           xmm2
-        punpckldq   xmm2,           xmm3
-        punpckhdq   xmm4,           xmm3
-
-        pshufd      xmm2,           xmm2,       11011000b
-        pshufd      xmm3,           xmm4,       11011000b
-
-    ; insert DC component
-        pinsrw      xmm0,           [rdx],      0
-        pinsrw      xmm0,           [rdx+2],    4
-
-    ; first pass
-        psubw       xmm0,           xmm2        ; b1 = 0-2
-        paddw       xmm2,           xmm2        ;
-
-        movdqa      xmm5,           xmm1
-        paddw       xmm2,           xmm0        ; a1 = 0+2
-
-        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
-
-        movdqa      xmm7,           xmm3
-        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
-
-        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
-        psubw       xmm7,           xmm5        ; c1
-
-        movdqa      xmm5,           xmm1
-        movdqa      xmm4,           xmm3
-
-        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
-        paddw       xmm5,           xmm1
-
-        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm3,           xmm4
-
-        paddw       xmm3,           xmm5        ; d1
-        movdqa      xmm6,           xmm2        ; a1
-
-        movdqa      xmm4,           xmm0        ; b1
-        paddw       xmm2,           xmm3        ;0
-
-        paddw       xmm4,           xmm7        ;1
-        psubw       xmm0,           xmm7        ;2
-
-        psubw       xmm6,           xmm3        ;3
-
-    ; transpose for the second pass
-        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
-        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
-        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
-
-        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
-        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
-        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
-
-
-        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
-        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
-        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
-
-        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
-        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
-        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
-
-
-        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
-        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
-        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
-
-        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
-        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
-        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
-
-        pshufd      xmm0,           xmm2,       11011000b
-        pshufd      xmm2,           xmm1,       11011000b
-
-        pshufd      xmm1,           xmm5,       11011000b
-        pshufd      xmm3,           xmm7,       11011000b
-
-    ; second pass
-        psubw       xmm0,           xmm2            ; b1 = 0-2
-        paddw       xmm2,           xmm2
-
-        movdqa      xmm5,           xmm1
-        paddw       xmm2,           xmm0            ; a1 = 0+2
-
-        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
-
-        movdqa      xmm7,           xmm3
-        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
-
-        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
-        psubw       xmm7,           xmm5            ; c1
-
-        movdqa      xmm5,           xmm1
-        movdqa      xmm4,           xmm3
-
-        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
-        paddw       xmm5,           xmm1
-
-        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm3,           xmm4
-
-        paddw       xmm3,           xmm5            ; d1
-        paddw       xmm0,           [GLOBAL(fours)]
-
-        paddw       xmm2,           [GLOBAL(fours)]
-        movdqa      xmm6,           xmm2            ; a1
-
-        movdqa      xmm4,           xmm0            ; b1
-        paddw       xmm2,           xmm3            ;0
-
-        paddw       xmm4,           xmm7            ;1
-        psubw       xmm0,           xmm7            ;2
-
-        psubw       xmm6,           xmm3            ;3
-        psraw       xmm2,           3
-
-        psraw       xmm0,           3
-        psraw       xmm4,           3
-
-        psraw       xmm6,           3
-
-    ; transpose to save
-        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
-        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
-        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
-
-        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
-        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
-        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
-
-
-        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
-        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
-        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
-
-        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
-        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
-        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
-
-
-        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
-        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
-        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
-
-        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
-        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
-        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
-
-        pshufd      xmm0,           xmm2,       11011000b
-        pshufd      xmm2,           xmm1,       11011000b
-
-        pshufd      xmm1,           xmm5,       11011000b
-        pshufd      xmm3,           xmm7,       11011000b
-
-        pxor        xmm7,           xmm7
-
-    ; Load up predict blocks
-        movq        xmm4,           [rsi]
-        movq        xmm5,           [rsi+16]
-
-        punpcklbw   xmm4,           xmm7
-        punpcklbw   xmm5,           xmm7
-
-        paddw       xmm0,           xmm4
-        paddw       xmm1,           xmm5
-
-        movq        xmm4,           [rsi+32]
-        movq        xmm5,           [rsi+48]
-
-        punpcklbw   xmm4,           xmm7
-        punpcklbw   xmm5,           xmm7
-
-        paddw       xmm2,           xmm4
-        paddw       xmm3,           xmm5
-
-.finish:
-
-    ; pack up before storing
-        packuswb    xmm0,           xmm7
-        packuswb    xmm1,           xmm7
-        packuswb    xmm2,           xmm7
-        packuswb    xmm3,           xmm7
-
-    ; Load destination stride before writing out,
-    ;   doesn't need to persist
-        movsxd      rdx,            dword ptr arg(4) ; dst_stride
-
-    ; store blocks back out
-        movq        [rdi],          xmm0
-        movq        [rdi + rdx],    xmm1
-
-        lea         rdi,            [rdi + 2*rdx]
-
-        movq        [rdi],          xmm2
-        movq        [rdi + rdx],    xmm3
-
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-fours:
-    times 8 dw 0x0004
-align 16
-x_s1sqr2:
-    times 8 dw 0x8A8C
-align 16
-x_c1sqr2less1:
-    times 8 dw 0x4E7B
--- a/vp9/common/x86/vp9_idctllm_x86.c
+++ /dev/null
@@ -1,237 +1,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-#include <emmintrin.h>  // SSE2
-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_idct.h"
-
-#if HAVE_SSE2
-// To improve performance, clip the absolute diff value to [0, 255],
-// which keeps the additions/subtractions in 8 bits.
-void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr,
-                               uint8_t *dst_ptr, int pitch, int stride) {
-  int a1;
-  int16_t out;
-  uint8_t abs_diff;
-  __m128i p0, p1, p2, p3;
-  unsigned int extended_diff;
-  __m128i diff;
-
-  out = dct_const_round_shift(input_dc * cospi_16_64);
-  out = dct_const_round_shift(out * cospi_16_64);
-  a1 = ROUND_POWER_OF_TWO(out, 4);
-
-  // Read prediction data.
-  p0 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 0 * pitch));
-  p1 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 1 * pitch));
-  p2 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 2 * pitch));
-  p3 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 3 * pitch));
-
-  // Unpack prediction data, and store 4x4 array in 1 XMM register.
-  p0 = _mm_unpacklo_epi32(p0, p1);
-  p2 = _mm_unpacklo_epi32(p2, p3);
-  p0 = _mm_unpacklo_epi64(p0, p2);
-
-  // Clip dc value to [0, 255] range. Then, do addition or subtraction
-  // according to its sign.
-  if (a1 >= 0) {
-    abs_diff = (a1 > 255) ? 255 : a1;
-    extended_diff = abs_diff * 0x01010101u;
-    diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0);
-
-    p1 = _mm_adds_epu8(p0, diff);
-  } else {
-    abs_diff = (a1 < -255) ? 255 : -a1;
-    extended_diff = abs_diff * 0x01010101u;
-    diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0);
-
-    p1 = _mm_subs_epu8(p0, diff);
-  }
-
-  // Store results to dst.
-  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
-  dst_ptr += stride;
-
-  p1 = _mm_srli_si128(p1, 4);
-  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
-  dst_ptr += stride;
-
-  p1 = _mm_srli_si128(p1, 4);
-  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
-  dst_ptr += stride;
-
-  p1 = _mm_srli_si128(p1, 4);
-  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
-}
-
-void vp9_short_idct4x4llm_sse2(int16_t *input, int16_t *output, int pitch) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i eight = _mm_set1_epi16(8);
-  const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
-                                    (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
-                                    (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
-                                    (int16_t)cospi_8_64, (int16_t)cospi_24_64);
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const int half_pitch = pitch >> 1;
-  __m128i input0, input1, input2, input3;
-
-  // Rows
-  input0 = _mm_loadl_epi64((__m128i *)input);
-  input1 = _mm_loadl_epi64((__m128i *)(input + 4));
-  input2 = _mm_loadl_epi64((__m128i *)(input + 8));
-  input3 = _mm_loadl_epi64((__m128i *)(input + 12));
-
-  // Construct i3, i1, i3, i1, i2, i0, i2, i0
-  input0 = _mm_shufflelo_epi16(input0, 0xd8);
-  input1 = _mm_shufflelo_epi16(input1, 0xd8);
-  input2 = _mm_shufflelo_epi16(input2, 0xd8);
-  input3 = _mm_shufflelo_epi16(input3, 0xd8);
-
-  input0 = _mm_unpacklo_epi32(input0, input0);
-  input1 = _mm_unpacklo_epi32(input1, input1);
-  input2 = _mm_unpacklo_epi32(input2, input2);
-  input3 = _mm_unpacklo_epi32(input3, input3);
-
-  // Stage 1
-  input0 = _mm_madd_epi16(input0, cst);
-  input1 = _mm_madd_epi16(input1, cst);
-  input2 = _mm_madd_epi16(input2, cst);
-  input3 = _mm_madd_epi16(input3, cst);
-
-  input0 = _mm_add_epi32(input0, rounding);
-  input1 = _mm_add_epi32(input1, rounding);
-  input2 = _mm_add_epi32(input2, rounding);
-  input3 = _mm_add_epi32(input3, rounding);
-
-  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
-  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
-  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
-  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
-  // Stage 2
-  input0 = _mm_packs_epi32(input0, zero);
-  input1 = _mm_packs_epi32(input1, zero);
-  input2 = _mm_packs_epi32(input2, zero);
-  input3 = _mm_packs_epi32(input3, zero);
-
-  // Transpose
-  input1 = _mm_unpacklo_epi16(input0, input1);
-  input3 = _mm_unpacklo_epi16(input2, input3);
-  input0 = _mm_unpacklo_epi32(input1, input3);
-  input1 = _mm_unpackhi_epi32(input1, input3);
-
-  // Switch column2, column 3, and then, we got:
-  // input2: column1, column 0;  input3: column2, column 3.
-  input1 = _mm_shuffle_epi32(input1, 0x4e);
-  input2 = _mm_add_epi16(input0, input1);
-  input3 = _mm_sub_epi16(input0, input1);
-
-  // Columns
-  // Construct i3, i1, i3, i1, i2, i0, i2, i0
-  input0 = _mm_shufflelo_epi16(input2, 0xd8);
-  input1 = _mm_shufflehi_epi16(input2, 0xd8);
-  input2 = _mm_shufflehi_epi16(input3, 0xd8);
-  input3 = _mm_shufflelo_epi16(input3, 0xd8);
-
-  input0 = _mm_unpacklo_epi32(input0, input0);
-  input1 = _mm_unpackhi_epi32(input1, input1);
-  input2 = _mm_unpackhi_epi32(input2, input2);
-  input3 = _mm_unpacklo_epi32(input3, input3);
-
-  // Stage 1
-  input0 = _mm_madd_epi16(input0, cst);
-  input1 = _mm_madd_epi16(input1, cst);
-  input2 = _mm_madd_epi16(input2, cst);
-  input3 = _mm_madd_epi16(input3, cst);
-
-  input0 = _mm_add_epi32(input0, rounding);
-  input1 = _mm_add_epi32(input1, rounding);
-  input2 = _mm_add_epi32(input2, rounding);
-  input3 = _mm_add_epi32(input3, rounding);
-
-  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
-  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
-  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
-  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
-  // Stage 2
-  input0 = _mm_packs_epi32(input0, zero);
-  input1 = _mm_packs_epi32(input1, zero);
-  input2 = _mm_packs_epi32(input2, zero);
-  input3 = _mm_packs_epi32(input3, zero);
-
-  // Transpose
-  input1 = _mm_unpacklo_epi16(input0, input1);
-  input3 = _mm_unpacklo_epi16(input2, input3);
-  input0 = _mm_unpacklo_epi32(input1, input3);
-  input1 = _mm_unpackhi_epi32(input1, input3);
-
-  // Switch columns 2 and 3; we then have:
-  // input2: column 1, column 0;  input3: column 2, column 3.
-  input1 = _mm_shuffle_epi32(input1, 0x4e);
-  input2 = _mm_add_epi16(input0, input1);
-  input3 = _mm_sub_epi16(input0, input1);
-
-  // Final round and shift
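-  // (adding 8 before >> 4 gives round-to-nearest division by 16)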
-  input2 = _mm_add_epi16(input2, eight);
-  input3 = _mm_add_epi16(input3, eight);
-
-  input2 = _mm_srai_epi16(input2, 4);
-  input3 = _mm_srai_epi16(input3, 4);
-
-  // Store results
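-  // input2 carries rows 0 and 1; input3 carries row 3 in its low half and
-  // row 2 in its high half, hence the swapped offsets on the last stores.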
-  _mm_storel_epi64((__m128i *)output, input2);
-  input2 = _mm_srli_si128(input2, 8);
-  _mm_storel_epi64((__m128i *)(output + half_pitch), input2);
-
-  _mm_storel_epi64((__m128i *)(output + 3 * half_pitch), input3);
-  input3 = _mm_srli_si128(input3, 8);
-  _mm_storel_epi64((__m128i *)(output + 2 * half_pitch), input3);
-}
-
-void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i c1 = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
-                                    (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
-                                    (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
-                                    (int16_t)cospi_8_64, (int16_t)cospi_24_64);
-  const __m128i c2 = _mm_setr_epi16(1, 1, 1, 1, 1, -1, 1, -1);
-
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i in, temp;
-
-  // Load input data.
-  in = _mm_loadl_epi64((__m128i *)input);
-
-  // Construct i3, i1, i3, i1, i2, i0, i2, i0
-  in = _mm_shufflelo_epi16(in, 0xd8);
-  in = _mm_unpacklo_epi32(in, in);
-
-  // Stage 1
-  in = _mm_madd_epi16(in, c1);
-  in = _mm_add_epi32(in, rounding);
-  in = _mm_srai_epi32(in, DCT_CONST_BITS);
-  in = _mm_packs_epi32(in, zero);
-
-  // Stage 2
-  temp = _mm_shufflelo_epi16(in, 0x9c);
-  in = _mm_shufflelo_epi16(in, 0xc9);
-  in = _mm_unpacklo_epi64(temp, in);
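-  // With the stage-1 outputs s0..s3, in is now (low to high)
-  // s0, s3, s1, s2, s1, s2, s0, s3; the madd against c2 then forms
-  // s0+s3, s1+s2, s1-s2 and s0-s3 in a single instruction.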
-  in = _mm_madd_epi16(in, c2);
-  in = _mm_packs_epi32(in, zero);
-
-  // Store results
-  _mm_storel_epi64((__m128i *)output, in);
-}
-
-#endif
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -124,14 +124,14 @@
 
   if (mb->lossless) {
     assert(qindex == 0);
-    mb->inv_txm4x4_1      = vp9_short_inv_walsh4x4_1_x8;
-    mb->inv_txm4x4        = vp9_short_inv_walsh4x4_x8;
+    mb->inv_txm4x4_1      = vp9_short_iwalsh4x4_1;
+    mb->inv_txm4x4        = vp9_short_iwalsh4x4;
     mb->itxm_add          = vp9_dequant_idct_add_lossless_c;
     mb->itxm_add_y_block  = vp9_dequant_idct_add_y_block_lossless_c;
     mb->itxm_add_uv_block = vp9_dequant_idct_add_uv_block_lossless_c;
   } else {
-    mb->inv_txm4x4_1      = vp9_short_idct4x4llm_1;
-    mb->inv_txm4x4        = vp9_short_idct4x4llm;
+    mb->inv_txm4x4_1      = vp9_short_idct4x4_1;
+    mb->inv_txm4x4        = vp9_short_idct4x4;
     mb->itxm_add          = vp9_dequant_idct_add;
     mb->itxm_add_y_block  = vp9_dequant_idct_add_y_block;
     mb->itxm_add_uv_block = vp9_dequant_idct_add_uv_block;
--- a/vp9/decoder/vp9_dequantize.c
+++ b/vp9/decoder/vp9_dequantize.c
@@ -126,7 +126,7 @@
       input[i] *= dq[i];
 
    // the idct halves the pitch (>> 1)
-    vp9_short_idct4x4llm(input, output, 4 << 1);
+    vp9_short_idct4x4(input, output, 4 << 1);
 
     vpx_memset(input, 0, 32);
 
@@ -148,7 +148,7 @@
     input[i] *= dq[i];
 
  // the idct halves the pitch (>> 1)
-  vp9_short_idct4x4llm(input, output, 4 << 1);
+  vp9_short_idct4x4(input, output, 4 << 1);
   vpx_memset(input, 0, 32);
   vp9_add_residual_4x4(output, pred, pitch, dest, stride);
 }
@@ -163,7 +163,7 @@
     for (i = 0; i < 16; i++)
       input[i] *= dq[i];
 
-    vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
+    vp9_short_iwalsh4x4_c(input, output, 4 << 1);
 
     vpx_memset(input, 0, 32);
 
@@ -186,7 +186,7 @@
   for (i = 1; i < 16; i++)
     input[i] *= dq[i];
 
-  vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
+  vp9_short_iwalsh4x4_c(input, output, 4 << 1);
   vpx_memset(input, 0, 32);
   vp9_add_residual_4x4(output, pred, pitch, dest, stride);
 }
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -374,7 +374,7 @@
   }
 }
 
-void vp9_short_walsh4x4_x8_c(short *input, short *output, int pitch) {
+void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {
   int i;
   int a1, b1, c1, d1;
   short *ip = input;
@@ -414,9 +414,9 @@
   }
 }
 
-void vp9_short_walsh8x4_x8_c(short *input, short *output, int pitch) {
-  vp9_short_walsh4x4_x8_c(input,   output,    pitch);
-  vp9_short_walsh4x4_x8_c(input + 4, output + 16, pitch);
+void vp9_short_walsh8x4_c(short *input, short *output, int pitch) {
+  vp9_short_walsh4x4_c(input,   output,    pitch);
+  vp9_short_walsh4x4_c(input + 4, output + 16, pitch);
 }
 
 
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1217,10 +1217,10 @@
 
 static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
   if (lossless) {
-    cpi->mb.fwd_txm8x4            = vp9_short_walsh8x4_x8;
-    cpi->mb.fwd_txm4x4            = vp9_short_walsh4x4_x8;
-    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_inv_walsh4x4_1_x8;
-    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_inv_walsh4x4_x8;
+    cpi->mb.fwd_txm8x4            = vp9_short_walsh8x4;
+    cpi->mb.fwd_txm4x4            = vp9_short_walsh4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_iwalsh4x4_1;
+    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_iwalsh4x4;
     cpi->mb.optimize              = 0;
     cpi->common.filter_level      = 0;
     cpi->zbin_mode_boost_enabled  = FALSE;
@@ -1228,8 +1228,8 @@
   } else {
     cpi->mb.fwd_txm8x4            = vp9_short_fdct8x4;
     cpi->mb.fwd_txm4x4            = vp9_short_fdct4x4;
-    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_idct4x4llm_1;
-    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_idct4x4llm;
+    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_idct4x4_1;
+    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_idct4x4;
   }
 }
 
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -843,8 +843,8 @@
   cpi->mb.fwd_txm8x4    = vp9_short_fdct8x4;
   cpi->mb.fwd_txm4x4    = vp9_short_fdct4x4;
   if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) {
-    cpi->mb.fwd_txm8x4    = vp9_short_walsh8x4_x8;
-    cpi->mb.fwd_txm4x4    = vp9_short_walsh4x4_x8;
+    cpi->mb.fwd_txm8x4    = vp9_short_walsh8x4;
+    cpi->mb.fwd_txm4x4    = vp9_short_walsh4x4;
   }
 
   cpi->mb.quantize_b_4x4      = vp9_regular_quantize_b_4x4;
@@ -1217,11 +1217,11 @@
 
   cpi->oxcf.lossless = oxcf->lossless;
   if (cpi->oxcf.lossless) {
-    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_inv_walsh4x4_1_x8;
-    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_inv_walsh4x4_x8;
+    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_iwalsh4x4_1;
+    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_iwalsh4x4;
   } else {
-    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_idct4x4llm_1;
-    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_idct4x4llm;
+    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_idct4x4_1;
+    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_idct4x4;
   }
 
   cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -28,7 +28,7 @@
 VP9_COMMON_SRCS-yes += common/vp9_filter.h
 VP9_COMMON_SRCS-yes += common/vp9_findnearmv.c
 VP9_COMMON_SRCS-yes += common/generic/vp9_systemdependent.c
-VP9_COMMON_SRCS-yes += common/vp9_idctllm.c
+VP9_COMMON_SRCS-yes += common/vp9_idct.c
 VP9_COMMON_SRCS-yes += common/vp9_alloccommon.h
 VP9_COMMON_SRCS-yes += common/vp9_blockd.h
 VP9_COMMON_SRCS-yes += common/vp9_common.h
@@ -91,7 +91,7 @@
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_iwalsh_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_recon_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idctllm_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_iwalsh_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_sse2.asm
@@ -110,13 +110,13 @@
 VP9_COMMON_SRCS-$(HAVE_SSE3) += common/x86/vp9_mask_sse3.asm
 endif
 
-VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_idctllm_x86.c
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_idct_x86.c
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_x86.c
 ifeq ($(HAVE_SSE2),yes)
-vp9/common/x86/vp9_idctllm_x86.c.o: CFLAGS += -msse2
+vp9/common/x86/vp9_idct_x86.c.o: CFLAGS += -msse2
 vp9/common/x86/vp9_loopfilter_x86.c.o: CFLAGS += -msse2
 vp9/common/x86/vp9_sadmxn_x86.c.o: CFLAGS += -msse2
-vp9/common/x86/vp9_idctllm_x86.c.d: CFLAGS += -msse2
+vp9/common/x86/vp9_idct_x86.c.d: CFLAGS += -msse2
 vp9/common/x86/vp9_loopfilter_x86.c.d: CFLAGS += -msse2
 vp9/common/x86/vp9_sadmxn_x86.c.d: CFLAGS += -msse2
 endif