shithub: libvpx

Download patch

ref: 81c6041922266cc20cf2507365cfee3335eb83d4
parent: d6290c3b07301936a768f6e0a8455c20076eec17
author: Paul Wilkins <paulwilkins@google.com>
date: Fri Oct 5 07:16:46 EDT 2012

Fix SIMD unsafe use of floating point.

This commit fixes unsafe simd / floating point interactions arising
from the current hybrid and 16x16 transform implementation.
These led to a raft of bugs and issues when the project was
built using VS2008 for Win32 though they did not show up with
the unix builds.

Gerrit makes a meal out of presenting the fix but all I have actually
done is indent the body of each function that uses floating point by
one level and bracket with emms instructions using  the function
vp8_clear_system_state(). See below.

function () {
  vp8_clear_system_state();
  {
  ... function body
  }
  vp8_clear_system_state();
}

This is almost certainly over the top in terms of number of emms
instructions but is a temporary measure pending implementation of
integer variants of each function to replace the floating point.

Limited testing suggests that this fixes the problems that arose for
Win32 VS2008 when the hybrid or 16x16 transforms were enabled.

Change-Id: I7c9a72bd79315246ed880578dec51e2b7c178442

--- a/vp8/common/idctllm.c
+++ b/vp8/common/idctllm.c
@@ -24,6 +24,7 @@
  **************************************************************************/
 #include "vpx_ports/config.h"
 #include "vp8/common/idct.h"
+#include "vp8/common/systemdependent.h"
 
 #if CONFIG_HYBRIDTRANSFORM
 #include "vp8/common/blockd.h"
@@ -166,93 +167,72 @@
 #if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16
 void vp8_ihtllm_c(short *input, short *output, int pitch,
                   TX_TYPE tx_type, int tx_dim) {
-  int i, j, k;
-  float bufa[256], bufb[256]; // buffers are for floating-point test purpose
-                            // the implementation could be simplified in
-                            // conjunction with integer transform
 
-                            // further notice, since we are thinking to use one
-                            // function for both 4x4 and 8x8 transforms, the
-                            // temporary buffers are simply initialized with 64.
-  short *ip = input;
-  short *op = output;
-  int shortpitch = pitch >> 1;
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
+  {
+    int i, j, k;
+    float bufa[256], bufb[256]; // buffers are for floating-point test purpose
+                                // the implementation could be simplified in
+                                // conjunction with integer transform
 
-  float *pfa = &bufa[0];
-  float *pfb = &bufb[0];
+                                // further notice, since we are thinking to use
+                                // one function for both 4x4 and 8x8 transforms
+                                // the temporary buffers are simply initialized
+                                // with 64.
+    short *ip = input;
+    short *op = output;
+    int shortpitch = pitch >> 1;
 
-  // pointers to vertical and horizontal transforms
-  float *ptv, *pth;
+    float *pfa = &bufa[0];
+    float *pfb = &bufb[0];
 
-  // load and convert residual array into floating-point
-  for(j = 0; j < tx_dim; j++) {
-    for(i = 0; i < tx_dim; i++) {
-      pfa[i] = (float)ip[i];
-    }
-    pfa += tx_dim;
-    ip  += tx_dim;
-  }
+    // pointers to vertical and horizontal transforms
+    float *ptv, *pth;
 
-  // vertical transformation
-  pfa = &bufa[0];
-  pfb = &bufb[0];
-
-  switch(tx_type) {
-    case ADST_ADST :
-    case ADST_DCT  :
-      ptv = (tx_dim == 4) ? &iadst_4[0] :
-                            ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
-      break;
-
-    default :
-      ptv = (tx_dim == 4) ? &idct_4[0] :
-                            ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
-      break;
-  }
-
-  for(j = 0; j < tx_dim; j++) {
-    for(i = 0; i < tx_dim; i++) {
-      pfb[i] = 0 ;
-      for(k = 0; k < tx_dim; k++) {
-        pfb[i] += ptv[k] * pfa[(k * tx_dim)];
+    // load and convert residual array into floating-point
+    for(j = 0; j < tx_dim; j++) {
+      for(i = 0; i < tx_dim; i++) {
+        pfa[i] = (float)ip[i];
       }
-      pfa += 1;
+      pfa += tx_dim;
+      ip  += tx_dim;
     }
 
-    pfb += tx_dim;
-    ptv += tx_dim;
+    // vertical transformation
     pfa = &bufa[0];
-  }
+    pfb = &bufb[0];
 
-  // horizontal transformation
-  pfa = &bufa[0];
-  pfb = &bufb[0];
+    switch(tx_type) {
+      case ADST_ADST :
+      case ADST_DCT  :
+        ptv = (tx_dim == 4) ? &iadst_4[0] :
+                              ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
+        break;
 
-  switch(tx_type) {
-    case ADST_ADST :
-    case  DCT_ADST :
-      pth = (tx_dim == 4) ? &iadst_4[0] :
-                            ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
-      break;
+      default :
+        ptv = (tx_dim == 4) ? &idct_4[0] :
+                              ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
+        break;
+    }
 
-    default :
-      pth = (tx_dim == 4) ? &idct_4[0] :
-                            ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
-      break;
-  }
-
-  for(j = 0; j < tx_dim; j++) {
-    for(i = 0; i < tx_dim; i++) {
-      pfa[i] = 0;
-      for(k = 0; k < tx_dim; k++) {
-        pfa[i] += pfb[k] * pth[k];
+    for(j = 0; j < tx_dim; j++) {
+      for(i = 0; i < tx_dim; i++) {
+        pfb[i] = 0 ;
+        for(k = 0; k < tx_dim; k++) {
+          pfb[i] += ptv[k] * pfa[(k * tx_dim)];
+        }
+        pfa += 1;
       }
-      pth += tx_dim;
-     }
 
-    pfa += tx_dim;
-    pfb += tx_dim;
+      pfb += tx_dim;
+      ptv += tx_dim;
+      pfa = &bufa[0];
+    }
 
+    // horizontal transformation
+    pfa = &bufa[0];
+    pfb = &bufb[0];
+
     switch(tx_type) {
       case ADST_ADST :
       case  DCT_ADST :
@@ -265,21 +245,48 @@
                               ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
         break;
     }
-  }
 
-  // convert to short integer format and load BLOCKD buffer
-  op  = output;
-  pfa = &bufa[0];
+    for(j = 0; j < tx_dim; j++) {
+      for(i = 0; i < tx_dim; i++) {
+        pfa[i] = 0;
+        for(k = 0; k < tx_dim; k++) {
+          pfa[i] += pfb[k] * pth[k];
+        }
+        pth += tx_dim;
+       }
 
-  for(j = 0; j < tx_dim; j++) {
-    for(i = 0; i < tx_dim; i++) {
-      op[i] = (pfa[i] > 0 ) ? (short)( pfa[i] / 8 + 0.49) :
-                             -(short)( - pfa[i] / 8 + 0.49);
+      pfa += tx_dim;
+      pfb += tx_dim;
+
+      switch(tx_type) {
+        case ADST_ADST :
+        case  DCT_ADST :
+          pth = (tx_dim == 4) ? &iadst_4[0] :
+                                ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
+          break;
+
+        default :
+          pth = (tx_dim == 4) ? &idct_4[0] :
+                                ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
+          break;
+      }
     }
 
-    op  += shortpitch;
-    pfa += tx_dim;
+    // convert to short integer format and load BLOCKD buffer
+    op  = output;
+    pfa = &bufa[0];
+
+    for(j = 0; j < tx_dim; j++) {
+      for(i = 0; i < tx_dim; i++) {
+        op[i] = (pfa[i] > 0 ) ? (short)( pfa[i] / 8 + 0.49) :
+                               -(short)( - pfa[i] / 8 + 0.49);
+      }
+
+      op  += shortpitch;
+      pfa += tx_dim;
+    }
   }
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
 }
 #endif
 
@@ -776,25 +783,30 @@
 #if 0
 // Keep a really bad float version as reference for now.
 void vp8_short_idct16x16_c(short *input, short *output, int pitch) {
-  double x;
-  const int short_pitch = pitch >> 1;
-  int i, j, k, l;
-  for (l = 0; l < 16; ++l) {
-    for (k = 0; k < 16; ++k) {
-      double s = 0;
-      for (i = 0; i < 16; ++i) {
-        for (j = 0; j < 16; ++j) {
-          x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/32;
-          if (i != 0)
-            x *= sqrt(2.0);
-          if (j != 0)
-            x *= sqrt(2.0);
-          s += x;
+
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
+  {
+    double x;
+    const int short_pitch = pitch >> 1;
+    int i, j, k, l;
+    for (l = 0; l < 16; ++l) {
+      for (k = 0; k < 16; ++k) {
+        double s = 0;
+        for (i = 0; i < 16; ++i) {
+          for (j = 0; j < 16; ++j) {
+            x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/32;
+            if (i != 0)
+              x *= sqrt(2.0);
+            if (j != 0)
+              x *= sqrt(2.0);
+            s += x;
+          }
         }
+        output[k*short_pitch+l] = (short)round(s);
       }
-      output[k*short_pitch+l] = (short)round(s);
     }
   }
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
 }
 #endif
 
@@ -816,231 +828,246 @@
 
 
 static void butterfly_16x16_idct_1d(double input[16], double output[16]) {
-  double step[16];
-  double intermediate[16];
-  double temp1, temp2;
 
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
+  {
+    double step[16];
+    double intermediate[16];
+    double temp1, temp2;
 
-  // step 1 and 2
-  step[ 0] = input[0] + input[8];
-  step[ 1] = input[0] - input[8];
 
-  temp1 = input[4]*C12;
-  temp2 = input[12]*C4;
+    // step 1 and 2
+    step[ 0] = input[0] + input[8];
+    step[ 1] = input[0] - input[8];
 
-  temp1 -= temp2;
-  temp1 *= C8;
+    temp1 = input[4]*C12;
+    temp2 = input[12]*C4;
 
-  step[ 2] = 2*(temp1);
+    temp1 -= temp2;
+    temp1 *= C8;
 
-  temp1 = input[4]*C4;
-  temp2 = input[12]*C12;
-  temp1 += temp2;
-  temp1 = (temp1);
-  temp1 *= C8;
-  step[ 3] = 2*(temp1);
+    step[ 2] = 2*(temp1);
 
-  temp1 = input[2]*C8;
-  temp1 = 2*(temp1);
-  temp2 = input[6] + input[10];
+    temp1 = input[4]*C4;
+    temp2 = input[12]*C12;
+    temp1 += temp2;
+    temp1 = (temp1);
+    temp1 *= C8;
+    step[ 3] = 2*(temp1);
 
-  step[ 4] = temp1 + temp2;
-  step[ 5] = temp1 - temp2;
+    temp1 = input[2]*C8;
+    temp1 = 2*(temp1);
+    temp2 = input[6] + input[10];
 
-  temp1 = input[14]*C8;
-  temp1 = 2*(temp1);
-  temp2 = input[6] - input[10];
+    step[ 4] = temp1 + temp2;
+    step[ 5] = temp1 - temp2;
 
-  step[ 6] = temp2 - temp1;
-  step[ 7] = temp2 + temp1;
+    temp1 = input[14]*C8;
+    temp1 = 2*(temp1);
+    temp2 = input[6] - input[10];
 
-  // for odd input
-  temp1 = input[3]*C12;
-  temp2 = input[13]*C4;
-  temp1 += temp2;
-  temp1 = (temp1);
-  temp1 *= C8;
-  intermediate[ 8] = 2*(temp1);
+    step[ 6] = temp2 - temp1;
+    step[ 7] = temp2 + temp1;
 
-  temp1 = input[3]*C4;
-  temp2 = input[13]*C12;
-  temp2 -= temp1;
-  temp2 = (temp2);
-  temp2 *= C8;
-  intermediate[ 9] = 2*(temp2);
+    // for odd input
+    temp1 = input[3]*C12;
+    temp2 = input[13]*C4;
+    temp1 += temp2;
+    temp1 = (temp1);
+    temp1 *= C8;
+    intermediate[ 8] = 2*(temp1);
 
-  intermediate[10] = 2*(input[9]*C8);
-  intermediate[11] = input[15] - input[1];
-  intermediate[12] = input[15] + input[1];
-  intermediate[13] = 2*((input[7]*C8));
+    temp1 = input[3]*C4;
+    temp2 = input[13]*C12;
+    temp2 -= temp1;
+    temp2 = (temp2);
+    temp2 *= C8;
+    intermediate[ 9] = 2*(temp2);
 
-  temp1 = input[11]*C12;
-  temp2 = input[5]*C4;
-  temp2 -= temp1;
-  temp2 = (temp2);
-  temp2 *= C8;
-  intermediate[14] = 2*(temp2);
+    intermediate[10] = 2*(input[9]*C8);
+    intermediate[11] = input[15] - input[1];
+    intermediate[12] = input[15] + input[1];
+    intermediate[13] = 2*((input[7]*C8));
 
-  temp1 = input[11]*C4;
-  temp2 = input[5]*C12;
-  temp1 += temp2;
-  temp1 = (temp1);
-  temp1 *= C8;
-  intermediate[15] = 2*(temp1);
+    temp1 = input[11]*C12;
+    temp2 = input[5]*C4;
+    temp2 -= temp1;
+    temp2 = (temp2);
+    temp2 *= C8;
+    intermediate[14] = 2*(temp2);
 
-  step[ 8] = intermediate[ 8] + intermediate[14];
-  step[ 9] = intermediate[ 9] + intermediate[15];
-  step[10] = intermediate[10] + intermediate[11];
-  step[11] = intermediate[10] - intermediate[11];
-  step[12] = intermediate[12] + intermediate[13];
-  step[13] = intermediate[12] - intermediate[13];
-  step[14] = intermediate[ 8] - intermediate[14];
-  step[15] = intermediate[ 9] - intermediate[15];
+    temp1 = input[11]*C4;
+    temp2 = input[5]*C12;
+    temp1 += temp2;
+    temp1 = (temp1);
+    temp1 *= C8;
+    intermediate[15] = 2*(temp1);
 
-  // step 3
-  output[0] = step[ 0] + step[ 3];
-  output[1] = step[ 1] + step[ 2];
-  output[2] = step[ 1] - step[ 2];
-  output[3] = step[ 0] - step[ 3];
+    step[ 8] = intermediate[ 8] + intermediate[14];
+    step[ 9] = intermediate[ 9] + intermediate[15];
+    step[10] = intermediate[10] + intermediate[11];
+    step[11] = intermediate[10] - intermediate[11];
+    step[12] = intermediate[12] + intermediate[13];
+    step[13] = intermediate[12] - intermediate[13];
+    step[14] = intermediate[ 8] - intermediate[14];
+    step[15] = intermediate[ 9] - intermediate[15];
 
-  temp1 = step[ 4]*C14;
-  temp2 = step[ 7]*C2;
-  temp1 -= temp2;
-  output[4] =  (temp1);
+    // step 3
+    output[0] = step[ 0] + step[ 3];
+    output[1] = step[ 1] + step[ 2];
+    output[2] = step[ 1] - step[ 2];
+    output[3] = step[ 0] - step[ 3];
 
-  temp1 = step[ 4]*C2;
-  temp2 = step[ 7]*C14;
-  temp1 += temp2;
-  output[7] =  (temp1);
+    temp1 = step[ 4]*C14;
+    temp2 = step[ 7]*C2;
+    temp1 -= temp2;
+    output[4] =  (temp1);
 
-  temp1 = step[ 5]*C10;
-  temp2 = step[ 6]*C6;
-  temp1 -= temp2;
-  output[5] =  (temp1);
+    temp1 = step[ 4]*C2;
+    temp2 = step[ 7]*C14;
+    temp1 += temp2;
+    output[7] =  (temp1);
 
-  temp1 = step[ 5]*C6;
-  temp2 = step[ 6]*C10;
-  temp1 += temp2;
-  output[6] =  (temp1);
+    temp1 = step[ 5]*C10;
+    temp2 = step[ 6]*C6;
+    temp1 -= temp2;
+    output[5] =  (temp1);
 
-  output[8] = step[ 8] + step[11];
-  output[9] = step[ 9] + step[10];
-  output[10] = step[ 9] - step[10];
-  output[11] = step[ 8] - step[11];
-  output[12] = step[12] + step[15];
-  output[13] = step[13] + step[14];
-  output[14] = step[13] - step[14];
-  output[15] = step[12] - step[15];
+    temp1 = step[ 5]*C6;
+    temp2 = step[ 6]*C10;
+    temp1 += temp2;
+    output[6] =  (temp1);
 
-  // output 4
-  step[ 0] = output[0] + output[7];
-  step[ 1] = output[1] + output[6];
-  step[ 2] = output[2] + output[5];
-  step[ 3] = output[3] + output[4];
-  step[ 4] = output[3] - output[4];
-  step[ 5] = output[2] - output[5];
-  step[ 6] = output[1] - output[6];
-  step[ 7] = output[0] - output[7];
+    output[8] = step[ 8] + step[11];
+    output[9] = step[ 9] + step[10];
+    output[10] = step[ 9] - step[10];
+    output[11] = step[ 8] - step[11];
+    output[12] = step[12] + step[15];
+    output[13] = step[13] + step[14];
+    output[14] = step[13] - step[14];
+    output[15] = step[12] - step[15];
 
-  temp1 = output[8]*C7;
-  temp2 = output[15]*C9;
-  temp1 -= temp2;
-  step[ 8] = (temp1);
+    // output 4
+    step[ 0] = output[0] + output[7];
+    step[ 1] = output[1] + output[6];
+    step[ 2] = output[2] + output[5];
+    step[ 3] = output[3] + output[4];
+    step[ 4] = output[3] - output[4];
+    step[ 5] = output[2] - output[5];
+    step[ 6] = output[1] - output[6];
+    step[ 7] = output[0] - output[7];
 
-  temp1 = output[9]*C11;
-  temp2 = output[14]*C5;
-  temp1 += temp2;
-  step[ 9] = (temp1);
+    temp1 = output[8]*C7;
+    temp2 = output[15]*C9;
+    temp1 -= temp2;
+    step[ 8] = (temp1);
 
-  temp1 = output[10]*C3;
-  temp2 = output[13]*C13;
-  temp1 -= temp2;
-  step[10] = (temp1);
+    temp1 = output[9]*C11;
+    temp2 = output[14]*C5;
+    temp1 += temp2;
+    step[ 9] = (temp1);
 
-  temp1 = output[11]*C15;
-  temp2 = output[12]*C1;
-  temp1 += temp2;
-  step[11] = (temp1);
+    temp1 = output[10]*C3;
+    temp2 = output[13]*C13;
+    temp1 -= temp2;
+    step[10] = (temp1);
 
-  temp1 = output[11]*C1;
-  temp2 = output[12]*C15;
-  temp2 -= temp1;
-  step[12] = (temp2);
+    temp1 = output[11]*C15;
+    temp2 = output[12]*C1;
+    temp1 += temp2;
+    step[11] = (temp1);
 
-  temp1 = output[10]*C13;
-  temp2 = output[13]*C3;
-  temp1 += temp2;
-  step[13] = (temp1);
+    temp1 = output[11]*C1;
+    temp2 = output[12]*C15;
+    temp2 -= temp1;
+    step[12] = (temp2);
 
-  temp1 = output[9]*C5;
-  temp2 = output[14]*C11;
-  temp2 -= temp1;
-  step[14] = (temp2);
+    temp1 = output[10]*C13;
+    temp2 = output[13]*C3;
+    temp1 += temp2;
+    step[13] = (temp1);
 
-  temp1 = output[8]*C9;
-  temp2 = output[15]*C7;
-  temp1 += temp2;
-  step[15] = (temp1);
+    temp1 = output[9]*C5;
+    temp2 = output[14]*C11;
+    temp2 -= temp1;
+    step[14] = (temp2);
 
-  // step 5
-  output[0] = (step[0] + step[15]);
-  output[1] = (step[1] + step[14]);
-  output[2] = (step[2] + step[13]);
-  output[3] = (step[3] + step[12]);
-  output[4] = (step[4] + step[11]);
-  output[5] = (step[5] + step[10]);
-  output[6] = (step[6] + step[ 9]);
-  output[7] = (step[7] + step[ 8]);
+    temp1 = output[8]*C9;
+    temp2 = output[15]*C7;
+    temp1 += temp2;
+    step[15] = (temp1);
 
-  output[15] = (step[0] - step[15]);
-  output[14] = (step[1] - step[14]);
-  output[13] = (step[2] - step[13]);
-  output[12] = (step[3] - step[12]);
-  output[11] = (step[4] - step[11]);
-  output[10] = (step[5] - step[10]);
-  output[9] = (step[6] - step[ 9]);
-  output[8] = (step[7] - step[ 8]);
+    // step 5
+    output[0] = (step[0] + step[15]);
+    output[1] = (step[1] + step[14]);
+    output[2] = (step[2] + step[13]);
+    output[3] = (step[3] + step[12]);
+    output[4] = (step[4] + step[11]);
+    output[5] = (step[5] + step[10]);
+    output[6] = (step[6] + step[ 9]);
+    output[7] = (step[7] + step[ 8]);
+
+    output[15] = (step[0] - step[15]);
+    output[14] = (step[1] - step[14]);
+    output[13] = (step[2] - step[13]);
+    output[12] = (step[3] - step[12]);
+    output[11] = (step[4] - step[11]);
+    output[10] = (step[5] - step[10]);
+    output[9] = (step[6] - step[ 9]);
+    output[8] = (step[7] - step[ 8]);
+  }
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
 }
 
 // Remove once an int version of iDCT is written
 #if 0
 void reference_16x16_idct_1d(double input[16], double output[16]) {
-  const double kPi = 3.141592653589793238462643383279502884;
-  const double kSqrt2 = 1.414213562373095048801688724209698;
-  for (int k = 0; k < 16; k++) {
-    output[k] = 0.0;
-    for (int n = 0; n < 16; n++) {
-      output[k] += input[n]*cos(kPi*(2*k+1)*n/32.0);
-      if (n == 0)
-        output[k] = output[k]/kSqrt2;
+
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
+  {
+    const double kPi = 3.141592653589793238462643383279502884;
+    const double kSqrt2 = 1.414213562373095048801688724209698;
+    for (int k = 0; k < 16; k++) {
+      output[k] = 0.0;
+      for (int n = 0; n < 16; n++) {
+        output[k] += input[n]*cos(kPi*(2*k+1)*n/32.0);
+        if (n == 0)
+          output[k] = output[k]/kSqrt2;
+      }
     }
   }
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
 }
 #endif
 
 void vp8_short_idct16x16_c(short *input, short *output, int pitch) {
-  double out[16*16], out2[16*16];
-  const int short_pitch = pitch >> 1;
-  int i, j;
-    // First transform rows
-  for (i = 0; i < 16; ++i) {
-    double temp_in[16], temp_out[16];
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = input[j + i*short_pitch];
-    butterfly_16x16_idct_1d(temp_in, temp_out);
-    for (j = 0; j < 16; ++j)
-      out[j + i*16] = temp_out[j];
+
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
+  {
+    double out[16*16], out2[16*16];
+    const int short_pitch = pitch >> 1;
+    int i, j;
+      // First transform rows
+    for (i = 0; i < 16; ++i) {
+      double temp_in[16], temp_out[16];
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = input[j + i*short_pitch];
+      butterfly_16x16_idct_1d(temp_in, temp_out);
+      for (j = 0; j < 16; ++j)
+        out[j + i*16] = temp_out[j];
+    }
+    // Then transform columns
+    for (i = 0; i < 16; ++i) {
+      double temp_in[16], temp_out[16];
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = out[j*16 + i];
+      butterfly_16x16_idct_1d(temp_in, temp_out);
+      for (j = 0; j < 16; ++j)
+        out2[j*16 + i] = temp_out[j];
+    }
+    for (i = 0; i < 16*16; ++i)
+      output[i] = round(out2[i]/128);
   }
-  // Then transform columns
-  for (i = 0; i < 16; ++i) {
-    double temp_in[16], temp_out[16];
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = out[j*16 + i];
-    butterfly_16x16_idct_1d(temp_in, temp_out);
-    for (j = 0; j < 16; ++j)
-      out2[j*16 + i] = temp_out[j];
-  }
-  for (i = 0; i < 16*16; ++i)
-    output[i] = round(out2[i]/128);
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
 }
 #endif
--- a/vp8/encoder/dct.c
+++ b/vp8/encoder/dct.c
@@ -12,6 +12,7 @@
 #include <math.h>
 #include "vpx_ports/config.h"
 #include "vp8/common/idct.h"
+#include "vp8/common/systemdependent.h"
 
 #if CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM16X16
 
@@ -402,87 +403,64 @@
 #if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16
 void vp8_fht_c(short *input, short *output, int pitch,
                TX_TYPE tx_type, int tx_dim) {
-  int i, j, k;
-  float bufa[256], bufb[256]; // buffers are for floating-point test purpose
-                             // the implementation could be simplified in
-                             // conjunction with integer transform
-  short *ip = input;
-  short *op = output;
 
-  float *pfa = &bufa[0];
-  float *pfb = &bufb[0];
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
+  {
+    int i, j, k;
+    float bufa[256], bufb[256]; // buffers are for floating-point test purpose
+                               // the implementation could be simplified in
+                               // conjunction with integer transform
+    short *ip = input;
+    short *op = output;
 
-  // pointers to vertical and horizontal transforms
-  float *ptv, *pth;
+    float *pfa = &bufa[0];
+    float *pfb = &bufb[0];
 
-  // load and convert residual array into floating-point
-  for(j = 0; j < tx_dim; j++) {
-    for(i = 0; i < tx_dim; i++) {
-      pfa[i] = (float)ip[i];
-    }
-    pfa += tx_dim;
-    ip  += pitch / 2;
-  }
+    // pointers to vertical and horizontal transforms
+    float *ptv, *pth;
 
-  // vertical transformation
-  pfa = &bufa[0];
-  pfb = &bufb[0];
-
-  switch(tx_type) {
-    case ADST_ADST :
-    case ADST_DCT  :
-      ptv = (tx_dim == 4) ? &adst_4[0] :
-                            ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
-      break;
-
-    default :
-      ptv = (tx_dim == 4) ? &dct_4[0] :
-                            ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
-      break;
-  }
-
-  for(j = 0; j < tx_dim; j++) {
-    for(i = 0; i < tx_dim; i++) {
-      pfb[i] = 0;
-      for(k = 0; k < tx_dim; k++) {
-        pfb[i] += ptv[k] * pfa[(k * tx_dim)];
+    // load and convert residual array into floating-point
+    for(j = 0; j < tx_dim; j++) {
+      for(i = 0; i < tx_dim; i++) {
+        pfa[i] = (float)ip[i];
       }
-      pfa += 1;
+      pfa += tx_dim;
+      ip  += pitch / 2;
     }
-    pfb += tx_dim;
-    ptv += tx_dim;
+
+    // vertical transformation
     pfa = &bufa[0];
-  }
+    pfb = &bufb[0];
 
-  // horizontal transformation
-  pfa = &bufa[0];
-  pfb = &bufb[0];
+    switch(tx_type) {
+      case ADST_ADST :
+      case ADST_DCT  :
+        ptv = (tx_dim == 4) ? &adst_4[0] :
+                              ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
+        break;
 
-  switch(tx_type) {
-    case ADST_ADST :
-    case  DCT_ADST :
-      pth = (tx_dim == 4) ? &adst_4[0] :
-                            ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
-      break;
+      default :
+        ptv = (tx_dim == 4) ? &dct_4[0] :
+                              ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
+        break;
+    }
 
-    default :
-      pth = (tx_dim == 4) ? &dct_4[0] :
-                            ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
-      break;
-  }
-
-  for(j = 0; j < tx_dim; j++) {
-    for(i = 0; i < tx_dim; i++) {
-      pfa[i] = 0;
-      for(k = 0; k < tx_dim; k++) {
-        pfa[i] += pfb[k] * pth[k];
+    for(j = 0; j < tx_dim; j++) {
+      for(i = 0; i < tx_dim; i++) {
+        pfb[i] = 0;
+        for(k = 0; k < tx_dim; k++) {
+          pfb[i] += ptv[k] * pfa[(k * tx_dim)];
+        }
+        pfa += 1;
       }
-      pth += tx_dim;
+      pfb += tx_dim;
+      ptv += tx_dim;
+      pfa = &bufa[0];
     }
 
-    pfa += tx_dim;
-    pfb += tx_dim;
-    // pth -= tx_dim * tx_dim;
+    // horizontal transformation
+    pfa = &bufa[0];
+    pfb = &bufb[0];
 
     switch(tx_type) {
       case ADST_ADST :
@@ -496,20 +474,48 @@
                               ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
         break;
     }
-  }
 
-  // convert to short integer format and load BLOCKD buffer
-  op  = output ;
-  pfa = &bufa[0] ;
+    for(j = 0; j < tx_dim; j++) {
+      for(i = 0; i < tx_dim; i++) {
+        pfa[i] = 0;
+        for(k = 0; k < tx_dim; k++) {
+          pfa[i] += pfb[k] * pth[k];
+        }
+        pth += tx_dim;
+      }
 
-  for(j = 0; j < tx_dim; j++) {
-    for(i = 0; i < tx_dim; i++) {
-      op[i] = (pfa[i] > 0 ) ? (short)( 8 * pfa[i] + 0.49) :
-                                   -(short)(- 8 * pfa[i] + 0.49);
+      pfa += tx_dim;
+      pfb += tx_dim;
+      // pth -= tx_dim * tx_dim;
+
+      switch(tx_type) {
+        case ADST_ADST :
+        case  DCT_ADST :
+          pth = (tx_dim == 4) ? &adst_4[0] :
+                                ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
+          break;
+
+        default :
+          pth = (tx_dim == 4) ? &dct_4[0] :
+                                ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
+          break;
+      }
     }
-    op  += tx_dim;
-    pfa += tx_dim;
+
+    // convert to short integer format and load BLOCKD buffer
+    op  = output ;
+    pfa = &bufa[0] ;
+
+    for(j = 0; j < tx_dim; j++) {
+      for(i = 0; i < tx_dim; i++) {
+        op[i] = (pfa[i] > 0 ) ? (short)( 8 * pfa[i] + 0.49) :
+                                     -(short)(- 8 * pfa[i] + 0.49);
+      }
+      op  += tx_dim;
+      pfa += tx_dim;
+    }
   }
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
 }
 #endif
 
@@ -705,162 +711,168 @@
 static const double C15 = 0.098017140329561;
 
 static void dct16x16_1d(double input[16], double output[16]) {
-  double step[16];
-  double intermediate[16];
-  double temp1, temp2;
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
+  {
+    double step[16];
+    double intermediate[16];
+    double temp1, temp2;
 
-  // step 1
-  step[ 0] = input[0] + input[15];
-  step[ 1] = input[1] + input[14];
-  step[ 2] = input[2] + input[13];
-  step[ 3] = input[3] + input[12];
-  step[ 4] = input[4] + input[11];
-  step[ 5] = input[5] + input[10];
-  step[ 6] = input[6] + input[ 9];
-  step[ 7] = input[7] + input[ 8];
-  step[ 8] = input[7] - input[ 8];
-  step[ 9] = input[6] - input[ 9];
-  step[10] = input[5] - input[10];
-  step[11] = input[4] - input[11];
-  step[12] = input[3] - input[12];
-  step[13] = input[2] - input[13];
-  step[14] = input[1] - input[14];
-  step[15] = input[0] - input[15];
+    // step 1
+    step[ 0] = input[0] + input[15];
+    step[ 1] = input[1] + input[14];
+    step[ 2] = input[2] + input[13];
+    step[ 3] = input[3] + input[12];
+    step[ 4] = input[4] + input[11];
+    step[ 5] = input[5] + input[10];
+    step[ 6] = input[6] + input[ 9];
+    step[ 7] = input[7] + input[ 8];
+    step[ 8] = input[7] - input[ 8];
+    step[ 9] = input[6] - input[ 9];
+    step[10] = input[5] - input[10];
+    step[11] = input[4] - input[11];
+    step[12] = input[3] - input[12];
+    step[13] = input[2] - input[13];
+    step[14] = input[1] - input[14];
+    step[15] = input[0] - input[15];
 
-  // step 2
-  output[0] = step[0] + step[7];
-  output[1] = step[1] + step[6];
-  output[2] = step[2] + step[5];
-  output[3] = step[3] + step[4];
-  output[4] = step[3] - step[4];
-  output[5] = step[2] - step[5];
-  output[6] = step[1] - step[6];
-  output[7] = step[0] - step[7];
+    // step 2
+    output[0] = step[0] + step[7];
+    output[1] = step[1] + step[6];
+    output[2] = step[2] + step[5];
+    output[3] = step[3] + step[4];
+    output[4] = step[3] - step[4];
+    output[5] = step[2] - step[5];
+    output[6] = step[1] - step[6];
+    output[7] = step[0] - step[7];
 
-  temp1 = step[ 8]*C7;
-  temp2 = step[15]*C9;
-  output[ 8] = temp1 + temp2;
+    temp1 = step[ 8]*C7;
+    temp2 = step[15]*C9;
+    output[ 8] = temp1 + temp2;
 
-  temp1 = step[ 9]*C11;
-  temp2 = step[14]*C5;
-  output[ 9] = temp1 - temp2;
+    temp1 = step[ 9]*C11;
+    temp2 = step[14]*C5;
+    output[ 9] = temp1 - temp2;
 
-  temp1 = step[10]*C3;
-  temp2 = step[13]*C13;
-  output[10] = temp1 + temp2;
+    temp1 = step[10]*C3;
+    temp2 = step[13]*C13;
+    output[10] = temp1 + temp2;
 
-  temp1 = step[11]*C15;
-  temp2 = step[12]*C1;
-  output[11] = temp1 - temp2;
+    temp1 = step[11]*C15;
+    temp2 = step[12]*C1;
+    output[11] = temp1 - temp2;
 
-  temp1 = step[11]*C1;
-  temp2 = step[12]*C15;
-  output[12] = temp2 + temp1;
+    temp1 = step[11]*C1;
+    temp2 = step[12]*C15;
+    output[12] = temp2 + temp1;
 
-  temp1 = step[10]*C13;
-  temp2 = step[13]*C3;
-  output[13] = temp2 - temp1;
+    temp1 = step[10]*C13;
+    temp2 = step[13]*C3;
+    output[13] = temp2 - temp1;
 
-  temp1 = step[ 9]*C5;
-  temp2 = step[14]*C11;
-  output[14] = temp2 + temp1;
+    temp1 = step[ 9]*C5;
+    temp2 = step[14]*C11;
+    output[14] = temp2 + temp1;
 
-  temp1 = step[ 8]*C9;
-  temp2 = step[15]*C7;
-  output[15] = temp2 - temp1;
+    temp1 = step[ 8]*C9;
+    temp2 = step[15]*C7;
+    output[15] = temp2 - temp1;
 
-  // step 3
-  step[ 0] = output[0] + output[3];
-  step[ 1] = output[1] + output[2];
-  step[ 2] = output[1] - output[2];
-  step[ 3] = output[0] - output[3];
+    // step 3
+    step[ 0] = output[0] + output[3];
+    step[ 1] = output[1] + output[2];
+    step[ 2] = output[1] - output[2];
+    step[ 3] = output[0] - output[3];
 
-  temp1 = output[4]*C14;
-  temp2 = output[7]*C2;
-  step[ 4] = temp1 + temp2;
+    temp1 = output[4]*C14;
+    temp2 = output[7]*C2;
+    step[ 4] = temp1 + temp2;
 
-  temp1 = output[5]*C10;
-  temp2 = output[6]*C6;
-  step[ 5] = temp1 + temp2;
+    temp1 = output[5]*C10;
+    temp2 = output[6]*C6;
+    step[ 5] = temp1 + temp2;
 
-  temp1 = output[5]*C6;
-  temp2 = output[6]*C10;
-  step[ 6] = temp2 - temp1;
+    temp1 = output[5]*C6;
+    temp2 = output[6]*C10;
+    step[ 6] = temp2 - temp1;
 
-  temp1 = output[4]*C2;
-  temp2 = output[7]*C14;
-  step[ 7] = temp2 - temp1;
+    temp1 = output[4]*C2;
+    temp2 = output[7]*C14;
+    step[ 7] = temp2 - temp1;
 
-  step[ 8] = output[ 8] + output[11];
-  step[ 9] = output[ 9] + output[10];
-  step[10] = output[ 9] - output[10];
-  step[11] = output[ 8] - output[11];
+    step[ 8] = output[ 8] + output[11];
+    step[ 9] = output[ 9] + output[10];
+    step[10] = output[ 9] - output[10];
+    step[11] = output[ 8] - output[11];
 
-  step[12] = output[12] + output[15];
-  step[13] = output[13] + output[14];
-  step[14] = output[13] - output[14];
-  step[15] = output[12] - output[15];
+    step[12] = output[12] + output[15];
+    step[13] = output[13] + output[14];
+    step[14] = output[13] - output[14];
+    step[15] = output[12] - output[15];
 
-  // step 4
-  output[ 0] = (step[ 0] + step[ 1]);
-  output[ 8] = (step[ 0] - step[ 1]);
+    // step 4
+    output[ 0] = (step[ 0] + step[ 1]);
+    output[ 8] = (step[ 0] - step[ 1]);
 
-  temp1 = step[2]*C12;
-  temp2 = step[3]*C4;
-  temp1 = temp1 + temp2;
-  output[ 4] = 2*(temp1*C8);
+    temp1 = step[2]*C12;
+    temp2 = step[3]*C4;
+    temp1 = temp1 + temp2;
+    output[ 4] = 2*(temp1*C8);
 
-  temp1 = step[2]*C4;
-  temp2 = step[3]*C12;
-  temp1 = temp2 - temp1;
-  output[12] = 2*(temp1*C8);
+    temp1 = step[2]*C4;
+    temp2 = step[3]*C12;
+    temp1 = temp2 - temp1;
+    output[12] = 2*(temp1*C8);
 
-  output[ 2] = 2*((step[4] + step[ 5])*C8);
-  output[14] = 2*((step[7] - step[ 6])*C8);
+    output[ 2] = 2*((step[4] + step[ 5])*C8);
+    output[14] = 2*((step[7] - step[ 6])*C8);
 
-  temp1 = step[4] - step[5];
-  temp2 = step[6] + step[7];
-  output[ 6] = (temp1 + temp2);
-  output[10] = (temp1 - temp2);
+    temp1 = step[4] - step[5];
+    temp2 = step[6] + step[7];
+    output[ 6] = (temp1 + temp2);
+    output[10] = (temp1 - temp2);
 
-  intermediate[8] = step[8] + step[14];
-  intermediate[9] = step[9] + step[15];
+    intermediate[8] = step[8] + step[14];
+    intermediate[9] = step[9] + step[15];
 
-  temp1 = intermediate[8]*C12;
-  temp2 = intermediate[9]*C4;
-  temp1 = temp1 - temp2;
-  output[3] = 2*(temp1*C8);
+    temp1 = intermediate[8]*C12;
+    temp2 = intermediate[9]*C4;
+    temp1 = temp1 - temp2;
+    output[3] = 2*(temp1*C8);
 
-  temp1 = intermediate[8]*C4;
-  temp2 = intermediate[9]*C12;
-  temp1 = temp2 + temp1;
-  output[13] = 2*(temp1*C8);
+    temp1 = intermediate[8]*C4;
+    temp2 = intermediate[9]*C12;
+    temp1 = temp2 + temp1;
+    output[13] = 2*(temp1*C8);
 
-  output[ 9] = 2*((step[10] + step[11])*C8);
+    output[ 9] = 2*((step[10] + step[11])*C8);
 
-  intermediate[11] = step[10] - step[11];
-  intermediate[12] = step[12] + step[13];
-  intermediate[13] = step[12] - step[13];
-  intermediate[14] = step[ 8] - step[14];
-  intermediate[15] = step[ 9] - step[15];
+    intermediate[11] = step[10] - step[11];
+    intermediate[12] = step[12] + step[13];
+    intermediate[13] = step[12] - step[13];
+    intermediate[14] = step[ 8] - step[14];
+    intermediate[15] = step[ 9] - step[15];
 
-  output[15] = (intermediate[11] + intermediate[12]);
-  output[ 1] = -(intermediate[11] - intermediate[12]);
+    output[15] = (intermediate[11] + intermediate[12]);
+    output[ 1] = -(intermediate[11] - intermediate[12]);
 
-  output[ 7] = 2*(intermediate[13]*C8);
+    output[ 7] = 2*(intermediate[13]*C8);
 
-  temp1 = intermediate[14]*C12;
-  temp2 = intermediate[15]*C4;
-  temp1 = temp1 - temp2;
-  output[11] = -2*(temp1*C8);
+    temp1 = intermediate[14]*C12;
+    temp2 = intermediate[15]*C4;
+    temp1 = temp1 - temp2;
+    output[11] = -2*(temp1*C8);
 
-  temp1 = intermediate[14]*C4;
-  temp2 = intermediate[15]*C12;
-  temp1 = temp2 + temp1;
-  output[ 5] = 2*(temp1*C8);
+    temp1 = intermediate[14]*C4;
+    temp2 = intermediate[15]*C12;
+    temp1 = temp2 + temp1;
+    output[ 5] = 2*(temp1*C8);
+  }
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
 }
 
 void vp8_short_fdct16x16_c(short *input, short *out, int pitch) {
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
+  {
     int shortpitch = pitch >> 1;
     int i, j;
     double output[256];
@@ -885,5 +897,7 @@
     // Scale by some magic number
     for (i = 0; i < 256; i++)
         out[i] = (short)round(output[i]/2);
+  }
+  vp8_clear_system_state(); // Make it simd safe : __asm emms;
 }
 #endif