shithub: libvpx

Download patch

ref: 714aa9f3c072624186df161589bacbb778369312
parent: b715e371c05324c84b3a58ca19f5348caa2ff695
author: Jim Bankoski <jimbankoski@google.com>
date: Thu Feb 28 03:32:14 EST 2013

This commit converts all SAD result pointers to uint32_t

The sse4_1 code used uint16_t for returning SADs, but a 16-bit value
cannot hold the SAD of a 32x32 or 64x64 block. This change fixes the
assembly to write 32-bit results and also re-enables sse4_1 on Linux.

Change-Id: I5ce7288d581db870a148e5f7c5092826f59edd81

--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -997,17 +997,6 @@
 #error "not x32"
 #endif
 EOF
-        soft_enable runtime_cpu_detect
-        soft_enable mmx
-        soft_enable sse
-        soft_enable sse2
-        soft_enable sse3
-        soft_enable ssse3
-        if enabled gcc && ! disabled sse4_1 && ! check_cflags -msse4; then
-            RTCD_OPTIONS="${RTCD_OPTIONS}--disable-sse4_1 "
-        else
-            soft_enable sse4_1
-        fi
 
         case  ${tgt_os} in
             win*)
@@ -1060,6 +1049,18 @@
                 AS=msvs
             ;;
         esac
+
+        soft_enable runtime_cpu_detect
+        soft_enable mmx
+        soft_enable sse
+        soft_enable sse2
+        soft_enable sse3
+        soft_enable ssse3
+        if enabled gcc && ! disabled sse4_1 && ! check_cflags -msse4; then
+            RTCD_OPTIONS="${RTCD_OPTIONS}--disable-sse4_1 "
+        else
+            soft_enable sse4_1
+        fi
 
         case "${AS}" in
             auto|"")
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -449,25 +449,25 @@
 prototype void vp9_sad4x4x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad4x4x3 sse3
 
-prototype void vp9_sad64x64x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
+prototype void vp9_sad64x64x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
 specialize vp9_sad64x64x8
 
-prototype void vp9_sad32x32x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
+prototype void vp9_sad32x32x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
 specialize vp9_sad32x32x8
 
-prototype void vp9_sad16x16x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
+prototype void vp9_sad16x16x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
 specialize vp9_sad16x16x8 sse4
 
-prototype void vp9_sad16x8x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
+prototype void vp9_sad16x8x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
 specialize vp9_sad16x8x8 sse4
 
-prototype void vp9_sad8x16x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
+prototype void vp9_sad8x16x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
 specialize vp9_sad8x16x8 sse4
 
-prototype void vp9_sad8x8x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
+prototype void vp9_sad8x8x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
 specialize vp9_sad8x8x8 sse4
 
-prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
+prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
 specialize vp9_sad4x4x8 sse4
 
 prototype void vp9_sad64x64x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"
@@ -490,7 +490,6 @@
 
 prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad4x4x4d sse
-
 prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
 specialize vp9_sub_pixel_mse16x16 sse2 mmx
 
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -1782,7 +1782,7 @@
   int col_min = ref_col - distance;
   int col_max = ref_col + distance;
 
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, sad_array8, 8);
+  DECLARE_ALIGNED_ARRAY(16, uint32_t, sad_array8, 8);
   unsigned int sad_array[3];
   int_mv fcenter_mv;
 
--- a/vp9/encoder/vp9_sad_c.c
+++ b/vp9/encoder/vp9_sad_c.c
@@ -103,31 +103,31 @@
                       int  src_stride,
                       const uint8_t *ref_ptr,
                       int  ref_stride,
-                      uint16_t *sad_array) {
-  sad_array[0] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
-                                        ref_ptr, ref_stride,
-                                        0x7fffffff);
-  sad_array[1] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
-                                        ref_ptr + 1, ref_stride,
-                                        0x7fffffff);
-  sad_array[2] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
-                                        ref_ptr + 2, ref_stride,
-                                        0x7fffffff);
-  sad_array[3] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
-                                        ref_ptr + 3, ref_stride,
-                                        0x7fffffff);
-  sad_array[4] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
-                                        ref_ptr + 4, ref_stride,
-                                        0x7fffffff);
-  sad_array[5] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
-                                        ref_ptr + 5, ref_stride,
-                                        0x7fffffff);
-  sad_array[6] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
-                                        ref_ptr + 6, ref_stride,
-                                        0x7fffffff);
-  sad_array[7] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
-                                        ref_ptr + 7, ref_stride,
-                                        0x7fffffff);
+                      unsigned int *sad_array) {
+  sad_array[0] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr, ref_stride,
+                              0x7fffffff);
+  sad_array[1] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr + 1, ref_stride,
+                              0x7fffffff);
+  sad_array[2] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr + 2, ref_stride,
+                              0x7fffffff);
+  sad_array[3] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr + 3, ref_stride,
+                              0x7fffffff);
+  sad_array[4] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr + 4, ref_stride,
+                              0x7fffffff);
+  sad_array[5] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr + 5, ref_stride,
+                              0x7fffffff);
+  sad_array[6] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr + 6, ref_stride,
+                              0x7fffffff);
+  sad_array[7] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr + 7, ref_stride,
+                              0x7fffffff);
 }
 
 void vp9_sad32x32x8_c(const uint8_t *src_ptr,
@@ -134,31 +134,31 @@
                       int  src_stride,
                       const uint8_t *ref_ptr,
                       int  ref_stride,
-                      uint16_t *sad_array) {
-  sad_array[0] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
-                                        ref_ptr, ref_stride,
-                                        0x7fffffff);
-  sad_array[1] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
-                                        ref_ptr + 1, ref_stride,
-                                        0x7fffffff);
-  sad_array[2] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
-                                        ref_ptr + 2, ref_stride,
-                                        0x7fffffff);
-  sad_array[3] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
-                                        ref_ptr + 3, ref_stride,
-                                        0x7fffffff);
-  sad_array[4] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
-                                        ref_ptr + 4, ref_stride,
-                                        0x7fffffff);
-  sad_array[5] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
-                                        ref_ptr + 5, ref_stride,
-                                        0x7fffffff);
-  sad_array[6] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
-                                        ref_ptr + 6, ref_stride,
-                                        0x7fffffff);
-  sad_array[7] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
-                                        ref_ptr + 7, ref_stride,
-                                        0x7fffffff);
+                      unsigned int *sad_array) {
+  sad_array[0] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr, ref_stride,
+                              0x7fffffff);
+  sad_array[1] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr + 1, ref_stride,
+                              0x7fffffff);
+  sad_array[2] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr + 2, ref_stride,
+                              0x7fffffff);
+  sad_array[3] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr + 3, ref_stride,
+                              0x7fffffff);
+  sad_array[4] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr + 4, ref_stride,
+                              0x7fffffff);
+  sad_array[5] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr + 5, ref_stride,
+                              0x7fffffff);
+  sad_array[6] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr + 6, ref_stride,
+                              0x7fffffff);
+  sad_array[7] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr + 7, ref_stride,
+                              0x7fffffff);
 }
 
 void vp9_sad16x16x3_c(const uint8_t *src_ptr,
@@ -178,31 +178,31 @@
                       int  src_stride,
                       const uint8_t *ref_ptr,
                       int  ref_stride,
-                      uint16_t *sad_array) {
-  sad_array[0] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
-                                        ref_ptr, ref_stride,
-                                        0x7fffffff);
-  sad_array[1] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
-                                        ref_ptr + 1, ref_stride,
-                                        0x7fffffff);
-  sad_array[2] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
-                                        ref_ptr + 2, ref_stride,
-                                        0x7fffffff);
-  sad_array[3] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
-                                        ref_ptr + 3, ref_stride,
-                                        0x7fffffff);
-  sad_array[4] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
-                                        ref_ptr + 4, ref_stride,
-                                        0x7fffffff);
-  sad_array[5] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
-                                        ref_ptr + 5, ref_stride,
-                                        0x7fffffff);
-  sad_array[6] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
-                                        ref_ptr + 6, ref_stride,
-                                        0x7fffffff);
-  sad_array[7] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
-                                        ref_ptr + 7, ref_stride,
-                                        0x7fffffff);
+                      uint32_t *sad_array) {
+  sad_array[0] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr, ref_stride,
+                              0x7fffffff);
+  sad_array[1] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr + 1, ref_stride,
+                              0x7fffffff);
+  sad_array[2] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr + 2, ref_stride,
+                              0x7fffffff);
+  sad_array[3] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr + 3, ref_stride,
+                              0x7fffffff);
+  sad_array[4] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr + 4, ref_stride,
+                              0x7fffffff);
+  sad_array[5] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr + 5, ref_stride,
+                              0x7fffffff);
+  sad_array[6] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr + 6, ref_stride,
+                              0x7fffffff);
+  sad_array[7] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr + 7, ref_stride,
+                              0x7fffffff);
 }
 
 void vp9_sad16x8x3_c(const uint8_t *src_ptr,
@@ -222,31 +222,31 @@
                      int  src_stride,
                      const uint8_t *ref_ptr,
                      int  ref_stride,
-                     uint16_t *sad_array) {
-  sad_array[0] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
-                                       ref_ptr, ref_stride,
-                                       0x7fffffff);
-  sad_array[1] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
-                                       ref_ptr + 1, ref_stride,
-                                       0x7fffffff);
-  sad_array[2] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
-                                       ref_ptr + 2, ref_stride,
-                                       0x7fffffff);
-  sad_array[3] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
-                                       ref_ptr + 3, ref_stride,
-                                       0x7fffffff);
-  sad_array[4] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
-                                       ref_ptr + 4, ref_stride,
-                                       0x7fffffff);
-  sad_array[5] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
-                                       ref_ptr + 5, ref_stride,
-                                       0x7fffffff);
-  sad_array[6] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
-                                       ref_ptr + 6, ref_stride,
-                                       0x7fffffff);
-  sad_array[7] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
-                                       ref_ptr + 7, ref_stride,
-                                       0x7fffffff);
+                     uint32_t *sad_array) {
+  sad_array[0] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr, ref_stride,
+                             0x7fffffff);
+  sad_array[1] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr + 1, ref_stride,
+                             0x7fffffff);
+  sad_array[2] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr + 2, ref_stride,
+                             0x7fffffff);
+  sad_array[3] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr + 3, ref_stride,
+                             0x7fffffff);
+  sad_array[4] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr + 4, ref_stride,
+                             0x7fffffff);
+  sad_array[5] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr + 5, ref_stride,
+                             0x7fffffff);
+  sad_array[6] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr + 6, ref_stride,
+                             0x7fffffff);
+  sad_array[7] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr + 7, ref_stride,
+                             0x7fffffff);
 }
 
 void vp9_sad8x8x3_c(const uint8_t *src_ptr,
@@ -266,31 +266,31 @@
                     int  src_stride,
                     const uint8_t *ref_ptr,
                     int  ref_stride,
-                    uint16_t *sad_array) {
-  sad_array[0] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
-                                      ref_ptr, ref_stride,
-                                      0x7fffffff);
-  sad_array[1] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
-                                      ref_ptr + 1, ref_stride,
-                                      0x7fffffff);
-  sad_array[2] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
-                                      ref_ptr + 2, ref_stride,
-                                      0x7fffffff);
-  sad_array[3] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
-                                      ref_ptr + 3, ref_stride,
-                                      0x7fffffff);
-  sad_array[4] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
-                                      ref_ptr + 4, ref_stride,
-                                      0x7fffffff);
-  sad_array[5] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
-                                      ref_ptr + 5, ref_stride,
-                                      0x7fffffff);
-  sad_array[6] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
-                                      ref_ptr + 6, ref_stride,
-                                      0x7fffffff);
-  sad_array[7] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
-                                      ref_ptr + 7, ref_stride,
-                                      0x7fffffff);
+                    uint32_t *sad_array) {
+  sad_array[0] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr, ref_stride,
+                            0x7fffffff);
+  sad_array[1] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr + 1, ref_stride,
+                            0x7fffffff);
+  sad_array[2] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr + 2, ref_stride,
+                            0x7fffffff);
+  sad_array[3] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr + 3, ref_stride,
+                            0x7fffffff);
+  sad_array[4] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr + 4, ref_stride,
+                            0x7fffffff);
+  sad_array[5] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr + 5, ref_stride,
+                            0x7fffffff);
+  sad_array[6] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr + 6, ref_stride,
+                            0x7fffffff);
+  sad_array[7] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr + 7, ref_stride,
+                            0x7fffffff);
 }
 
 void vp9_sad8x16x3_c(const uint8_t *src_ptr,
@@ -310,31 +310,31 @@
                      int  src_stride,
                      const uint8_t *ref_ptr,
                      int  ref_stride,
-                     uint16_t *sad_array) {
-  sad_array[0] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
-                                       ref_ptr, ref_stride,
-                                       0x7fffffff);
-  sad_array[1] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
-                                       ref_ptr + 1, ref_stride,
-                                       0x7fffffff);
-  sad_array[2] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
-                                       ref_ptr + 2, ref_stride,
-                                       0x7fffffff);
-  sad_array[3] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
-                                       ref_ptr + 3, ref_stride,
-                                       0x7fffffff);
-  sad_array[4] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
-                                       ref_ptr + 4, ref_stride,
-                                       0x7fffffff);
-  sad_array[5] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
-                                       ref_ptr + 5, ref_stride,
-                                       0x7fffffff);
-  sad_array[6] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
-                                       ref_ptr + 6, ref_stride,
-                                       0x7fffffff);
-  sad_array[7] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
-                                       ref_ptr + 7, ref_stride,
-                                       0x7fffffff);
+                     uint32_t *sad_array) {
+  sad_array[0] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr, ref_stride,
+                             0x7fffffff);
+  sad_array[1] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr + 1, ref_stride,
+                             0x7fffffff);
+  sad_array[2] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr + 2, ref_stride,
+                             0x7fffffff);
+  sad_array[3] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr + 3, ref_stride,
+                             0x7fffffff);
+  sad_array[4] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr + 4, ref_stride,
+                             0x7fffffff);
+  sad_array[5] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr + 5, ref_stride,
+                             0x7fffffff);
+  sad_array[6] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr + 6, ref_stride,
+                             0x7fffffff);
+  sad_array[7] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr + 7, ref_stride,
+                             0x7fffffff);
 }
 
 void vp9_sad4x4x3_c(const uint8_t *src_ptr,
@@ -354,31 +354,31 @@
                     int  src_stride,
                     const uint8_t *ref_ptr,
                     int  ref_stride,
-                    uint16_t *sad_array) {
-  sad_array[0] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
-                                      ref_ptr, ref_stride,
-                                      0x7fffffff);
-  sad_array[1] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
-                                      ref_ptr + 1, ref_stride,
-                                      0x7fffffff);
-  sad_array[2] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
-                                      ref_ptr + 2, ref_stride,
-                                      0x7fffffff);
-  sad_array[3] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
-                                      ref_ptr + 3, ref_stride,
-                                      0x7fffffff);
-  sad_array[4] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
-                                      ref_ptr + 4, ref_stride,
-                                      0x7fffffff);
-  sad_array[5] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
-                                      ref_ptr + 5, ref_stride,
-                                      0x7fffffff);
-  sad_array[6] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
-                                      ref_ptr + 6, ref_stride,
-                                      0x7fffffff);
-  sad_array[7] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
-                                      ref_ptr + 7, ref_stride,
-                                      0x7fffffff);
+                    uint32_t *sad_array) {
+  sad_array[0] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr, ref_stride,
+                            0x7fffffff);
+  sad_array[1] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr + 1, ref_stride,
+                            0x7fffffff);
+  sad_array[2] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr + 2, ref_stride,
+                            0x7fffffff);
+  sad_array[3] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr + 3, ref_stride,
+                            0x7fffffff);
+  sad_array[4] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr + 4, ref_stride,
+                            0x7fffffff);
+  sad_array[5] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr + 5, ref_stride,
+                            0x7fffffff);
+  sad_array[6] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr + 6, ref_stride,
+                            0x7fffffff);
+  sad_array[7] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr + 7, ref_stride,
+                            0x7fffffff);
 }
 
 void vp9_sad64x64x4d_c(const uint8_t *src_ptr,
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -29,7 +29,7 @@
                                     int source_stride,
                                     const uint8_t *ref_ptr,
                                     int  ref_stride,
-                                    unsigned short *sad_array);
+                                    unsigned int *sad_array);
 
 typedef void (*vp9_sad_multi_d_fn_t)(const uint8_t *src_ptr,
                                      int source_stride,
--- a/vp9/encoder/x86/vp9_sad_sse4.asm
+++ b/vp9/encoder/x86/vp9_sad_sse4.asm
@@ -154,7 +154,17 @@
         paddw           xmm1,       xmm5
 %endmacro
 
+%macro WRITE_AS_INTS 0
+    mov             rdi,        arg(4)           ;Results: output pointer, overwrites rdi
+    pxor            xmm0, xmm0                   ;xmm0 = 0, zero source for the unpacks
+    movdqa          xmm2, xmm1                   ;copy the eight 16-bit sums held in xmm1
+    punpcklwd       xmm1, xmm0                   ;low four words -> zero-extended dwords
+    punpckhwd       xmm2, xmm0                   ;high four words -> zero-extended dwords

+    movdqa          [rdi],    xmm1               ;aligned store of results 0-3
+    movdqa          [rdi + 16],    xmm2          ;aligned store of results 4-7
+%endmacro
+
 ;void vp9_sad16x16x8_sse4(
 ;    const unsigned char *src_ptr,
 ;    int  src_stride,
@@ -170,23 +180,22 @@
     push        rdi
     ; end prolog
 
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
+    mov             rsi,        arg(0)           ;src_ptr
+    mov             rdi,        arg(2)           ;ref_ptr
 
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
 
-        PROCESS_16X2X8 1
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
+    PROCESS_16X2X8 1
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
 
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
+    WRITE_AS_INTS
 
     ; begin epilog
     pop         rdi
@@ -212,19 +221,18 @@
     push        rdi
     ; end prolog
 
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
+    mov             rsi,        arg(0)           ;src_ptr
+    mov             rdi,        arg(2)           ;ref_ptr
 
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
 
-        PROCESS_16X2X8 1
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
+    PROCESS_16X2X8 1
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
 
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
+    WRITE_AS_INTS
 
     ; begin epilog
     pop         rdi
@@ -250,19 +258,18 @@
     push        rdi
     ; end prolog
 
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
+    mov             rsi,        arg(0)           ;src_ptr
+    mov             rdi,        arg(2)           ;ref_ptr
 
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
 
-        PROCESS_8X2X8 1
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
+    PROCESS_8X2X8 1
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
 
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
+    WRITE_AS_INTS
 
     ; begin epilog
     pop         rdi
@@ -288,23 +295,23 @@
     push        rdi
     ; end prolog
 
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
+    mov             rsi,        arg(0)           ;src_ptr
+    mov             rdi,        arg(2)           ;ref_ptr
 
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
 
-        PROCESS_8X2X8 1
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
+    PROCESS_8X2X8 1
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
 
+    WRITE_AS_INTS
+
     ; begin epilog
     pop         rdi
     pop         rsi
@@ -329,17 +336,16 @@
     push        rdi
     ; end prolog
 
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
+    mov             rsi,        arg(0)           ;src_ptr
+    mov             rdi,        arg(2)           ;ref_ptr
 
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
 
-        PROCESS_4X2X8 1
-        PROCESS_4X2X8 0
+    PROCESS_4X2X8 1
+    PROCESS_4X2X8 0
 
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
+    WRITE_AS_INTS
 
     ; begin epilog
     pop         rdi
--