shithub: libvpx

Download patch

ref: eab09e34e3e3cd0da671b4dbb7fd75e976915fa7
parent: cdc35037a538066bc3c14ba784d258d7d2aef4e6
parent: 147e864629623a4a0f66f40e0a717b60ed4a6377
author: Yunqing Wang <yunqingwang@google.com>
date: Tue Jul 3 11:55:02 EDT 2012

Merge "Add 0 offsets handling in SSSE3 sixtap_predict functions"

--- a/vp8/common/x86/vp8_asm_stubs.c
+++ b/vp8/common/x86/vp8_asm_stubs.c
@@ -438,19 +438,35 @@
     {
         if (yoffset)
         {
-            vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 16, 21, xoffset);
-            vp8_filter_block1d16_v6_ssse3(FData2 , 16, dst_ptr, dst_pitch, 16, yoffset);
+            vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                          src_pixels_per_line, FData2,
+                                          16, 21, xoffset);
+            vp8_filter_block1d16_v6_ssse3(FData2 , 16, dst_ptr, dst_pitch,
+                                          16, yoffset);
         }
         else
         {
             /* First-pass only */
-            vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, xoffset);
+            vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,
+                                          dst_ptr, dst_pitch, 16, xoffset);
         }
     }
     else
     {
-        /* Second-pass only */
-        vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line) , src_pixels_per_line, dst_ptr, dst_pitch, 16, yoffset);
+        if (yoffset)
+        {
+            /* Second-pass only */
+            vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                          src_pixels_per_line,
+                                          dst_ptr, dst_pitch, 16, yoffset);
+        }
+        else
+        {
+            /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
+             * yoffset==0) case correctly. Add copy function here to guarantee
+             * six-tap function handles all possible offsets. */
+            vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
+        }
     }
 }
 
@@ -470,18 +486,34 @@
     {
         if (yoffset)
         {
-            vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 8, 13, xoffset);
-            vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 8, yoffset);
+            vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                         src_pixels_per_line, FData2,
+                                         8, 13, xoffset);
+            vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch,
+                                         8, yoffset);
         }
         else
         {
-            vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, xoffset);
+            vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
+                                         dst_ptr, dst_pitch, 8, xoffset);
         }
     }
     else
     {
-        /* Second-pass only */
-        vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, yoffset);
+        if (yoffset)
+        {
+            /* Second-pass only */
+            vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                         src_pixels_per_line,
+                                         dst_ptr, dst_pitch, 8, yoffset);
+        }
+        else
+        {
+            /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
+             * yoffset==0) case correctly. Add copy function here to guarantee
+             * six-tap function handles all possible offsets. */
+            vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
+        }
     }
 }
 
@@ -502,19 +534,35 @@
     {
         if (yoffset)
         {
-            vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 8, 9, xoffset);
-            vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 4, yoffset);
+            vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                         src_pixels_per_line, FData2,
+                                         8, 9, xoffset);
+            vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch,
+                                         4, yoffset);
         }
         else
         {
             /* First-pass only */
-            vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, xoffset);
+            vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
+                                         dst_ptr, dst_pitch, 4, xoffset);
         }
     }
     else
     {
-        /* Second-pass only */
-        vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, yoffset);
+        if (yoffset)
+        {
+            /* Second-pass only */
+            vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                         src_pixels_per_line,
+                                         dst_ptr, dst_pitch, 4, yoffset);
+        }
+        else
+        {
+            /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
+             * yoffset==0) case correctly. Add copy function here to guarantee
+             * six-tap function handles all possible offsets. */
+            vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
+        }
     }
 }
 
@@ -534,19 +582,48 @@
   {
       if (yoffset)
       {
-          vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 4, 9, xoffset);
-          vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 4, yoffset);
+          vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                       src_pixels_per_line,
+                                       FData2, 4, 9, xoffset);
+          vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch,
+                                       4, yoffset);
       }
       else
       {
-          vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, xoffset);
+          vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,
+                                       dst_ptr, dst_pitch, 4, xoffset);
       }
   }
   else
   {
-      vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, yoffset);
-  }
+      if (yoffset)
+      {
+          vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                       src_pixels_per_line,
+                                       dst_ptr, dst_pitch, 4, yoffset);
+      }
+      else
+      {
+        /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
+          * yoffset==0) case correctly. Add copy function here to guarantee
+          * six-tap function handles all possible offsets. */
+          int r;
 
+          for (r = 0; r < 4; r++)
+          {
+  #if !(CONFIG_FAST_UNALIGNED)
+            dst_ptr[0]  = src_ptr[0];
+            dst_ptr[1]  = src_ptr[1];
+            dst_ptr[2]  = src_ptr[2];
+            dst_ptr[3]  = src_ptr[3];
+  #else
+              *(uint32_t *)dst_ptr = *(uint32_t *)src_ptr ;
+  #endif
+              dst_ptr     += dst_pitch;
+              src_ptr     += src_pixels_per_line;
+          }
+      }
+  }
 }
 
 #endif