shithub: libvpx

Download patch

ref: cddbdeabd00e212d5eb3b67c9d09f40e499163d2
parent: c38d0490b3ecfa4f9a6c4613490ff8ce76569df6
parent: 8f9d94ec17eea893ce35188416a9492317119d77
author: Yunqing Wang <yunqingwang@google.com>
date: Mon Dec 8 08:34:53 EST 2014

Merge "SSSE3 Optimization for Atom processors using new instruction selection and ordering"

--- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
@@ -765,40 +765,50 @@
 
     movq        xmm0,   [rsi - 3]           ;load src data
     movq        xmm4,   [rsi + 5]
-    movq        xmm7,   [rsi + 13]
+    movq        xmm6,   [rsi + 13]
     punpcklqdq  xmm0,   xmm4
-    punpcklqdq  xmm4,   xmm7
+    punpcklqdq  xmm4,   xmm6
 
+    movdqa      xmm7,   xmm0
+
+    punpcklbw   xmm7,   xmm7
+    punpckhbw   xmm0,   xmm0
     movdqa      xmm1,   xmm0
     movdqa      xmm2,   xmm0
     movdqa      xmm3,   xmm0
-    movdqa      xmm5,   xmm4
-    movdqa      xmm6,   xmm4
-    movdqa      xmm7,   xmm4
 
-    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
-    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
-    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
-    pshufb      xmm3,   [GLOBAL(shuf_t6t7)]
-    pshufb      xmm4,   [GLOBAL(shuf_t0t1)]
-    pshufb      xmm5,   [GLOBAL(shuf_t2t3)]
-    pshufb      xmm6,   [GLOBAL(shuf_t4t5)]
-    pshufb      xmm7,   [GLOBAL(shuf_t6t7)]
-
+    palignr     xmm0,   xmm7, 1
+    palignr     xmm1,   xmm7, 5
     pmaddubsw   xmm0,   k0k1
+    palignr     xmm2,   xmm7, 9
     pmaddubsw   xmm1,   k2k3
+    palignr     xmm3,   xmm7, 13
+
     pmaddubsw   xmm2,   k4k5
     pmaddubsw   xmm3,   k6k7
-    pmaddubsw   xmm4,   k0k1
-    pmaddubsw   xmm5,   k2k3
-    pmaddubsw   xmm6,   k4k5
-    pmaddubsw   xmm7,   k6k7
-
     paddsw      xmm0,   xmm3
+
+    movdqa      xmm3,   xmm4
+    punpcklbw   xmm3,   xmm3
+    punpckhbw   xmm4,   xmm4
+
+    movdqa      xmm5,   xmm4
+    movdqa      xmm6,   xmm4
+    movdqa      xmm7,   xmm4
+
+    palignr     xmm4,   xmm3, 1
+    palignr     xmm5,   xmm3, 5
+    palignr     xmm6,   xmm3, 9
+    palignr     xmm7,   xmm3, 13
+
     movdqa      xmm3,   xmm1
+    pmaddubsw   xmm4,   k0k1
     pmaxsw      xmm1,   xmm2
+    pmaddubsw   xmm5,   k2k3
     pminsw      xmm2,   xmm3
+    pmaddubsw   xmm6,   k4k5
     paddsw      xmm0,   xmm2
+    pmaddubsw   xmm7,   k6k7
     paddsw      xmm0,   xmm1
 
     paddsw      xmm4,   xmm7