shithub: libvpx

ref: a7355f3bbba00902e529da71e2b0c9f8392253d7
parent: b259f52d4bb4ec554fd9b21357044fe5aa232d18
parent: 7afed9a1b6d3c3a17f690225ab4254a5583b7d46
author: Johann <johannkoenig@google.com>
date: Mon May 5 01:36:54 EDT 2014

Merge changes Iaf7d6b0a,Iece0bf56

* changes:
  Use INLINE and include vpx_config.h instead of plain 'inline'
  Use vreinterpret instead of casting neon vector types
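
A minimal standalone sketch of what the two listed changes amount to, not taken
from the patch itself. It assumes INLINE comes from the generated vpx_config.h
(a plain stand-in definition is provided so the snippet compiles on its own),
and uses a hypothetical helper, bytes_as_words, to show the vreinterpretq_*
intrinsic replacing the old cast:

#include <arm_neon.h>

#ifndef INLINE
#define INLINE inline   /* stand-in; real builds take this from vpx_config.h */
#endif

/* Reinterpret 16 byte lanes as 4 word lanes without moving any data.
 * A cast such as (uint32x4_t)q relies on GCC's vector extensions and can be
 * rejected by toolchains that treat the NEON vector types as distinct
 * (non-convertible) types, so the vreinterpretq_* intrinsics are the
 * portable spelling of the same reinterpretation. */
static INLINE uint32x4_t bytes_as_words(uint8x16_t q) {
    return vreinterpretq_u32_u8(q);
}

/* Example use, mirroring the transpose pattern in mbloopfilter_neon.c:
 *     uint32x4x2_t t = vtrnq_u32(bytes_as_words(q3), bytes_as_words(q7));
 */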

--- a/vp8/common/arm/neon/loopfilter_neon.c
+++ b/vp8/common/arm/neon/loopfilter_neon.c
@@ -9,8 +9,9 @@
  */
 
 #include <arm_neon.h>
+#include "./vpx_config.h"
 
-static inline void vp8_loop_filter_neon(
+static INLINE void vp8_loop_filter_neon(
         uint8x16_t qblimit,  // flimit
         uint8x16_t qlimit,   // limit
         uint8x16_t qthresh,  // thresh
--- a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
+++ b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
@@ -9,8 +9,9 @@
  */
 
 #include <arm_neon.h>
+#include "./vpx_config.h"
 
-static inline void vp8_loop_filter_simple_horizontal_edge_neon(
+static INLINE void vp8_loop_filter_simple_horizontal_edge_neon(
         unsigned char *s,
         int p,
         const unsigned char *blimit) {
--- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
+++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
@@ -9,8 +9,9 @@
  */
 
 #include <arm_neon.h>
+#include "./vpx_config.h"
 
-static inline void vp8_loop_filter_simple_vertical_edge_neon(
+static INLINE void vp8_loop_filter_simple_vertical_edge_neon(
         unsigned char *s,
         int p,
         const unsigned char *blimit) {
--- a/vp8/common/arm/neon/mbloopfilter_neon.c
+++ b/vp8/common/arm/neon/mbloopfilter_neon.c
@@ -9,8 +9,9 @@
  */
 
 #include <arm_neon.h>
+#include "./vpx_config.h"
 
-static inline void vp8_mbloop_filter_neon(
+static INLINE void vp8_mbloop_filter_neon(
         uint8x16_t qblimit,  // mblimit
         uint8x16_t qlimit,   // limit
         uint8x16_t qthresh,  // thresh
@@ -352,20 +353,28 @@
     q9 = vcombine_u8(d18, d19);
     q10 = vcombine_u8(d20, d21);
 
-    q2tmp0 = vtrnq_u32((uint32x4_t)q3, (uint32x4_t)q7);
-    q2tmp1 = vtrnq_u32((uint32x4_t)q4, (uint32x4_t)q8);
-    q2tmp2 = vtrnq_u32((uint32x4_t)q5, (uint32x4_t)q9);
-    q2tmp3 = vtrnq_u32((uint32x4_t)q6, (uint32x4_t)q10);
+    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
 
-    q2tmp4 = vtrnq_u16((uint16x8_t)q2tmp0.val[0], (uint16x8_t)q2tmp2.val[0]);
-    q2tmp5 = vtrnq_u16((uint16x8_t)q2tmp1.val[0], (uint16x8_t)q2tmp3.val[0]);
-    q2tmp6 = vtrnq_u16((uint16x8_t)q2tmp0.val[1], (uint16x8_t)q2tmp2.val[1]);
-    q2tmp7 = vtrnq_u16((uint16x8_t)q2tmp1.val[1], (uint16x8_t)q2tmp3.val[1]);
+    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+                       vreinterpretq_u16_u32(q2tmp2.val[0]));
+    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+                       vreinterpretq_u16_u32(q2tmp3.val[0]));
+    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+                       vreinterpretq_u16_u32(q2tmp2.val[1]));
+    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+                       vreinterpretq_u16_u32(q2tmp3.val[1]));
 
-    q2tmp8  = vtrnq_u8((uint8x16_t)q2tmp4.val[0], (uint8x16_t)q2tmp5.val[0]);
-    q2tmp9  = vtrnq_u8((uint8x16_t)q2tmp4.val[1], (uint8x16_t)q2tmp5.val[1]);
-    q2tmp10 = vtrnq_u8((uint8x16_t)q2tmp6.val[0], (uint8x16_t)q2tmp7.val[0]);
-    q2tmp11 = vtrnq_u8((uint8x16_t)q2tmp6.val[1], (uint8x16_t)q2tmp7.val[1]);
+    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+                       vreinterpretq_u8_u16(q2tmp5.val[0]));
+    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+                       vreinterpretq_u8_u16(q2tmp5.val[1]));
+    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+                       vreinterpretq_u8_u16(q2tmp7.val[0]));
+    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+                       vreinterpretq_u8_u16(q2tmp7.val[1]));
 
     q3 = q2tmp8.val[0];
     q4 = q2tmp8.val[1];
@@ -380,20 +389,28 @@
                          q5, q6, q7, q8, q9, q10,
                          &q4, &q5, &q6, &q7, &q8, &q9);
 
-    q2tmp0 = vtrnq_u32((uint32x4_t)q3, (uint32x4_t)q7);
-    q2tmp1 = vtrnq_u32((uint32x4_t)q4, (uint32x4_t)q8);
-    q2tmp2 = vtrnq_u32((uint32x4_t)q5, (uint32x4_t)q9);
-    q2tmp3 = vtrnq_u32((uint32x4_t)q6, (uint32x4_t)q10);
+    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
 
-    q2tmp4 = vtrnq_u16((uint16x8_t)q2tmp0.val[0], (uint16x8_t)q2tmp2.val[0]);
-    q2tmp5 = vtrnq_u16((uint16x8_t)q2tmp1.val[0], (uint16x8_t)q2tmp3.val[0]);
-    q2tmp6 = vtrnq_u16((uint16x8_t)q2tmp0.val[1], (uint16x8_t)q2tmp2.val[1]);
-    q2tmp7 = vtrnq_u16((uint16x8_t)q2tmp1.val[1], (uint16x8_t)q2tmp3.val[1]);
+    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+                       vreinterpretq_u16_u32(q2tmp2.val[0]));
+    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+                       vreinterpretq_u16_u32(q2tmp3.val[0]));
+    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+                       vreinterpretq_u16_u32(q2tmp2.val[1]));
+    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+                       vreinterpretq_u16_u32(q2tmp3.val[1]));
 
-    q2tmp8  = vtrnq_u8((uint8x16_t)q2tmp4.val[0], (uint8x16_t)q2tmp5.val[0]);
-    q2tmp9  = vtrnq_u8((uint8x16_t)q2tmp4.val[1], (uint8x16_t)q2tmp5.val[1]);
-    q2tmp10 = vtrnq_u8((uint8x16_t)q2tmp6.val[0], (uint8x16_t)q2tmp7.val[0]);
-    q2tmp11 = vtrnq_u8((uint8x16_t)q2tmp6.val[1], (uint8x16_t)q2tmp7.val[1]);
+    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+                       vreinterpretq_u8_u16(q2tmp5.val[0]));
+    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+                       vreinterpretq_u8_u16(q2tmp5.val[1]));
+    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+                       vreinterpretq_u8_u16(q2tmp7.val[0]));
+    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+                       vreinterpretq_u8_u16(q2tmp7.val[1]));
 
     q3 = q2tmp8.val[0];
     q4 = q2tmp8.val[1];
@@ -503,20 +520,28 @@
     q9 = vcombine_u8(d18, d19);
     q10 = vcombine_u8(d20, d21);
 
-    q2tmp0 = vtrnq_u32((uint32x4_t)q3, (uint32x4_t)q7);
-    q2tmp1 = vtrnq_u32((uint32x4_t)q4, (uint32x4_t)q8);
-    q2tmp2 = vtrnq_u32((uint32x4_t)q5, (uint32x4_t)q9);
-    q2tmp3 = vtrnq_u32((uint32x4_t)q6, (uint32x4_t)q10);
+    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
 
-    q2tmp4 = vtrnq_u16((uint16x8_t)q2tmp0.val[0], (uint16x8_t)q2tmp2.val[0]);
-    q2tmp5 = vtrnq_u16((uint16x8_t)q2tmp1.val[0], (uint16x8_t)q2tmp3.val[0]);
-    q2tmp6 = vtrnq_u16((uint16x8_t)q2tmp0.val[1], (uint16x8_t)q2tmp2.val[1]);
-    q2tmp7 = vtrnq_u16((uint16x8_t)q2tmp1.val[1], (uint16x8_t)q2tmp3.val[1]);
+    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+                       vreinterpretq_u16_u32(q2tmp2.val[0]));
+    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+                       vreinterpretq_u16_u32(q2tmp3.val[0]));
+    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+                       vreinterpretq_u16_u32(q2tmp2.val[1]));
+    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+                       vreinterpretq_u16_u32(q2tmp3.val[1]));
 
-    q2tmp8  = vtrnq_u8((uint8x16_t)q2tmp4.val[0], (uint8x16_t)q2tmp5.val[0]);
-    q2tmp9  = vtrnq_u8((uint8x16_t)q2tmp4.val[1], (uint8x16_t)q2tmp5.val[1]);
-    q2tmp10 = vtrnq_u8((uint8x16_t)q2tmp6.val[0], (uint8x16_t)q2tmp7.val[0]);
-    q2tmp11 = vtrnq_u8((uint8x16_t)q2tmp6.val[1], (uint8x16_t)q2tmp7.val[1]);
+    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+                       vreinterpretq_u8_u16(q2tmp5.val[0]));
+    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+                       vreinterpretq_u8_u16(q2tmp5.val[1]));
+    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+                       vreinterpretq_u8_u16(q2tmp7.val[0]));
+    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+                       vreinterpretq_u8_u16(q2tmp7.val[1]));
 
     q3 = q2tmp8.val[0];
     q4 = q2tmp8.val[1];
@@ -531,20 +556,28 @@
                          q5, q6, q7, q8, q9, q10,
                          &q4, &q5, &q6, &q7, &q8, &q9);
 
-    q2tmp0 = vtrnq_u32((uint32x4_t)q3, (uint32x4_t)q7);
-    q2tmp1 = vtrnq_u32((uint32x4_t)q4, (uint32x4_t)q8);
-    q2tmp2 = vtrnq_u32((uint32x4_t)q5, (uint32x4_t)q9);
-    q2tmp3 = vtrnq_u32((uint32x4_t)q6, (uint32x4_t)q10);
+    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
 
-    q2tmp4 = vtrnq_u16((uint16x8_t)q2tmp0.val[0], (uint16x8_t)q2tmp2.val[0]);
-    q2tmp5 = vtrnq_u16((uint16x8_t)q2tmp1.val[0], (uint16x8_t)q2tmp3.val[0]);
-    q2tmp6 = vtrnq_u16((uint16x8_t)q2tmp0.val[1], (uint16x8_t)q2tmp2.val[1]);
-    q2tmp7 = vtrnq_u16((uint16x8_t)q2tmp1.val[1], (uint16x8_t)q2tmp3.val[1]);
+    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+                       vreinterpretq_u16_u32(q2tmp2.val[0]));
+    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+                       vreinterpretq_u16_u32(q2tmp3.val[0]));
+    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+                       vreinterpretq_u16_u32(q2tmp2.val[1]));
+    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+                       vreinterpretq_u16_u32(q2tmp3.val[1]));
 
-    q2tmp8  = vtrnq_u8((uint8x16_t)q2tmp4.val[0], (uint8x16_t)q2tmp5.val[0]);
-    q2tmp9  = vtrnq_u8((uint8x16_t)q2tmp4.val[1], (uint8x16_t)q2tmp5.val[1]);
-    q2tmp10 = vtrnq_u8((uint8x16_t)q2tmp6.val[0], (uint8x16_t)q2tmp7.val[0]);
-    q2tmp11 = vtrnq_u8((uint8x16_t)q2tmp6.val[1], (uint8x16_t)q2tmp7.val[1]);
+    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+                       vreinterpretq_u8_u16(q2tmp5.val[0]));
+    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+                       vreinterpretq_u8_u16(q2tmp5.val[1]));
+    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+                       vreinterpretq_u8_u16(q2tmp7.val[0]));
+    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+                       vreinterpretq_u8_u16(q2tmp7.val[1]));
 
     q3 = q2tmp8.val[0];
     q4 = q2tmp8.val[1];
--