shithub: libvpx

Download patch

ref: d217c87139a3218d9dc4154782de53b9d0cc1119
parent: e7cac130167c1da6d17caa33e216250d989d0fe8
author: Johann <johannkoenig@google.com>
date: Mon May 15 12:30:00 EDT 2017

neon variance: special case 4x

The sub pixel variance uses a temp buffer which guarantees width ==
stride. Take advantage of this in the 4x variants and avoid the very
costly lane loads.

Change-Id: Ia0c97eb8c29dc8dfa6e51a29dff9b75b3c6726f1

--- a/vpx_dsp/arm/mem_neon.h
+++ b/vpx_dsp/arm/mem_neon.h
@@ -83,6 +83,7 @@
 static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
   uint32_t a;
   uint32x4_t a_u32 = vdupq_n_u32(0);
+  if (stride == 4) return vld1q_u8(buf);
   memcpy(&a, buf, 4);
   buf += stride;
   a_u32 = vld1q_lane_u32(&a, a_u32, 0);
@@ -102,6 +103,10 @@
 static INLINE void store_unaligned_u8q(uint8_t *buf, int stride,
                                        const uint8x16_t a) {
   const uint32x4_t a_u32 = vreinterpretq_u32_u8(a);
+  if (stride == 4) {
+    vst1q_u8(buf, a);
+    return;
+  }
   uint32_to_mem(buf, vgetq_lane_u32(a_u32, 0));
   buf += stride;
   uint32_to_mem(buf, vgetq_lane_u32(a_u32, 1));