shithub: libvpx

Download patch

ref: 303f144eefc739c44a3e18a24f8cf61d2fe103a7
parent: 16a4fab9e2c6d80c08183bf0cc27a3e0b9bd2346
author: jackychen <jackychen@google.com>
date: Thu Dec 3 10:21:36 EST 2015

Add vp9_avg_4x4_neon and the unit test.

Change-Id: I3ef9a9648841374ed3cc865a02053c14ad821a20

--- a/test/vp9_avg_test.cc
+++ b/test/vp9_avg_test.cc
@@ -372,7 +372,10 @@
     ::testing::Values(
         make_tuple(16, 16, 0, 8, &vp9_avg_8x8_neon),
         make_tuple(16, 16, 5, 8, &vp9_avg_8x8_neon),
-        make_tuple(32, 32, 15, 8, &vp9_avg_8x8_neon)));
+        make_tuple(32, 32, 15, 8, &vp9_avg_8x8_neon),
+        make_tuple(16, 16, 0, 4, &vp9_avg_4x4_neon),
+        make_tuple(16, 16, 5, 4, &vp9_avg_4x4_neon),
+        make_tuple(32, 32, 15, 4, &vp9_avg_4x4_neon)));
 
 INSTANTIATE_TEST_CASE_P(
     NEON, IntProRowTest, ::testing::Values(
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -198,7 +198,7 @@
 specialize qw/vp9_avg_8x8 sse2 neon msa/;
 
 add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
-specialize qw/vp9_avg_4x4 sse2 msa/;
+specialize qw/vp9_avg_4x4 sse2 neon msa/;
 
 add_proto qw/void vp9_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
 specialize qw/vp9_minmax_8x8 sse2/;
--- a/vp9/encoder/arm/neon/vp9_avg_neon.c
+++ b/vp9/encoder/arm/neon/vp9_avg_neon.c
@@ -24,6 +24,18 @@
   return vget_lane_u32(c, 0);
 }
 
+unsigned int vp9_avg_4x4_neon(const uint8_t *s, int p) {
+  uint16x8_t v_sum;
+  uint32x2_t v_s0 = vdup_n_u32(0);
+  uint32x2_t v_s1 = vdup_n_u32(0);
+  v_s0 = vld1_lane_u32((const uint32_t *)s, v_s0, 0);
+  v_s0 = vld1_lane_u32((const uint32_t *)(s + p), v_s0, 1);
+  v_s1 = vld1_lane_u32((const uint32_t *)(s + 2 * p), v_s1, 0);
+  v_s1 = vld1_lane_u32((const uint32_t *)(s + 3 * p), v_s1, 1);
+  v_sum = vaddl_u8(vreinterpret_u8_u32(v_s0), vreinterpret_u8_u32(v_s1));
+  return (horizontal_add_u16x8(v_sum) + 8) >> 4;
+}
+
 unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) {
   uint8x8_t v_s0 = vld1_u8(s);
   const uint8x8_t v_s1 = vld1_u8(s + p);