ref: 7edb5e8a16caf79c84281c117a9b2168326f8d87
parent: 2d3e33388211d2f0539900671a87a874e25e5240
author: Linfeng Zhang <linfengz@google.com>
date: Tue May 8 13:37:18 EDT 2018
Update vpx_comp_avg_pred_neon(): separate the width 4 and width 8 cases to reduce jumps in the loop when compiled with clang. Change-Id: I6ffc6f1555f2ad08b72a8dba35a78b9fd5f95a73
--- a/vpx_dsp/arm/avg_pred_neon.c
+++ b/vpx_dsp/arm/avg_pred_neon.c
@@ -17,8 +17,8 @@
void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width,
int height, const uint8_t *ref, int ref_stride) {
if (width > 8) {
- int x, y;
- for (y = 0; y < height; ++y) {
+ int x, y = height;
+ do {
for (x = 0; x < width; x += 16) {
const uint8x16_t p = vld1q_u8(pred + x);
const uint8x16_t r = vld1q_u8(ref + x);
@@ -28,28 +28,38 @@
comp += width;
pred += width;
ref += ref_stride;
- }
+ } while (--y);
+ } else if (width == 8) {
+ int i = width * height;
+ do {
+ const uint8x16_t p = vld1q_u8(pred);
+ uint8x16_t r;
+ const uint8x8_t r_0 = vld1_u8(ref);
+ const uint8x8_t r_1 = vld1_u8(ref + ref_stride);
+ r = vcombine_u8(r_0, r_1);
+ ref += 2 * ref_stride;
+ r = vrhaddq_u8(r, p);
+ vst1q_u8(comp, r);
+
+ pred += 16;
+ comp += 16;
+ i -= 16;
+ } while (i);
} else {
- int i;
- for (i = 0; i < width * height; i += 16) {
+ int i = width * height;
+ assert(width == 4);
+ do {
const uint8x16_t p = vld1q_u8(pred);
uint8x16_t r;
- if (width == 4) {
- r = load_unaligned_u8q(ref, ref_stride);
- ref += 4 * ref_stride;
- } else {
- const uint8x8_t r_0 = vld1_u8(ref);
- const uint8x8_t r_1 = vld1_u8(ref + ref_stride);
- assert(width == 8);
- r = vcombine_u8(r_0, r_1);
- ref += 2 * ref_stride;
- }
+ r = load_unaligned_u8q(ref, ref_stride);
+ ref += 4 * ref_stride;
r = vrhaddq_u8(r, p);
vst1q_u8(comp, r);
pred += 16;
comp += 16;
- }
+ i -= 16;
+ } while (i);
}
}