ref: caac87b05b673fa02609c7abe7fee2cf6d68369e
parent: 476e8fc8558592f5535ec2bcdfc6798d35f65f12
parent: df69c751a7552fa162fbcf64da14830c753342f3
author: Johann Koenig <johannkoenig@google.com>
date: Thu Aug 4 15:55:49 EDT 2016
Merge "Don't expand to Q register for 4x4 intrapred"
--- a/vpx_dsp/arm/intrapred_neon.c
+++ b/vpx_dsp/arm/intrapred_neon.c
@@ -20,37 +20,35 @@
// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
const uint8_t *left, int do_above, int do_left) {
- uint16x8_t sum_top;
- uint16x8_t sum_left;
- uint8x8_t dc0;
+ uint16x4_t sum_top;
+ uint16x4_t sum_left;
+ uint16x4_t dc0;
if (do_above) {
const uint8x8_t A = vld1_u8(above); // top row
const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
- const uint16x4_t p1 = vpadd_u16(p0, p0);
- sum_top = vcombine_u16(p1, p1);
+ sum_top = vpadd_u16(p0, p0);
}
if (do_left) {
const uint8x8_t L = vld1_u8(left); // left border
const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left
- const uint16x4_t p1 = vpadd_u16(p0, p0);
- sum_left = vcombine_u16(p1, p1);
+ sum_left = vpadd_u16(p0, p0);
}
if (do_above && do_left) {
- const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
- dc0 = vrshrn_n_u16(sum, 3);
+ const uint16x4_t sum = vadd_u16(sum_left, sum_top);
+ dc0 = vrshr_n_u16(sum, 3);
} else if (do_above) {
- dc0 = vrshrn_n_u16(sum_top, 2);
+ dc0 = vrshr_n_u16(sum_top, 2);
} else if (do_left) {
- dc0 = vrshrn_n_u16(sum_left, 2);
+ dc0 = vrshr_n_u16(sum_left, 2);
} else {
- dc0 = vdup_n_u8(0x80);
+ dc0 = vdup_n_u16(0x80);
}
{
- const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+ const uint8x8_t dc = vdup_lane_u8(vreinterpret_u8_u16(dc0), 0);
int i;
for (i = 0; i < 4; ++i) {
vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0);