shithub: libvpx

Download patch

ref: 016933ad48af499ad7e928ca32ba2fab692273df
parent: 91f87e75135cc9722ee72daf5154424f628a906e
author: Linfeng Zhang <linfengz@google.com>
date: Fri Jan 13 05:01:51 EST 2017

Add vpx_highbd_idct{16x16,32x32}_1_add_neon()

and update vpx_highbd_idct8x8_1_add_neon()

BUG=webm:1301

Change-Id: I18d1a0cbe98ba822d5194c1b4e13a4c29c5c75f4

--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -450,6 +450,24 @@
 #if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
 const PartialInvTxfmParam neon_partial_idct_tests[] = {
 #if CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+      &highbd_wrapper<vpx_highbd_idct32x32_1_add_neon>, TX_32X32, 1, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+      &highbd_wrapper<vpx_highbd_idct32x32_1_add_neon>, TX_32X32, 1, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+      &highbd_wrapper<vpx_highbd_idct32x32_1_add_neon>, TX_32X32, 1, 12, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_1_add_neon>, TX_16X16, 1, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_1_add_neon>, TX_16X16, 1, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_1_add_neon>, TX_16X16, 1, 12, 2),
   make_tuple(&vpx_highbd_fdct8x8_c,
              &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
              &highbd_wrapper<vpx_highbd_idct8x8_64_add_neon>, TX_8X8, 64, 8, 2),
--- /dev/null
+++ b/vpx_dsp/arm/highbd_idct16x16_add_neon.c
@@ -1,0 +1,73 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+static INLINE void highbd_idct16x16_1_add_pos_kernel(uint16_t **dest,
+                                                     const int stride,
+                                                     const int16x8_t res,
+                                                     const int16x8_t max) {
+  const uint16x8_t a0 = vld1q_u16(*dest);
+  const uint16x8_t a1 = vld1q_u16(*dest + 8);
+  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
+  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
+  const int16x8_t c0 = vminq_s16(b0, max);
+  const int16x8_t c1 = vminq_s16(b1, max);
+  vst1q_u16(*dest, vreinterpretq_u16_s16(c0));
+  vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1));
+  *dest += stride;
+}
+
+static INLINE void highbd_idct16x16_1_add_neg_kernel(uint16_t **dest,
+                                                     const int stride,
+                                                     const int16x8_t res) {
+  const uint16x8_t a0 = vld1q_u16(*dest);
+  const uint16x8_t a1 = vld1q_u16(*dest + 8);
+  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
+  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
+  const uint16x8_t c0 = vqshluq_n_s16(b0, 0);
+  const uint16x8_t c1 = vqshluq_n_s16(b1, 0);
+  vst1q_u16(*dest, c0);
+  vst1q_u16(*dest + 8, c1);
+  *dest += stride;
+}
+
+void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest8,
+                                     int stride, int bd) {
+  const tran_low_t out0 =
+      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+  const tran_low_t out1 =
+      HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
+  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
+  const int16x8_t dc = vdupq_n_s16(a1);
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  int i;
+
+  if (a1 >= 0) {
+    const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+    for (i = 0; i < 4; ++i) {
+      highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
+      highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
+      highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
+      highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
+    }
+  } else {
+    for (i = 0; i < 4; ++i) {
+      highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
+      highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
+      highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
+      highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
+    }
+  }
+}
--- /dev/null
+++ b/vpx_dsp/arm/highbd_idct32x32_add_neon.c
@@ -1,0 +1,89 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+static INLINE void highbd_idct32x32_1_add_pos_kernel(uint16_t **dest,
+                                                     const int stride,
+                                                     const int16x8_t res,
+                                                     const int16x8_t max) {
+  const uint16x8_t a0 = vld1q_u16(*dest);
+  const uint16x8_t a1 = vld1q_u16(*dest + 8);
+  const uint16x8_t a2 = vld1q_u16(*dest + 16);
+  const uint16x8_t a3 = vld1q_u16(*dest + 24);
+  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
+  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
+  const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2));
+  const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3));
+  const int16x8_t c0 = vminq_s16(b0, max);
+  const int16x8_t c1 = vminq_s16(b1, max);
+  const int16x8_t c2 = vminq_s16(b2, max);
+  const int16x8_t c3 = vminq_s16(b3, max);
+  vst1q_u16(*dest, vreinterpretq_u16_s16(c0));
+  vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1));
+  vst1q_u16(*dest + 16, vreinterpretq_u16_s16(c2));
+  vst1q_u16(*dest + 24, vreinterpretq_u16_s16(c3));
+  *dest += stride;
+}
+
+static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest,
+                                                     const int stride,
+                                                     const int16x8_t res) {
+  const uint16x8_t a0 = vld1q_u16(*dest);
+  const uint16x8_t a1 = vld1q_u16(*dest + 8);
+  const uint16x8_t a2 = vld1q_u16(*dest + 16);
+  const uint16x8_t a3 = vld1q_u16(*dest + 24);
+  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
+  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
+  const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2));
+  const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3));
+  const uint16x8_t c0 = vqshluq_n_s16(b0, 0);
+  const uint16x8_t c1 = vqshluq_n_s16(b1, 0);
+  const uint16x8_t c2 = vqshluq_n_s16(b2, 0);
+  const uint16x8_t c3 = vqshluq_n_s16(b3, 0);
+  vst1q_u16(*dest, c0);
+  vst1q_u16(*dest + 8, c1);
+  vst1q_u16(*dest + 16, c2);
+  vst1q_u16(*dest + 24, c3);
+  *dest += stride;
+}
+
+void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest8,
+                                     int stride, int bd) {
+  const tran_low_t out0 =
+      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+  const tran_low_t out1 =
+      HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
+  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
+  const int16x8_t dc = vdupq_n_s16(a1);
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  int i;
+
+  if (a1 >= 0) {
+    const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+    for (i = 0; i < 8; ++i) {
+      highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+      highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+      highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+      highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+    }
+  } else {
+    for (i = 0; i < 8; ++i) {
+      highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+      highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+      highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+      highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+    }
+  }
+}
--- a/vpx_dsp/arm/highbd_idct8x8_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct8x8_add_neon.c
@@ -15,21 +15,29 @@
 #include "vpx_dsp/arm/transpose_neon.h"
 #include "vpx_dsp/inv_txfm.h"
 
-static INLINE void highbd_idct8x8_1_add_kernel(uint16_t **dest,
-                                               const int stride,
-                                               const int16x8_t res,
-                                               const int16x8_t max) {
+static INLINE void highbd_idct8x8_1_add_pos_kernel(uint16_t **dest,
+                                                   const int stride,
+                                                   const int16x8_t res,
+                                                   const int16x8_t max) {
   const uint16x8_t a = vld1q_u16(*dest);
   const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a));
   const int16x8_t c = vminq_s16(b, max);
-  const uint16x8_t d = vqshluq_n_s16(c, 0);
-  vst1q_u16(*dest, d);
+  vst1q_u16(*dest, vreinterpretq_u16_s16(c));
   *dest += stride;
 }
 
+static INLINE void highbd_idct8x8_1_add_neg_kernel(uint16_t **dest,
+                                                   const int stride,
+                                                   const int16x8_t res) {
+  const uint16x8_t a = vld1q_u16(*dest);
+  const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a));
+  const uint16x8_t c = vqshluq_n_s16(b, 0);
+  vst1q_u16(*dest, c);
+  *dest += stride;
+}
+
 void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int bd) {
-  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
   const tran_low_t out0 =
       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
   const tran_low_t out1 =
@@ -38,14 +46,26 @@
   const int16x8_t dc = vdupq_n_s16(a1);
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
+  if (a1 >= 0) {
+    const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+  } else {
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+  }
 }
 
 static INLINE void idct8x8_12_half1d_bd10(
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -224,6 +224,8 @@
 else  # CONFIG_VP9_HIGHBITDEPTH
 DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct4x4_add_neon.c
 DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct8x8_add_neon.c
+DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct16x16_add_neon.c
+DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct32x32_add_neon.c
 endif  # !CONFIG_VP9_HIGHBITDEPTH
 
 ifeq ($(HAVE_NEON_ASM),yes)
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -624,6 +624,7 @@
   specialize qw/vpx_highbd_idct8x8_1_add neon/;
 
   add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+  specialize qw/vpx_highbd_idct16x16_1_add neon/;
 
   add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
 
@@ -630,7 +631,7 @@
   add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
 
   add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-  specialize qw/vpx_highbd_idct32x32_1_add sse2/;
+  specialize qw/vpx_highbd_idct32x32_1_add neon sse2/;
 
   add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";