shithub: libvpx

--- a/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm

+++ /dev/null

@@ -1,199 +1,0 @@

-;

-;  Copyright (c) 2010 The Webm project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-    EXPORT  |idct_dequant_full_2x_neon|

-    ARM

-    REQUIRE8

-    PRESERVE8

-    AREA ||.text||, CODE, READONLY, ALIGN=2

-;void idct_dequant_full_2x_neon(short *q, short *dq,

-;                               unsigned char *dst, int stride);

-; r0    *q,

-; r1    *dq,

-; r2    *dst

-; r3    stride

-|idct_dequant_full_2x_neon| PROC

-    vpush           {d8-d15}

-    vld1.16         {q0, q1}, [r1]          ; dq (same l/r)

-    vld1.16         {q2, q3}, [r0]          ; l q

-    add             r0, r0, #32

-    vld1.16         {q4, q5}, [r0]          ; r q

-    add             r12, r2, #4

-    ; interleave the predictors

-    vld1.32         {d28[0]}, [r2],  r3     ; l pre

-    vld1.32         {d28[1]}, [r12], r3     ; r pre

-    vld1.32         {d29[0]}, [r2],  r3

-    vld1.32         {d29[1]}, [r12], r3

-    vld1.32         {d30[0]}, [r2],  r3

-    vld1.32         {d30[1]}, [r12], r3

-    vld1.32         {d31[0]}, [r2],  r3

-    vld1.32         {d31[1]}, [r12]

-    adr             r1, cospi8sqrt2minus1   ; pointer to the first constant

-    ; dequant: q[i] = q[i] * dq[i]

-    vmul.i16        q2, q2, q0

-    vmul.i16        q3, q3, q1

-    vmul.i16        q4, q4, q0

-    vmul.i16        q5, q5, q1

-    vld1.16         {d0}, [r1]

-    ; q2: l0r0  q3: l8r8

-    ; q4: l4r4  q5: l12r12

-    vswp            d5, d8

-    vswp            d7, d10

-    ; _CONSTANTS_ * 4,12 >> 16

-    ; q6:  4 * sinpi : c1/temp1

-    ; q7: 12 * sinpi : d1/temp2

-    ; q8:  4 * cospi

-    ; q9: 12 * cospi

-    vqdmulh.s16     q6, q4, d0[2]           ; sinpi8sqrt2

-    vqdmulh.s16     q7, q5, d0[2]

-    vqdmulh.s16     q8, q4, d0[0]           ; cospi8sqrt2minus1

-    vqdmulh.s16     q9, q5, d0[0]

-    vqadd.s16       q10, q2, q3             ; a1 = 0 + 8

-    vqsub.s16       q11, q2, q3             ; b1 = 0 - 8

-    ; vqdmulh only accepts signed values. this was a problem because

-    ; our constant had the high bit set, and was treated as a negative value.

-    ; vqdmulh also doubles the value before it shifts by 16. we need to

-    ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,

-    ; so we can shift the constant without losing precision. this avoids

-    ; shift again afterward, but also avoids the sign issue. win win!

-    ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we

-    ; pre-shift it

-    vshr.s16        q8, q8, #1

-    vshr.s16        q9, q9, #1

-    ; q4:  4 +  4 * cospi : d1/temp1

-    ; q5: 12 + 12 * cospi : c1/temp2

-    vqadd.s16       q4, q4, q8

-    vqadd.s16       q5, q5, q9

-    ; c1 = temp1 - temp2

-    ; d1 = temp1 + temp2

-    vqsub.s16       q2, q6, q5

-    vqadd.s16       q3, q4, q7

-    ; [0]: a1+d1

-    ; [1]: b1+c1

-    ; [2]: b1-c1

-    ; [3]: a1-d1

-    vqadd.s16       q4, q10, q3

-    vqadd.s16       q5, q11, q2

-    vqsub.s16       q6, q11, q2

-    vqsub.s16       q7, q10, q3

-    ; rotate

-    vtrn.32         q4, q6

-    vtrn.32         q5, q7

-    vtrn.16         q4, q5

-    vtrn.16         q6, q7

-    ; idct loop 2

-    ; q4: l 0, 4, 8,12 r 0, 4, 8,12

-    ; q5: l 1, 5, 9,13 r 1, 5, 9,13

-    ; q6: l 2, 6,10,14 r 2, 6,10,14

-    ; q7: l 3, 7,11,15 r 3, 7,11,15

-    ; q8:  1 * sinpi : c1/temp1

-    ; q9:  3 * sinpi : d1/temp2

-    ; q10: 1 * cospi

-    ; q11: 3 * cospi

-    vqdmulh.s16     q8, q5, d0[2]           ; sinpi8sqrt2

-    vqdmulh.s16     q9, q7, d0[2]

-    vqdmulh.s16     q10, q5, d0[0]          ; cospi8sqrt2minus1

-    vqdmulh.s16     q11, q7, d0[0]

-    vqadd.s16       q2, q4, q6             ; a1 = 0 + 2

-    vqsub.s16       q3, q4, q6             ; b1 = 0 - 2

-    ; see note on shifting above

-    vshr.s16        q10, q10, #1

-    vshr.s16        q11, q11, #1

-    ; q10: 1 + 1 * cospi : d1/temp1

-    ; q11: 3 + 3 * cospi : c1/temp2

-    vqadd.s16       q10, q5, q10

-    vqadd.s16       q11, q7, q11

-    ; q8: c1 = temp1 - temp2

-    ; q9: d1 = temp1 + temp2

-    vqsub.s16       q8, q8, q11

-    vqadd.s16       q9, q10, q9

-    ; a1+d1

-    ; b1+c1

-    ; b1-c1

-    ; a1-d1

-    vqadd.s16       q4, q2, q9

-    vqadd.s16       q5, q3, q8

-    vqsub.s16       q6, q3, q8

-    vqsub.s16       q7, q2, q9

-    ; +4 >> 3 (rounding)

-    vrshr.s16       q4, q4, #3              ; lo

-    vrshr.s16       q5, q5, #3

-    vrshr.s16       q6, q6, #3              ; hi

-    vrshr.s16       q7, q7, #3

-    vtrn.32         q4, q6

-    vtrn.32         q5, q7

-    vtrn.16         q4, q5

-    vtrn.16         q6, q7

-    ; adding pre

-    ; input is still packed. pre was read interleaved

-    vaddw.u8        q4, q4, d28

-    vaddw.u8        q5, q5, d29

-    vaddw.u8        q6, q6, d30

-    vaddw.u8        q7, q7, d31

-    vmov.i16        q14, #0

-    vmov            q15, q14

-    vst1.16         {q14, q15}, [r0]        ; write over high input

-    sub             r0, r0, #32

-    vst1.16         {q14, q15}, [r0]        ; write over low input

-    sub             r2, r2, r3, lsl #2      ; dst - 4*stride

-    add             r1, r2, #4              ; hi

-    ;saturate and narrow

-    vqmovun.s16     d0, q4                  ; lo

-    vqmovun.s16     d1, q5

-    vqmovun.s16     d2, q6                  ; hi

-    vqmovun.s16     d3, q7

-    vst1.32         {d0[0]}, [r2], r3       ; lo

-    vst1.32         {d0[1]}, [r1], r3       ; hi

-    vst1.32         {d1[0]}, [r2], r3

-    vst1.32         {d1[1]}, [r1], r3

-    vst1.32         {d2[0]}, [r2], r3

-    vst1.32         {d2[1]}, [r1], r3

-    vst1.32         {d3[0]}, [r2]

-    vst1.32         {d3[1]}, [r1]

-    vpop            {d8-d15}

-    bx             lr

-    ENDP           ; |idct_dequant_full_2x_neon|

-; Constant Pool

-cospi8sqrt2minus1 DCD 0x4e7b

-; because the lowest bit in 0x8a8c is 0, we can pre-shift this

-sinpi8sqrt2       DCD 0x4546

-    END

--- /dev/null

+++ b/vp8/common/arm/neon/idct_dequant_full_2x_neon.c

@@ -1,0 +1,185 @@

+/*

+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include <arm_neon.h>

+static const int16_t cospi8sqrt2minus1 = 20091;

+static const int16_t sinpi8sqrt2       = 17734;

+// sinpi8sqrt2 is 0x8a8c >> 1: the lowest bit of 0x8a8c is 0, so pre-shifting loses nothing

+void idct_dequant_full_2x_neon(  // dequantize + idct two 4x4 blocks (left, right), add into dst

+        int16_t *q,  // 32 coefficients: left block then right block; zeroed by this function

+        int16_t *dq,  // 16 dequant factors, shared by both blocks

+        unsigned char *dst,  // predictor in, result out: 4 rows of 8 bytes (left 4, right 4)

+        int stride) {  // byte distance between consecutive rows of dst

+    unsigned char *dst0, *dst1;  // row cursors: left block / right block

+    int32x2_t d28, d29, d30, d31;  // one dst row each: lane 0 = left 4 bytes, lane 1 = right 4

+    int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;

+    int16x8_t qEmpty = vdupq_n_s16(0);  // zeros written back over q

+    int32x4x2_t q2tmp0, q2tmp1;  // scratch for the 32-bit vtrn step

+    int16x8x2_t q2tmp2, q2tmp3;  // scratch for the 16-bit vtrn step

+    int16x4_t dLow0, dLow1, dHigh0, dHigh1;  // halves used to emulate vswp

+    d28 = d29 = d30 = d31 = vdup_n_s32(0);  // defined value before the per-lane loads

+    // load dq

+    q0 = vld1q_s16(dq);

+    dq += 8;

+    q1 = vld1q_s16(dq);

+    // load q, clearing each group of 8 coefficients right after it is read

+    q2 = vld1q_s16(q);  // left block, coefficients 0-7

+    vst1q_s16(q, qEmpty);

+    q += 8;

+    q3 = vld1q_s16(q);  // left block, coefficients 8-15

+    vst1q_s16(q, qEmpty);

+    q += 8;

+    q4 = vld1q_s16(q);  // right block, coefficients 0-7

+    vst1q_s16(q, qEmpty);

+    q += 8;

+    q5 = vld1q_s16(q);  // right block, coefficients 8-15

+    vst1q_s16(q, qEmpty);

+    // load src from dst

+    dst0 = dst;

+    dst1 = dst + 4;

+    d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);  // row 0, left

+    dst0 += stride;

+    d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);  // row 0, right

+    dst1 += stride;

+    d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);  // row 1, left

+    dst0 += stride;

+    d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);  // row 1, right

+    dst1 += stride;

+    d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);  // row 2, left

+    dst0 += stride;

+    d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);  // row 2, right

+    dst1 += stride;

+    d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);  // row 3, left

+    d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);  // row 3, right

+    q2 = vmulq_s16(q2, q0);  // dequant: q[i] = q[i] * dq[i]

+    q3 = vmulq_s16(q3, q1);

+    q4 = vmulq_s16(q4, q0);

+    q5 = vmulq_s16(q5, q1);

+    // vswp: pair up the same four coefficients of the left and right block

+    dLow0 = vget_low_s16(q2);

+    dHigh0 = vget_high_s16(q2);

+    dLow1 = vget_low_s16(q4);

+    dHigh1 = vget_high_s16(q4);

+    q2 = vcombine_s16(dLow0, dLow1);  // q2: l0 r0 (coefficients 0-3 of each block)

+    q4 = vcombine_s16(dHigh0, dHigh1);  // q4: l4 r4

+    dLow0 = vget_low_s16(q3);

+    dHigh0 = vget_high_s16(q3);

+    dLow1 = vget_low_s16(q5);

+    dHigh1 = vget_high_s16(q5);

+    q3 = vcombine_s16(dLow0, dLow1);  // q3: l8 r8

+    q5 = vcombine_s16(dHigh0, dHigh1);  // q5: l12 r12

+    q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);  //  4 * sinpi : temp1 for c1

+    q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);  // 12 * sinpi : temp2 for d1

+    q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);  //  4 * cospi, still doubled by vqdmulh

+    q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);  // 12 * cospi, still doubled by vqdmulh

+    q10 = vqaddq_s16(q2, q3);  // a1 = 0 + 8

+    q11 = vqsubq_s16(q2, q3);  // b1 = 0 - 8

+    q8 = vshrq_n_s16(q8, 1);  // undo the doubling; this constant is not pre-shifted

+    q9 = vshrq_n_s16(q9, 1);

+    q4 = vqaddq_s16(q4, q8);  //  4 +  4 * cospi : temp1 for d1

+    q5 = vqaddq_s16(q5, q9);  // 12 + 12 * cospi : temp2 for c1

+    q2 = vqsubq_s16(q6, q5);  // c1 = temp1 - temp2

+    q3 = vqaddq_s16(q7, q4);  // d1 = temp1 + temp2

+    q4 = vqaddq_s16(q10, q3);  // [0]: a1 + d1

+    q5 = vqaddq_s16(q11, q2);  // [1]: b1 + c1

+    q6 = vqsubq_s16(q11, q2);  // [2]: b1 - c1

+    q7 = vqsubq_s16(q10, q3);  // [3]: a1 - d1

+    q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));  // rotate

+    q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));

+    q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),

+                       vreinterpretq_s16_s32(q2tmp1.val[0]));  // val[0]: 0,4,8,12  val[1]: 1,5,9,13

+    q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),

+                       vreinterpretq_s16_s32(q2tmp1.val[1]));  // val[0]: 2,6,10,14  val[1]: 3,7,11,15

+    // loop 2: second idct pass, on the rotated data

+    q8  = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);  // 1 * sinpi : temp1 for c1

+    q9  = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);  // 3 * sinpi : temp2 for d1

+    q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);  // 1 * cospi, still doubled

+    q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);  // 3 * cospi, still doubled

+    q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);  // a1 = 0 + 2

+    q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);  // b1 = 0 - 2

+    q10 = vshrq_n_s16(q10, 1);  // undo the doubling done by vqdmulh

+    q11 = vshrq_n_s16(q11, 1);

+    q10 = vqaddq_s16(q2tmp2.val[1], q10);  // 1 + 1 * cospi : temp1 for d1

+    q11 = vqaddq_s16(q2tmp3.val[1], q11);  // 3 + 3 * cospi : temp2 for c1

+    q8 = vqsubq_s16(q8, q11);  // c1 = temp1 - temp2

+    q9 = vqaddq_s16(q9, q10);  // d1 = temp1 + temp2

+    q4 = vqaddq_s16(q2, q9);  // a1 + d1

+    q5 = vqaddq_s16(q3, q8);  // b1 + c1

+    q6 = vqsubq_s16(q3, q8);  // b1 - c1

+    q7 = vqsubq_s16(q2, q9);  // a1 - d1

+    q4 = vrshrq_n_s16(q4, 3);  // +4 >> 3 (rounding)

+    q5 = vrshrq_n_s16(q5, 3);

+    q6 = vrshrq_n_s16(q6, 3);

+    q7 = vrshrq_n_s16(q7, 3);

+    q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));  // rotate again

+    q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));

+    q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),

+                       vreinterpretq_s16_s32(q2tmp1.val[0]));

+    q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),

+                       vreinterpretq_s16_s32(q2tmp1.val[1]));

+    q4 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]),

+                                          vreinterpret_u8_s32(d28)));  // add predictor bytes from d28

+    q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]),

+                                          vreinterpret_u8_s32(d29)));  // add predictor bytes from d29

+    q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]),

+                                          vreinterpret_u8_s32(d30)));  // add predictor bytes from d30

+    q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]),

+                                          vreinterpret_u8_s32(d31)));  // add predictor bytes from d31

+    d28 = vreinterpret_s32_u8(vqmovun_s16(q4));  // saturate and narrow to unsigned 8-bit

+    d29 = vreinterpret_s32_u8(vqmovun_s16(q5));

+    d30 = vreinterpret_s32_u8(vqmovun_s16(q6));

+    d31 = vreinterpret_s32_u8(vqmovun_s16(q7));

+    dst0 = dst;  // rewind both cursors to the first row for the stores

+    dst1 = dst + 4;

+    vst1_lane_s32((int32_t *)dst0, d28, 0);  // row 0, left

+    dst0 += stride;

+    vst1_lane_s32((int32_t *)dst1, d28, 1);  // row 0, right

+    dst1 += stride;

+    vst1_lane_s32((int32_t *)dst0, d29, 0);  // row 1, left

+    dst0 += stride;

+    vst1_lane_s32((int32_t *)dst1, d29, 1);  // row 1, right

+    dst1 += stride;

+    vst1_lane_s32((int32_t *)dst0, d30, 0);  // row 2, left

+    dst0 += stride;

+    vst1_lane_s32((int32_t *)dst1, d30, 1);  // row 2, right

+    dst1 += stride;

+    vst1_lane_s32((int32_t *)dst0, d31, 0);  // row 3, left

+    vst1_lane_s32((int32_t *)dst1, d31, 1);  // row 3, right

+    return;

+}

--- a/vp8/vp8_common.mk

+++ b/vp8/vp8_common.mk

@@ -172,7 +172,6 @@

 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/sixtappredict8x8_neon$(ASM)

 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/sixtappredict16x16_neon$(ASM)

 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/buildintrapredictorsmby_neon$(ASM)

-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_dequant_full_2x_neon$(ASM)

 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_dequant_0_2x_neon$(ASM)

 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_blk_neon.c

 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/variance_neon$(ASM)

@@ -186,6 +185,7 @@

 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dc_only_idct_add_neon.c

 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dequant_idct_neon.c

 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dequantizeb_neon.c

+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_dequant_full_2x_neon.c

 $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))

--

⑨