ref: 282f36adc4d87539ef204cfc05049f50fed9734b
parent: cabfd505a44b9157550c0853e891a2b10b93a745
parent: c49b08c9a11d032849582cc17641fbbcfaabfa39
author: Dmitry Kovalev <dkovalev@google.com>
date: Mon Feb 3 09:28:47 EST 2014
Merge "Removing "_short" suffix from arm transform file names."
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm
@@ -1,0 +1,198 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_idct16x16_1_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp9_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
+; int dest_stride)
+;
+; r0  int16_t *input
+; r1  uint8_t *dest
+; r2  int dest_stride
+
+|vp9_idct16x16_1_add_neon| PROC
+ ldrsh r0, [r0]
+
+ ; generate cospi_16_64 = 11585
+ mov r12, #0x2d00
+ add r12, #0x41
+
+ ; out = dct_const_round_shift(input[0] * cospi_16_64)
+ mul r0, r0, r12 ; input[0] * cospi_16_64
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; out = dct_const_round_shift(out * cospi_16_64)
+ mul r0, r0, r12 ; out * cospi_16_64
+ mov r12, r1 ; save dest
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; a1 = ROUND_POWER_OF_TWO(out, 6)
+ add r0, r0, #32 ; + (1 <<((6) - 1))
+ asr r0, r0, #6 ; >> 6
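+
+ ; The scalar sequence above matches this C sketch (assuming
+ ; DCT_CONST_BITS == 14, which is what the #0x2000 rounding term and the
+ ; >> 14 shifts imply):
+ ;   out = dct_const_round_shift(input[0] * cospi_16_64);
+ ;   out = dct_const_round_shift(out * cospi_16_64);
+ ;   a1  = ROUND_POWER_OF_TWO(out, 6);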
+
+ vdup.s16 q0, r0 ; duplicate a1
+ mov r0, #8
+ sub r2, #8
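+ ; each 16-pixel row of dest is loaded as two 8-byte halves: the first
+ ; post-increment advances by 8 (r0), the second by dest_stride - 8 (r2),
+ ; leaving the pointer at the start of the next row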
+
+ ; load destination data row0 - row3
+ vld1.64 {d2}, [r1], r0
+ vld1.64 {d3}, [r1], r2
+ vld1.64 {d4}, [r1], r0
+ vld1.64 {d5}, [r1], r2
+ vld1.64 {d6}, [r1], r0
+ vld1.64 {d7}, [r1], r2
+ vld1.64 {d16}, [r1], r0
+ vld1.64 {d17}, [r1], r2
+
+ vaddw.u8 q9, q0, d2 ; dest[x] + a1
+ vaddw.u8 q10, q0, d3 ; dest[x] + a1
+ vaddw.u8 q11, q0, d4 ; dest[x] + a1
+ vaddw.u8 q12, q0, d5 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r0
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r0
+ vst1.64 {d31}, [r12], r2
+
+ vaddw.u8 q9, q0, d6 ; dest[x] + a1
+ vaddw.u8 q10, q0, d7 ; dest[x] + a1
+ vaddw.u8 q11, q0, d16 ; dest[x] + a1
+ vaddw.u8 q12, q0, d17 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r0
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r0
+ vst1.64 {d31}, [r12], r2
+
+ ; load destination data row4 - row7
+ vld1.64 {d2}, [r1], r0
+ vld1.64 {d3}, [r1], r2
+ vld1.64 {d4}, [r1], r0
+ vld1.64 {d5}, [r1], r2
+ vld1.64 {d6}, [r1], r0
+ vld1.64 {d7}, [r1], r2
+ vld1.64 {d16}, [r1], r0
+ vld1.64 {d17}, [r1], r2
+
+ vaddw.u8 q9, q0, d2 ; dest[x] + a1
+ vaddw.u8 q10, q0, d3 ; dest[x] + a1
+ vaddw.u8 q11, q0, d4 ; dest[x] + a1
+ vaddw.u8 q12, q0, d5 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r0
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r0
+ vst1.64 {d31}, [r12], r2
+
+ vaddw.u8 q9, q0, d6 ; dest[x] + a1
+ vaddw.u8 q10, q0, d7 ; dest[x] + a1
+ vaddw.u8 q11, q0, d16 ; dest[x] + a1
+ vaddw.u8 q12, q0, d17 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r0
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r0
+ vst1.64 {d31}, [r12], r2
+
+ ; load destination data row8 - row11
+ vld1.64 {d2}, [r1], r0
+ vld1.64 {d3}, [r1], r2
+ vld1.64 {d4}, [r1], r0
+ vld1.64 {d5}, [r1], r2
+ vld1.64 {d6}, [r1], r0
+ vld1.64 {d7}, [r1], r2
+ vld1.64 {d16}, [r1], r0
+ vld1.64 {d17}, [r1], r2
+
+ vaddw.u8 q9, q0, d2 ; dest[x] + a1
+ vaddw.u8 q10, q0, d3 ; dest[x] + a1
+ vaddw.u8 q11, q0, d4 ; dest[x] + a1
+ vaddw.u8 q12, q0, d5 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r0
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r0
+ vst1.64 {d31}, [r12], r2
+
+ vaddw.u8 q9, q0, d6 ; dest[x] + a1
+ vaddw.u8 q10, q0, d7 ; dest[x] + a1
+ vaddw.u8 q11, q0, d16 ; dest[x] + a1
+ vaddw.u8 q12, q0, d17 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r0
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r0
+ vst1.64 {d31}, [r12], r2
+
+ ; load destination data row12 - row15
+ vld1.64 {d2}, [r1], r0
+ vld1.64 {d3}, [r1], r2
+ vld1.64 {d4}, [r1], r0
+ vld1.64 {d5}, [r1], r2
+ vld1.64 {d6}, [r1], r0
+ vld1.64 {d7}, [r1], r2
+ vld1.64 {d16}, [r1], r0
+ vld1.64 {d17}, [r1], r2
+
+ vaddw.u8 q9, q0, d2 ; dest[x] + a1
+ vaddw.u8 q10, q0, d3 ; dest[x] + a1
+ vaddw.u8 q11, q0, d4 ; dest[x] + a1
+ vaddw.u8 q12, q0, d5 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r0
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r0
+ vst1.64 {d31}, [r12], r2
+
+ vaddw.u8 q9, q0, d6 ; dest[x] + a1
+ vaddw.u8 q10, q0, d7 ; dest[x] + a1
+ vaddw.u8 q11, q0, d16 ; dest[x] + a1
+ vaddw.u8 q12, q0, d17 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r0
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r0
+ vst1.64 {d31}, [r12], r2
+
+ bx lr
+ ENDP ; |vp9_idct16x16_1_add_neon|
+
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_idct16x16_add_neon.asm
@@ -1,0 +1,1179 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp9_idct16x16_256_add_neon_pass1|
+ EXPORT |vp9_idct16x16_256_add_neon_pass2|
+ EXPORT |vp9_idct16x16_10_add_neon_pass1|
+ EXPORT |vp9_idct16x16_10_add_neon_pass2|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+ ; Transpose an 8x8 16-bit data matrix. Data is loaded in q8-q15.
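+ ; The transpose works at three granularities: vswp exchanges the
+ ; off-diagonal 4x4 blocks (64-bit d-register halves), vtrn.32 transposes
+ ; the 2x2 blocks of 32-bit pairs within them, and vtrn.16 transposes the
+ ; individual 16-bit elements within each pair.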
+ MACRO
+ TRANSPOSE8X8
+ vswp d17, d24
+ vswp d23, d30
+ vswp d21, d28
+ vswp d19, d26
+ vtrn.32 q8, q10
+ vtrn.32 q9, q11
+ vtrn.32 q12, q14
+ vtrn.32 q13, q15
+ vtrn.16 q8, q9
+ vtrn.16 q10, q11
+ vtrn.16 q12, q13
+ vtrn.16 q14, q15
+ MEND
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void |vp9_idct16x16_256_add_neon_pass1|(int16_t *input,
+; int16_t *output, int output_stride)
+;
+; r0  int16_t *input
+; r1  int16_t *output
+; r2  int output_stride
+
+; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
+; will be stored back into q8-q15 registers. This function will touch q0-q7
+; registers and use them as buffer during calculation.
+|vp9_idct16x16_256_add_neon_pass1| PROC
+
+ ; TODO(hkuang): Find a better way to load the elements.
+ ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
+ vld2.s16 {q8,q9}, [r0]!
+ vld2.s16 {q9,q10}, [r0]!
+ vld2.s16 {q10,q11}, [r0]!
+ vld2.s16 {q11,q12}, [r0]!
+ vld2.s16 {q12,q13}, [r0]!
+ vld2.s16 {q13,q14}, [r0]!
+ vld2.s16 {q14,q15}, [r0]!
+ vld2.s16 {q1,q2}, [r0]!
+ vmov.s16 q15, q1
+
+ ; generate cospi_28_64 = 3196
+ mov r3, #0xc00
+ add r3, #0x7c
+
+ ; generate cospi_4_64 = 16069
+ mov r12, #0x3e00
+ add r12, #0xc5
+
+ ; transpose the input data
+ TRANSPOSE8X8
+
+ ; stage 3
+ vdup.16 d0, r3 ; duplicate cospi_28_64
+ vdup.16 d1, r12 ; duplicate cospi_4_64
+
+ ; preloading to avoid stall
+ ; generate cospi_12_64 = 13623
+ mov r3, #0x3500
+ add r3, #0x37
+
+ ; generate cospi_20_64 = 9102
+ mov r12, #0x2300
+ add r12, #0x8e
+
+ ; step2[4] * cospi_28_64
+ vmull.s16 q2, d18, d0
+ vmull.s16 q3, d19, d0
+
+ ; step2[4] * cospi_4_64
+ vmull.s16 q5, d18, d1
+ vmull.s16 q6, d19, d1
+
+ ; temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64
+ vmlsl.s16 q2, d30, d1
+ vmlsl.s16 q3, d31, d1
+
+ ; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64
+ vmlal.s16 q5, d30, d0
+ vmlal.s16 q6, d31, d0
+
+ vdup.16 d2, r3 ; duplicate cospi_12_64
+ vdup.16 d3, r12 ; duplicate cospi_20_64
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d8, q2, #14 ; >> 14
+ vqrshrn.s32 d9, q3, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d14, q5, #14 ; >> 14
+ vqrshrn.s32 d15, q6, #14 ; >> 14
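+
+ ; Note: vqrshrn.s32 ..., #14 implements dct_const_round_shift in one
+ ; instruction: it rounds (adds 1 << 13), shifts right by DCT_CONST_BITS
+ ; and saturates while narrowing to 16 bits.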
+
+ ; preloading to avoid stall
+ ; generate cospi_16_64 = 11585
+ mov r3, #0x2d00
+ add r3, #0x41
+
+ ; generate cospi_24_64 = 6270
+ mov r12, #0x1800
+ add r12, #0x7e
+
+ ; step2[5] * cospi_12_64
+ vmull.s16 q2, d26, d2
+ vmull.s16 q3, d27, d2
+
+ ; step2[5] * cospi_20_64
+ vmull.s16 q9, d26, d3
+ vmull.s16 q15, d27, d3
+
+ ; temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64
+ vmlsl.s16 q2, d22, d3
+ vmlsl.s16 q3, d23, d3
+
+ ; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64
+ vmlal.s16 q9, d22, d2
+ vmlal.s16 q15, d23, d2
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d10, q2, #14 ; >> 14
+ vqrshrn.s32 d11, q3, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d12, q9, #14 ; >> 14
+ vqrshrn.s32 d13, q15, #14 ; >> 14
+
+ ; stage 4
+ vdup.16 d30, r3 ; cospi_16_64
+
+ ; step1[0] * cospi_16_64
+ vmull.s16 q2, d16, d30
+ vmull.s16 q11, d17, d30
+
+ ; step1[1] * cospi_16_64
+ vmull.s16 q0, d24, d30
+ vmull.s16 q1, d25, d30
+
+ ; generate cospi_8_64 = 15137
+ mov r3, #0x3b00
+ add r3, #0x21
+
+ vdup.16 d30, r12 ; duplicate cospi_24_64
+ vdup.16 d31, r3 ; duplicate cospi_8_64
+
+ ; temp1 = (step1[0] + step1[1]) * cospi_16_64
+ vadd.s32 q3, q2, q0
+ vadd.s32 q12, q11, q1
+
+ ; temp2 = (step1[0] - step1[1]) * cospi_16_64
+ vsub.s32 q13, q2, q0
+ vsub.s32 q1, q11, q1
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d16, q3, #14 ; >> 14
+ vqrshrn.s32 d17, q12, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d18, q13, #14 ; >> 14
+ vqrshrn.s32 d19, q1, #14 ; >> 14
+
+ ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+ ; step1[2] * cospi_8_64
+ vmull.s16 q0, d20, d31
+ vmull.s16 q1, d21, d31
+
+ ; step1[2] * cospi_24_64
+ vmull.s16 q12, d20, d30
+ vmull.s16 q13, d21, d30
+
+ ; temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64
+ vmlal.s16 q0, d28, d30
+ vmlal.s16 q1, d29, d30
+
+ ; temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64
+ vmlsl.s16 q12, d28, d31
+ vmlsl.s16 q13, d29, d31
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d22, q0, #14 ; >> 14
+ vqrshrn.s32 d23, q1, #14 ; >> 14
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d20, q12, #14 ; >> 14
+ vqrshrn.s32 d21, q13, #14 ; >> 14
+
+ vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5];
+ vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5];
+ vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7];
+ vadd.s16 q15, q6, q7 ; step2[7] = step1[6] + step1[7];
+
+ ; generate cospi_16_64 = 11585
+ mov r3, #0x2d00
+ add r3, #0x41
+
+ ; stage 5
+ vadd.s16 q0, q8, q11 ; step1[0] = step2[0] + step2[3];
+ vadd.s16 q1, q9, q10 ; step1[1] = step2[1] + step2[2];
+ vsub.s16 q2, q9, q10 ; step1[2] = step2[1] - step2[2];
+ vsub.s16 q3, q8, q11 ; step1[3] = step2[0] - step2[3];
+
+ vdup.16 d16, r3 ; duplicate cospi_16_64
+
+ ; step2[5] * cospi_16_64
+ vmull.s16 q11, d26, d16
+ vmull.s16 q12, d27, d16
+
+ ; step2[6] * cospi_16_64
+ vmull.s16 q9, d28, d16
+ vmull.s16 q10, d29, d16
+
+ ; temp1 = (step2[6] - step2[5]) * cospi_16_64
+ vsub.s32 q6, q9, q11
+ vsub.s32 q13, q10, q12
+
+ ; temp2 = (step2[5] + step2[6]) * cospi_16_64
+ vadd.s32 q9, q9, q11
+ vadd.s32 q10, q10, q12
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d10, q6, #14 ; >> 14
+ vqrshrn.s32 d11, q13, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d12, q9, #14 ; >> 14
+ vqrshrn.s32 d13, q10, #14 ; >> 14
+
+ ; stage 6
+ vadd.s16 q8, q0, q15 ; step2[0] = step1[0] + step1[7];
+ vadd.s16 q9, q1, q6 ; step2[1] = step1[1] + step1[6];
+ vadd.s16 q10, q2, q5 ; step2[2] = step1[2] + step1[5];
+ vadd.s16 q11, q3, q4 ; step2[3] = step1[3] + step1[4];
+ vsub.s16 q12, q3, q4 ; step2[4] = step1[3] - step1[4];
+ vsub.s16 q13, q2, q5 ; step2[5] = step1[2] - step1[5];
+ vsub.s16 q14, q1, q6 ; step2[6] = step1[1] - step1[6];
+ vsub.s16 q15, q0, q15 ; step2[7] = step1[0] - step1[7];
+
+ ; store the data
+ vst1.64 {d16}, [r1], r2
+ vst1.64 {d17}, [r1], r2
+ vst1.64 {d18}, [r1], r2
+ vst1.64 {d19}, [r1], r2
+ vst1.64 {d20}, [r1], r2
+ vst1.64 {d21}, [r1], r2
+ vst1.64 {d22}, [r1], r2
+ vst1.64 {d23}, [r1], r2
+ vst1.64 {d24}, [r1], r2
+ vst1.64 {d25}, [r1], r2
+ vst1.64 {d26}, [r1], r2
+ vst1.64 {d27}, [r1], r2
+ vst1.64 {d28}, [r1], r2
+ vst1.64 {d29}, [r1], r2
+ vst1.64 {d30}, [r1], r2
+ vst1.64 {d31}, [r1], r2
+
+ bx lr
+ ENDP ; |vp9_idct16x16_256_add_neon_pass1|
+
+;void vp9_idct16x16_256_add_neon_pass2(int16_t *src,
+; int16_t *output,
+; int16_t *pass1Output,
+; int16_t skip_adding,
+; uint8_t *dest,
+; int dest_stride)
+;
+; r0 int16_t *src
+; r1  int16_t *output
+; r2  int16_t *pass1Output
+; r3  int16_t skip_adding
+; r4  uint8_t *dest
+; r5  int dest_stride
+
+; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
+; will be stored back into q8-q15 registers. This function will touch q0-q7
+; registers and use them as buffer during calculation.
+|vp9_idct16x16_256_add_neon_pass2| PROC
+ push {r3-r9}
+
+ ; TODO(hkuang): Find a better way to load the elements.
+ ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15
+ vld2.s16 {q8,q9}, [r0]!
+ vld2.s16 {q9,q10}, [r0]!
+ vld2.s16 {q10,q11}, [r0]!
+ vld2.s16 {q11,q12}, [r0]!
+ vld2.s16 {q12,q13}, [r0]!
+ vld2.s16 {q13,q14}, [r0]!
+ vld2.s16 {q14,q15}, [r0]!
+ vld2.s16 {q0,q1}, [r0]!
+ vmov.s16 q15, q0
+
+ ; generate cospi_30_64 = 1606
+ mov r3, #0x0600
+ add r3, #0x46
+
+ ; generate cospi_2_64 = 16305
+ mov r12, #0x3f00
+ add r12, #0xb1
+
+ ; transpose the input data
+ TRANSPOSE8X8
+
+ ; stage 3
+ vdup.16 d12, r3 ; duplicate cospi_30_64
+ vdup.16 d13, r12 ; duplicate cospi_2_64
+
+ ; preloading to avoid stall
+ ; generate cospi_14_64 = 12665
+ mov r3, #0x3100
+ add r3, #0x79
+
+ ; generate cospi_18_64 = 10394
+ mov r12, #0x2800
+ add r12, #0x9a
+
+ ; step1[8] * cospi_30_64
+ vmull.s16 q2, d16, d12
+ vmull.s16 q3, d17, d12
+
+ ; step1[8] * cospi_2_64
+ vmull.s16 q1, d16, d13
+ vmull.s16 q4, d17, d13
+
+ ; temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64
+ vmlsl.s16 q2, d30, d13
+ vmlsl.s16 q3, d31, d13
+
+ ; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64
+ vmlal.s16 q1, d30, d12
+ vmlal.s16 q4, d31, d12
+
+ vdup.16 d30, r3 ; duplicate cospi_14_64
+ vdup.16 d31, r12 ; duplicate cospi_18_64
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d0, q2, #14 ; >> 14
+ vqrshrn.s32 d1, q3, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d14, q1, #14 ; >> 14
+ vqrshrn.s32 d15, q4, #14 ; >> 14
+
+ ; preloading to avoid stall
+ ; generate cospi_22_64 = 7723
+ mov r3, #0x1e00
+ add r3, #0x2b
+
+ ; generate cospi_10_64 = 14449
+ mov r12, #0x3800
+ add r12, #0x71
+
+ ; step1[9] * cospi_14_64
+ vmull.s16 q2, d24, d30
+ vmull.s16 q3, d25, d30
+
+ ; step1[9] * cospi_18_64
+ vmull.s16 q4, d24, d31
+ vmull.s16 q5, d25, d31
+
+ ; temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64
+ vmlsl.s16 q2, d22, d31
+ vmlsl.s16 q3, d23, d31
+
+ ; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64
+ vmlal.s16 q4, d22, d30
+ vmlal.s16 q5, d23, d30
+
+ vdup.16 d30, r3 ; duplicate cospi_22_64
+ vdup.16 d31, r12 ; duplicate cospi_10_64
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d2, q2, #14 ; >> 14
+ vqrshrn.s32 d3, q3, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d12, q4, #14 ; >> 14
+ vqrshrn.s32 d13, q5, #14 ; >> 14
+
+ ; step1[10] * cospi_22_64
+ vmull.s16 q11, d20, d30
+ vmull.s16 q12, d21, d30
+
+ ; step1[10] * cospi_10_64
+ vmull.s16 q4, d20, d31
+ vmull.s16 q5, d21, d31
+
+ ; temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64
+ vmlsl.s16 q11, d26, d31
+ vmlsl.s16 q12, d27, d31
+
+ ; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64
+ vmlal.s16 q4, d26, d30
+ vmlal.s16 q5, d27, d30
+
+ ; preloading to avoid stall
+ ; generate cospi_6_64 = 15679
+ mov r3, #0x3d00
+ add r3, #0x3f
+
+ ; generate cospi_26_64 = 4756
+ mov r12, #0x1200
+ add r12, #0x94
+
+ vdup.16 d30, r3 ; duplicate cospi_6_64
+ vdup.16 d31, r12 ; duplicate cospi_26_64
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d4, q11, #14 ; >> 14
+ vqrshrn.s32 d5, q12, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d11, q5, #14 ; >> 14
+ vqrshrn.s32 d10, q4, #14 ; >> 14
+
+ ; step1[11] * cospi_6_64
+ vmull.s16 q10, d28, d30
+ vmull.s16 q11, d29, d30
+
+ ; step1[11] * cospi_26_64
+ vmull.s16 q12, d28, d31
+ vmull.s16 q13, d29, d31
+
+ ; temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64
+ vmlsl.s16 q10, d18, d31
+ vmlsl.s16 q11, d19, d31
+
+ ; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64
+ vmlal.s16 q12, d18, d30
+ vmlal.s16 q13, d19, d30
+
+ vsub.s16 q9, q0, q1 ; step1[9]=step2[8]-step2[9]
+ vadd.s16 q0, q0, q1 ; step1[8]=step2[8]+step2[9]
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d6, q10, #14 ; >> 14
+ vqrshrn.s32 d7, q11, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d8, q12, #14 ; >> 14
+ vqrshrn.s32 d9, q13, #14 ; >> 14
+
+ ; stage 3
+ vsub.s16 q10, q3, q2 ; step1[10]=-step2[10]+step2[11]
+ vadd.s16 q11, q2, q3 ; step1[11]=step2[10]+step2[11]
+ vadd.s16 q12, q4, q5 ; step1[12]=step2[12]+step2[13]
+ vsub.s16 q13, q4, q5 ; step1[13]=step2[12]-step2[13]
+ vsub.s16 q14, q7, q6 ; step1[14]=-step2[14]+step2[15]
+ vadd.s16 q7, q6, q7 ; step1[15]=step2[14]+step2[15]
+
+ ; stage 4
+ ; generate cospi_24_64 = 6270
+ mov r3, #0x1800
+ add r3, #0x7e
+
+ ; generate cospi_8_64 = 15137
+ mov r12, #0x3b00
+ add r12, #0x21
+
+ ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
+ vdup.16 d30, r12 ; duplicate cospi_8_64
+ vdup.16 d31, r3 ; duplicate cospi_24_64
+
+ ; step1[9] * cospi_24_64
+ vmull.s16 q2, d18, d31
+ vmull.s16 q3, d19, d31
+
+ ; step1[14] * cospi_24_64
+ vmull.s16 q4, d28, d31
+ vmull.s16 q5, d29, d31
+
+ ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64
+ vmlal.s16 q2, d28, d30
+ vmlal.s16 q3, d29, d30
+
+ ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
+ vmlsl.s16 q4, d18, d30
+ vmlsl.s16 q5, d19, d30
+
+ rsb r12, #0
+ vdup.16 d30, r12 ; duplicate -cospi_8_64
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d12, q2, #14 ; >> 14
+ vqrshrn.s32 d13, q3, #14 ; >> 14
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d2, q4, #14 ; >> 14
+ vqrshrn.s32 d3, q5, #14 ; >> 14
+
+ vmov.s16 q3, q11
+ vmov.s16 q4, q12
+
+ ; - step1[13] * cospi_8_64
+ vmull.s16 q11, d26, d30
+ vmull.s16 q12, d27, d30
+
+ ; -step1[10] * cospi_8_64
+ vmull.s16 q8, d20, d30
+ vmull.s16 q9, d21, d30
+
+ ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64
+ vmlsl.s16 q11, d20, d31
+ vmlsl.s16 q12, d21, d31
+
+ ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
+ vmlal.s16 q8, d26, d31
+ vmlal.s16 q9, d27, d31
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d4, q11, #14 ; >> 14
+ vqrshrn.s32 d5, q12, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d10, q8, #14 ; >> 14
+ vqrshrn.s32 d11, q9, #14 ; >> 14
+
+ ; stage 5
+ vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11];
+ vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10];
+ vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10];
+ vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11];
+ vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15];
+ vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14];
+ vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14];
+ vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15];
+
+ ; stage 6.
+ ; generate cospi_16_64 = 11585
+ mov r12, #0x2d00
+ add r12, #0x41
+
+ vdup.16 d14, r12 ; duplicate cospi_16_64
+
+ ; step1[13] * cospi_16_64
+ vmull.s16 q3, d26, d14
+ vmull.s16 q4, d27, d14
+
+ ; step1[10] * cospi_16_64
+ vmull.s16 q0, d20, d14
+ vmull.s16 q1, d21, d14
+
+ ; temp1 = (-step1[10] + step1[13]) * cospi_16_64
+ vsub.s32 q5, q3, q0
+ vsub.s32 q6, q4, q1
+
+ ; temp2 = (step1[10] + step1[13]) * cospi_16_64
+ vadd.s32 q10, q3, q0
+ vadd.s32 q4, q4, q1
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d4, q5, #14 ; >> 14
+ vqrshrn.s32 d5, q6, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d10, q10, #14 ; >> 14
+ vqrshrn.s32 d11, q4, #14 ; >> 14
+
+ ; step1[11] * cospi_16_64
+ vmull.s16 q0, d22, d14
+ vmull.s16 q1, d23, d14
+
+ ; step1[12] * cospi_16_64
+ vmull.s16 q13, d24, d14
+ vmull.s16 q6, d25, d14
+
+ ; temp1 = (-step1[11] + step1[12]) * cospi_16_64
+ vsub.s32 q10, q13, q0
+ vsub.s32 q4, q6, q1
+
+ ; temp2 = (step1[11] + step1[12]) * cospi_16_64
+ vadd.s32 q13, q13, q0
+ vadd.s32 q6, q6, q1
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d6, q10, #14 ; >> 14
+ vqrshrn.s32 d7, q4, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d8, q13, #14 ; >> 14
+ vqrshrn.s32 d9, q6, #14 ; >> 14
+
+ mov r4, #16 ; pass1Output stride
+ ldr r3, [sp] ; load skip_adding
+ cmp r3, #0 ; check whether dest data needs to be added
+ beq skip_adding_dest
+
+ ldr r7, [sp, #28] ; dest used to save element 0-7
+ mov r9, r7 ; save dest pointer for later use
+ ldr r8, [sp, #32] ; load dest_stride
+
+ ; stage 7
+ ; load the data in pass1
+ vld1.s16 {q0}, [r2], r4 ; load data step2[0]
+ vld1.s16 {q1}, [r2], r4 ; load data step2[1]
+ vld1.s16 {q10}, [r2], r4 ; load data step2[2]
+ vld1.s16 {q11}, [r2], r4 ; load data step2[3]
+ vld1.64 {d12}, [r7], r8 ; load destination data
+ vld1.64 {d13}, [r7], r8 ; load destination data
+ vadd.s16 q12, q0, q15 ; step2[0] + step2[15]
+ vadd.s16 q13, q1, q14 ; step2[1] + step2[14]
+ vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO
+ vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO
+ vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i]
+ vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i]
+ vqmovun.s16 d12, q12 ; clip pixel
+ vqmovun.s16 d13, q13 ; clip pixel
+ vst1.64 {d12}, [r9], r8 ; store the data
+ vst1.64 {d13}, [r9], r8 ; store the data
+ vsub.s16 q14, q1, q14 ; step2[1] - step2[14]
+ vsub.s16 q15, q0, q15 ; step2[0] - step2[15]
+ vld1.64 {d12}, [r7], r8 ; load destination data
+ vld1.64 {d13}, [r7], r8 ; load destination data
+ vadd.s16 q12, q10, q5 ; step2[2] + step2[13]
+ vadd.s16 q13, q11, q4 ; step2[3] + step2[12]
+ vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO
+ vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO
+ vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i]
+ vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i]
+ vqmovun.s16 d12, q12 ; clip pixel
+ vqmovun.s16 d13, q13 ; clip pixel
+ vst1.64 {d12}, [r9], r8 ; store the data
+ vst1.64 {d13}, [r9], r8 ; store the data
+ vsub.s16 q4, q11, q4 ; step2[3] - step2[12]
+ vsub.s16 q5, q10, q5 ; step2[2] - step2[13]
+ vld1.s16 {q0}, [r2], r4 ; load data step2[4]
+ vld1.s16 {q1}, [r2], r4 ; load data step2[5]
+ vld1.s16 {q10}, [r2], r4 ; load data step2[6]
+ vld1.s16 {q11}, [r2], r4 ; load data step2[7]
+ vld1.64 {d12}, [r7], r8 ; load destination data
+ vld1.64 {d13}, [r7], r8 ; load destination data
+ vadd.s16 q12, q0, q3 ; step2[4] + step2[11]
+ vadd.s16 q13, q1, q2 ; step2[5] + step2[10]
+ vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO
+ vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO
+ vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i]
+ vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i]
+ vqmovun.s16 d12, q12 ; clip pixel
+ vqmovun.s16 d13, q13 ; clip pixel
+ vst1.64 {d12}, [r9], r8 ; store the data
+ vst1.64 {d13}, [r9], r8 ; store the data
+ vsub.s16 q2, q1, q2 ; step2[5] - step2[10]
+ vsub.s16 q3, q0, q3 ; step2[4] - step2[11]
+ vld1.64 {d12}, [r7], r8 ; load destination data
+ vld1.64 {d13}, [r7], r8 ; load destination data
+ vadd.s16 q12, q10, q9 ; step2[6] + step2[9]
+ vadd.s16 q13, q11, q8 ; step2[7] + step2[8]
+ vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO
+ vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO
+ vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i]
+ vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i]
+ vqmovun.s16 d12, q12 ; clip pixel
+ vqmovun.s16 d13, q13 ; clip pixel
+ vst1.64 {d12}, [r9], r8 ; store the data
+ vst1.64 {d13}, [r9], r8 ; store the data
+ vld1.64 {d12}, [r7], r8 ; load destination data
+ vld1.64 {d13}, [r7], r8 ; load destination data
+ vsub.s16 q8, q11, q8 ; step2[7] - step2[8]
+ vsub.s16 q9, q10, q9 ; step2[6] - step2[9]
+
+ ; store the data output 8,9,10,11,12,13,14,15
+ vrshr.s16 q8, q8, #6 ; ROUND_POWER_OF_TWO
+ vaddw.u8 q8, q8, d12 ; + dest[j * dest_stride + i]
+ vqmovun.s16 d12, q8 ; clip pixel
+ vst1.64 {d12}, [r9], r8 ; store the data
+ vld1.64 {d12}, [r7], r8 ; load destination data
+ vrshr.s16 q9, q9, #6
+ vaddw.u8 q9, q9, d13 ; + dest[j * dest_stride + i]
+ vqmovun.s16 d13, q9 ; clip pixel
+ vst1.64 {d13}, [r9], r8 ; store the data
+ vld1.64 {d13}, [r7], r8 ; load destination data
+ vrshr.s16 q2, q2, #6
+ vaddw.u8 q2, q2, d12 ; + dest[j * dest_stride + i]
+ vqmovun.s16 d12, q2 ; clip pixel
+ vst1.64 {d12}, [r9], r8 ; store the data
+ vld1.64 {d12}, [r7], r8 ; load destination data
+ vrshr.s16 q3, q3, #6
+ vaddw.u8 q3, q3, d13 ; + dest[j * dest_stride + i]
+ vqmovun.s16 d13, q3 ; clip pixel
+ vst1.64 {d13}, [r9], r8 ; store the data
+ vld1.64 {d13}, [r7], r8 ; load destination data
+ vrshr.s16 q4, q4, #6
+ vaddw.u8 q4, q4, d12 ; + dest[j * dest_stride + i]
+ vqmovun.s16 d12, q4 ; clip pixel
+ vst1.64 {d12}, [r9], r8 ; store the data
+ vld1.64 {d12}, [r7], r8 ; load destination data
+ vrshr.s16 q5, q5, #6
+ vaddw.u8 q5, q5, d13 ; + dest[j * dest_stride + i]
+ vqmovun.s16 d13, q5 ; clip pixel
+ vst1.64 {d13}, [r9], r8 ; store the data
+ vld1.64 {d13}, [r7], r8 ; load destination data
+ vrshr.s16 q14, q14, #6
+ vaddw.u8 q14, q14, d12 ; + dest[j * dest_stride + i]
+ vqmovun.s16 d12, q14 ; clip pixel
+ vst1.64 {d12}, [r9], r8 ; store the data
+ vld1.64 {d12}, [r7], r8 ; load destination data
+ vrshr.s16 q15, q15, #6
+ vaddw.u8 q15, q15, d13 ; + dest[j * dest_stride + i]
+ vqmovun.s16 d13, q15 ; clip pixel
+ vst1.64 {d13}, [r9], r8 ; store the data
+ b end_idct16x16_pass2
+
+skip_adding_dest
+ ; stage 7
+ ; load the data in pass1
+ mov r5, #24
+ mov r3, #8
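+ ; each q register below holds an 8-element row segment, stored as two
+ ; d-register halves: advance 8 bytes (r3) after the first and 24 bytes
+ ; (r5) after the second, 32 bytes in total, i.e. one full row of the
+ ; 16x16 int16_t output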
+
+ vld1.s16 {q0}, [r2], r4 ; load data step2[0]
+ vld1.s16 {q1}, [r2], r4 ; load data step2[1]
+ vadd.s16 q12, q0, q15 ; step2[0] + step2[15]
+ vadd.s16 q13, q1, q14 ; step2[1] + step2[14]
+ vld1.s16 {q10}, [r2], r4 ; load data step2[2]
+ vld1.s16 {q11}, [r2], r4 ; load data step2[3]
+ vst1.64 {d24}, [r1], r3 ; store output[0]
+ vst1.64 {d25}, [r1], r5
+ vst1.64 {d26}, [r1], r3 ; store output[1]
+ vst1.64 {d27}, [r1], r5
+ vadd.s16 q12, q10, q5 ; step2[2] + step2[13]
+ vadd.s16 q13, q11, q4 ; step2[3] + step2[12]
+ vsub.s16 q14, q1, q14 ; step2[1] - step2[14]
+ vsub.s16 q15, q0, q15 ; step2[0] - step2[15]
+ vst1.64 {d24}, [r1], r3 ; store output[2]
+ vst1.64 {d25}, [r1], r5
+ vst1.64 {d26}, [r1], r3 ; store output[3]
+ vst1.64 {d27}, [r1], r5
+ vsub.s16 q4, q11, q4 ; step2[3] - step2[12]
+ vsub.s16 q5, q10, q5 ; step2[2] - step2[13]
+ vld1.s16 {q0}, [r2], r4 ; load data step2[4]
+ vld1.s16 {q1}, [r2], r4 ; load data step2[5]
+ vadd.s16 q12, q0, q3 ; step2[4] + step2[11]
+ vadd.s16 q13, q1, q2 ; step2[5] + step2[10]
+ vld1.s16 {q10}, [r2], r4 ; load data step2[6]
+ vld1.s16 {q11}, [r2], r4 ; load data step2[7]
+ vst1.64 {d24}, [r1], r3 ; store output[4]
+ vst1.64 {d25}, [r1], r5
+ vst1.64 {d26}, [r1], r3 ; store output[5]
+ vst1.64 {d27}, [r1], r5
+ vadd.s16 q12, q10, q9 ; step2[6] + step2[9]
+ vadd.s16 q13, q11, q8 ; step2[7] + step2[8]
+ vsub.s16 q2, q1, q2 ; step2[5] - step2[10]
+ vsub.s16 q3, q0, q3 ; step2[4] - step2[11]
+ vsub.s16 q8, q11, q8 ; step2[7] - step2[8]
+ vsub.s16 q9, q10, q9 ; step2[6] - step2[9]
+ vst1.64 {d24}, [r1], r3 ; store output[6]
+ vst1.64 {d25}, [r1], r5
+ vst1.64 {d26}, [r1], r3 ; store output[7]
+ vst1.64 {d27}, [r1], r5
+
+ ; store the data output 8,9,10,11,12,13,14,15
+ vst1.64 {d16}, [r1], r3
+ vst1.64 {d17}, [r1], r5
+ vst1.64 {d18}, [r1], r3
+ vst1.64 {d19}, [r1], r5
+ vst1.64 {d4}, [r1], r3
+ vst1.64 {d5}, [r1], r5
+ vst1.64 {d6}, [r1], r3
+ vst1.64 {d7}, [r1], r5
+ vst1.64 {d8}, [r1], r3
+ vst1.64 {d9}, [r1], r5
+ vst1.64 {d10}, [r1], r3
+ vst1.64 {d11}, [r1], r5
+ vst1.64 {d28}, [r1], r3
+ vst1.64 {d29}, [r1], r5
+ vst1.64 {d30}, [r1], r3
+ vst1.64 {d31}, [r1], r5
+end_idct16x16_pass2
+ pop {r3-r9}
+ bx lr
+ ENDP ; |vp9_idct16x16_256_add_neon_pass2|
+
+;void |vp9_idct16x16_10_add_neon_pass1|(int16_t *input,
+; int16_t *output, int output_stride)
+;
+; r0  int16_t *input
+; r1  int16_t *output
+; r2  int output_stride
+
+; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
+; will be stored back into q8-q15 registers. This function will touch q0-q7
+; registers and use them as buffer during calculation.
+|vp9_idct16x16_10_add_neon_pass1| PROC
+
+ ; TODO(hkuang): Find a better way to load the elements.
+ ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
+ vld2.s16 {q8,q9}, [r0]!
+ vld2.s16 {q9,q10}, [r0]!
+ vld2.s16 {q10,q11}, [r0]!
+ vld2.s16 {q11,q12}, [r0]!
+ vld2.s16 {q12,q13}, [r0]!
+ vld2.s16 {q13,q14}, [r0]!
+ vld2.s16 {q14,q15}, [r0]!
+ vld2.s16 {q1,q2}, [r0]!
+ vmov.s16 q15, q1
+
+ ; generate cospi_28_64*2 = 6392
+ mov r3, #0x1800
+ add r3, #0xf8
+
+ ; generate cospi_4_64*2 = 32138
+ mov r12, #0x7d00
+ add r12, #0x8a
+
+ ; transpose the input data
+ TRANSPOSE8X8
+
+ ; stage 3
+ vdup.16 q0, r3 ; duplicate cospi_28_64*2
+ vdup.16 q1, r12 ; duplicate cospi_4_64*2
+
+ ; The following instructions use vqrdmulh to do the
+ ; dct_const_round_shift(step2[4] * cospi_28_64). vqrdmulh will multiply,
+ ; double, and return the high 16 bits, effectively giving >> 15. Doubling
+ ; the constant will change this to >> 14.
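+ ; In C terms (a sketch of the identity being used):
+ ;   vqrdmulh(a, 2 * c) == (2 * a * (2 * c) + (1 << 15)) >> 16
+ ;                      == (a * c + (1 << 13)) >> 14
+ ;                      == dct_const_round_shift(a * c), DCT_CONST_BITS == 14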
+ ; dct_const_round_shift(step2[4] * cospi_28_64);
+ vqrdmulh.s16 q4, q9, q0
+
+ ; preloading to avoid stall
+ ; generate cospi_16_64*2 = 23170
+ mov r3, #0x5a00
+ add r3, #0x82
+
+ ; dct_const_round_shift(step2[4] * cospi_4_64);
+ vqrdmulh.s16 q7, q9, q1
+
+ ; stage 4
+ vdup.16 q1, r3 ; cospi_16_64*2
+
+ ; generate cospi_16_64 = 11585
+ mov r3, #0x2d00
+ add r3, #0x41
+
+ vdup.16 d4, r3 ; duplicate cospi_16_64
+
+ ; dct_const_round_shift(step1[0] * cospi_16_64)
+ vqrdmulh.s16 q8, q8, q1
+
+ ; step2[6] * cospi_16_64
+ vmull.s16 q9, d14, d4
+ vmull.s16 q10, d15, d4
+
+ ; step2[5] * cospi_16_64
+ vmull.s16 q12, d9, d4
+ vmull.s16 q11, d8, d4
+
+ ; temp1 = (step2[6] - step2[5]) * cospi_16_64
+ vsub.s32 q15, q10, q12
+ vsub.s32 q6, q9, q11
+
+ ; temp2 = (step2[5] + step2[6]) * cospi_16_64
+ vadd.s32 q9, q9, q11
+ vadd.s32 q10, q10, q12
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d11, q15, #14 ; >> 14
+ vqrshrn.s32 d10, q6, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d12, q9, #14 ; >> 14
+ vqrshrn.s32 d13, q10, #14 ; >> 14
+
+ ; stage 6
+ vadd.s16 q2, q8, q7 ; step2[0] = step1[0] + step1[7];
+ vadd.s16 q10, q8, q5 ; step2[2] = step1[2] + step1[5];
+ vadd.s16 q11, q8, q4 ; step2[3] = step1[3] + step1[4];
+ vadd.s16 q9, q8, q6 ; step2[1] = step1[1] + step1[6];
+ vsub.s16 q12, q8, q4 ; step2[4] = step1[3] - step1[4];
+ vsub.s16 q13, q8, q5 ; step2[5] = step1[2] - step1[5];
+ vsub.s16 q14, q8, q6 ; step2[6] = step1[1] - step1[6];
+ vsub.s16 q15, q8, q7 ; step2[7] = step1[0] - step1[7];
+
+ ; store the data
+ vst1.64 {d4}, [r1], r2
+ vst1.64 {d5}, [r1], r2
+ vst1.64 {d18}, [r1], r2
+ vst1.64 {d19}, [r1], r2
+ vst1.64 {d20}, [r1], r2
+ vst1.64 {d21}, [r1], r2
+ vst1.64 {d22}, [r1], r2
+ vst1.64 {d23}, [r1], r2
+ vst1.64 {d24}, [r1], r2
+ vst1.64 {d25}, [r1], r2
+ vst1.64 {d26}, [r1], r2
+ vst1.64 {d27}, [r1], r2
+ vst1.64 {d28}, [r1], r2
+ vst1.64 {d29}, [r1], r2
+ vst1.64 {d30}, [r1], r2
+ vst1.64 {d31}, [r1], r2
+
+ bx lr
+ ENDP ; |vp9_idct16x16_10_add_neon_pass1|
+
+;void vp9_idct16x16_10_add_neon_pass2(int16_t *src,
+; int16_t *output,
+; int16_t *pass1Output,
+; int16_t skip_adding,
+; uint8_t *dest,
+; int dest_stride)
+;
+; r0 int16_t *src
+; r1  int16_t *output
+; r2  int16_t *pass1Output
+; r3  int16_t skip_adding
+; r4  uint8_t *dest
+; r5  int dest_stride
+
+; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
+; will be stored back into q8-q15 registers. This function will touch q0-q7
+; registers and use them as buffer during calculation.
+|vp9_idct16x16_10_add_neon_pass2| PROC
+ push {r3-r9}
+
+ ; TODO(hkuang): Find a better way to load the elements.
+ ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15
+ vld2.s16 {q8,q9}, [r0]!
+ vld2.s16 {q9,q10}, [r0]!
+ vld2.s16 {q10,q11}, [r0]!
+ vld2.s16 {q11,q12}, [r0]!
+ vld2.s16 {q12,q13}, [r0]!
+ vld2.s16 {q13,q14}, [r0]!
+ vld2.s16 {q14,q15}, [r0]!
+ vld2.s16 {q0,q1}, [r0]!
+ vmov.s16 q15, q0
+
+ ; generate 2*cospi_30_64 = 3212
+ mov r3, #0xc00
+ add r3, #0x8c
+
+ ; generate 2*cospi_2_64 = 32610
+ mov r12, #0x7f00
+ add r12, #0x62
+
+ ; transpose the input data
+ TRANSPOSE8X8
+
+ ; stage 3
+ vdup.16 q6, r3 ; duplicate 2*cospi_30_64
+
+ ; dct_const_round_shift(step1[8] * cospi_30_64)
+ vqrdmulh.s16 q0, q8, q6
+
+ vdup.16 q6, r12 ; duplicate 2*cospi_2_64
+
+ ; dct_const_round_shift(step1[8] * cospi_2_64)
+ vqrdmulh.s16 q7, q8, q6
+
+ ; preloading to avoid stall
+ ; generate 2*cospi_26_64 = 9512
+ mov r12, #0x2500
+ add r12, #0x28
+ rsb r12, #0
+ vdup.16 q15, r12 ; duplicate -2*cospi_26_64
+
+ ; generate 2*cospi_6_64 = 31358
+ mov r3, #0x7a00
+ add r3, #0x7e
+ vdup.16 q14, r3 ; duplicate 2*cospi_6_64
+
+ ; dct_const_round_shift(- step1[12] * cospi_26_64)
+ vqrdmulh.s16 q3, q9, q15
+
+ ; dct_const_round_shift(step1[12] * cospi_6_64)
+ vqrdmulh.s16 q4, q9, q14
+
+ ; stage 4
+ ; generate cospi_24_64 = 6270
+ mov r3, #0x1800
+ add r3, #0x7e
+ vdup.16 d31, r3 ; duplicate cospi_24_64
+
+ ; generate cospi_8_64 = 15137
+ mov r12, #0x3b00
+ add r12, #0x21
+ vdup.16 d30, r12 ; duplicate cospi_8_64
+
+ ; step1[14] * cospi_24_64
+ vmull.s16 q12, d14, d31
+ vmull.s16 q5, d15, d31
+
+ ; step1[9] * cospi_24_64
+ vmull.s16 q2, d0, d31
+ vmull.s16 q11, d1, d31
+
+ ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
+ vmlsl.s16 q12, d0, d30
+ vmlsl.s16 q5, d1, d30
+
+ ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64
+ vmlal.s16 q2, d14, d30
+ vmlal.s16 q11, d15, d30
+
+ rsb r12, #0
+ vdup.16 d30, r12 ; duplicate -cospi_8_64
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d2, q12, #14 ; >> 14
+ vqrshrn.s32 d3, q5, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d12, q2, #14 ; >> 14
+ vqrshrn.s32 d13, q11, #14 ; >> 14
+
+ ; - step1[13] * cospi_8_64
+ vmull.s16 q10, d8, d30
+ vmull.s16 q13, d9, d30
+
+ ; -step1[10] * cospi_8_64
+ vmull.s16 q8, d6, d30
+ vmull.s16 q9, d7, d30
+
+ ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64
+ vmlsl.s16 q10, d6, d31
+ vmlsl.s16 q13, d7, d31
+
+ ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
+ vmlal.s16 q8, d8, d31
+ vmlal.s16 q9, d9, d31
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d4, q10, #14 ; >> 14
+ vqrshrn.s32 d5, q13, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d10, q8, #14 ; >> 14
+ vqrshrn.s32 d11, q9, #14 ; >> 14
+
+ ; stage 5
+ vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11];
+ vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10];
+ vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10];
+ vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11];
+ vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15];
+ vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14];
+ vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14];
+ vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15];
+
+ ; stage 6.
+ ; generate cospi_16_64 = 11585
+ mov r12, #0x2d00
+ add r12, #0x41
+
+ vdup.16 d14, r12 ; duplicate cospi_16_64
+
+ ; step1[13] * cospi_16_64
+ vmull.s16 q3, d26, d14
+ vmull.s16 q4, d27, d14
+
+ ; step1[10] * cospi_16_64
+ vmull.s16 q0, d20, d14
+ vmull.s16 q1, d21, d14
+
+ ; temp1 = (-step1[10] + step1[13]) * cospi_16_64
+ vsub.s32 q5, q3, q0
+ vsub.s32 q6, q4, q1
+
+ ; temp2 = (step1[10] + step1[13]) * cospi_16_64
+ vadd.s32 q0, q3, q0
+ vadd.s32 q1, q4, q1
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d4, q5, #14 ; >> 14
+ vqrshrn.s32 d5, q6, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d10, q0, #14 ; >> 14
+ vqrshrn.s32 d11, q1, #14 ; >> 14
+
+ ; step1[11] * cospi_16_64
+ vmull.s16 q0, d22, d14
+ vmull.s16 q1, d23, d14
+
+ ; step1[12] * cospi_16_64
+ vmull.s16 q13, d24, d14
+ vmull.s16 q6, d25, d14
+
+ ; temp1 = (-step1[11] + step1[12]) * cospi_16_64
+ vsub.s32 q10, q13, q0
+ vsub.s32 q4, q6, q1
+
+ ; temp2 = (step1[11] + step1[12]) * cospi_16_64
+ vadd.s32 q13, q13, q0
+ vadd.s32 q6, q6, q1
+
+ ; dct_const_round_shift(temp1)
+ vqrshrn.s32 d6, q10, #14 ; >> 14
+ vqrshrn.s32 d7, q4, #14 ; >> 14
+
+ ; dct_const_round_shift(temp2)
+ vqrshrn.s32 d8, q13, #14 ; >> 14
+ vqrshrn.s32 d9, q6, #14 ; >> 14
+
+ mov r4, #16 ; pass1Output stride
+ ldr r3, [sp] ; load skip_adding
+
+ ; stage 7
+ ; load the data in pass1
+ mov r5, #24
+ mov r3, #8
+
+ vld1.s16 {q0}, [r2], r4 ; load data step2[0]
+ vld1.s16 {q1}, [r2], r4 ; load data step2[1]
+ vadd.s16 q12, q0, q15 ; step2[0] + step2[15]
+ vadd.s16 q13, q1, q14 ; step2[1] + step2[14]
+ vld1.s16 {q10}, [r2], r4 ; load data step2[2]
+ vld1.s16 {q11}, [r2], r4 ; load data step2[3]
+ vst1.64 {d24}, [r1], r3 ; store output[0]
+ vst1.64 {d25}, [r1], r5
+ vst1.64 {d26}, [r1], r3 ; store output[1]
+ vst1.64 {d27}, [r1], r5
+ vadd.s16 q12, q10, q5 ; step2[2] + step2[13]
+ vadd.s16 q13, q11, q4 ; step2[3] + step2[12]
+ vsub.s16 q14, q1, q14 ; step2[1] - step2[14]
+ vsub.s16 q15, q0, q15 ; step2[0] - step2[15]
+ vst1.64 {d24}, [r1], r3 ; store output[2]
+ vst1.64 {d25}, [r1], r5
+ vst1.64 {d26}, [r1], r3 ; store output[3]
+ vst1.64 {d27}, [r1], r5
+ vsub.s16 q4, q11, q4 ; step2[3] - step2[12]
+ vsub.s16 q5, q10, q5 ; step2[2] - step2[13]
+ vld1.s16 {q0}, [r2], r4 ; load data step2[4]
+ vld1.s16 {q1}, [r2], r4 ; load data step2[5]
+ vadd.s16 q12, q0, q3 ; step2[4] + step2[11]
+ vadd.s16 q13, q1, q2 ; step2[5] + step2[10]
+ vld1.s16 {q10}, [r2], r4 ; load data step2[6]
+ vld1.s16 {q11}, [r2], r4 ; load data step2[7]
+ vst1.64 {d24}, [r1], r3 ; store output[4]
+ vst1.64 {d25}, [r1], r5
+ vst1.64 {d26}, [r1], r3 ; store output[5]
+ vst1.64 {d27}, [r1], r5
+ vadd.s16 q12, q10, q9 ; step2[6] + step2[9]
+ vadd.s16 q13, q11, q8 ; step2[7] + step2[8]
+ vsub.s16 q2, q1, q2 ; step2[5] - step2[10]
+ vsub.s16 q3, q0, q3 ; step2[4] - step2[11]
+ vsub.s16 q8, q11, q8 ; step2[7] - step2[8]
+ vsub.s16 q9, q10, q9 ; step2[6] - step2[9]
+ vst1.64 {d24}, [r1], r3 ; store output[6]
+ vst1.64 {d25}, [r1], r5
+ vst1.64 {d26}, [r1], r3 ; store output[7]
+ vst1.64 {d27}, [r1], r5
+
+ ; store the data output 8,9,10,11,12,13,14,15
+ vst1.64 {d16}, [r1], r3
+ vst1.64 {d17}, [r1], r5
+ vst1.64 {d18}, [r1], r3
+ vst1.64 {d19}, [r1], r5
+ vst1.64 {d4}, [r1], r3
+ vst1.64 {d5}, [r1], r5
+ vst1.64 {d6}, [r1], r3
+ vst1.64 {d7}, [r1], r5
+ vst1.64 {d8}, [r1], r3
+ vst1.64 {d9}, [r1], r5
+ vst1.64 {d10}, [r1], r3
+ vst1.64 {d11}, [r1], r5
+ vst1.64 {d28}, [r1], r3
+ vst1.64 {d29}, [r1], r5
+ vst1.64 {d30}, [r1], r3
+ vst1.64 {d31}, [r1], r5
+end_idct10_16x16_pass2
+ pop {r3-r9}
+ bx lr
+ ENDP ; |vp9_idct16x16_10_add_neon_pass2|
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm
@@ -1,0 +1,144 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+ EXPORT |vp9_idct32x32_1_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+ ;TODO(hkuang): put the following macros in a separate
+ ;file so other idct functions could also use them.
+ MACRO
+ LD_16x8 $src, $stride
+ vld1.8 {q8}, [$src], $stride
+ vld1.8 {q9}, [$src], $stride
+ vld1.8 {q10}, [$src], $stride
+ vld1.8 {q11}, [$src], $stride
+ vld1.8 {q12}, [$src], $stride
+ vld1.8 {q13}, [$src], $stride
+ vld1.8 {q14}, [$src], $stride
+ vld1.8 {q15}, [$src], $stride
+ MEND
+
+ MACRO
+ ADD_DIFF_16x8 $diff
+ vqadd.u8 q8, q8, $diff
+ vqadd.u8 q9, q9, $diff
+ vqadd.u8 q10, q10, $diff
+ vqadd.u8 q11, q11, $diff
+ vqadd.u8 q12, q12, $diff
+ vqadd.u8 q13, q13, $diff
+ vqadd.u8 q14, q14, $diff
+ vqadd.u8 q15, q15, $diff
+ MEND
+
+ MACRO
+ SUB_DIFF_16x8 $diff
+ vqsub.u8 q8, q8, $diff
+ vqsub.u8 q9, q9, $diff
+ vqsub.u8 q10, q10, $diff
+ vqsub.u8 q11, q11, $diff
+ vqsub.u8 q12, q12, $diff
+ vqsub.u8 q13, q13, $diff
+ vqsub.u8 q14, q14, $diff
+ vqsub.u8 q15, q15, $diff
+ MEND
+
+ MACRO
+ ST_16x8 $dst, $stride
+ vst1.8 {q8}, [$dst], $stride
+ vst1.8 {q9}, [$dst], $stride
+ vst1.8 {q10},[$dst], $stride
+ vst1.8 {q11},[$dst], $stride
+ vst1.8 {q12},[$dst], $stride
+ vst1.8 {q13},[$dst], $stride
+ vst1.8 {q14},[$dst], $stride
+ vst1.8 {q15},[$dst], $stride
+ MEND
+
+;void vp9_idct32x32_1_add_neon(int16_t *input, uint8_t *dest,
+; int dest_stride)
+;
+; r0  int16_t *input
+; r1 uint8_t *dest
+; r2 int dest_stride
+
+|vp9_idct32x32_1_add_neon| PROC
+ push {lr}
+ pld [r1]
+ add r3, r1, #16 ; r3 dest + 16 for second loop
+ ldrsh r0, [r0]
+
+ ; generate cospi_16_64 = 11585
+ mov r12, #0x2d00
+ add r12, #0x41
+
+ ; out = dct_const_round_shift(input[0] * cospi_16_64)
+ mul r0, r0, r12 ; input[0] * cospi_16_64
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; out = dct_const_round_shift(out * cospi_16_64)
+ mul r0, r0, r12 ; out * cospi_16_64
+ mov r12, r1 ; save dest
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; a1 = ROUND_POWER_OF_TWO(out, 6)
+ add r0, r0, #32 ; + (1 <<((6) - 1))
+ asrs r0, r0, #6 ; >> 6
+ bge diff_positive_32_32
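+
+ ; The sign split below lets the update use saturating unsigned byte ops
+ ; (a C sketch; sat_u8 is a hypothetical clamp to [0, 255]):
+ ;   if (a1 >= 0) dest[i] = sat_u8(dest[i] + a1);     ; vqadd.u8
+ ;   else         dest[i] = sat_u8(dest[i] - (-a1));  ; vqsub.u8
+ ; usat first clamps the broadcast |a1| itself to 8 bits.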
+
+diff_negative_32_32
+ neg r0, r0
+ usat r0, #8, r0
+ vdup.u8 q0, r0
+ mov r0, #4
+
+diff_negative_32_32_loop
+ sub r0, #1
+ LD_16x8 r1, r2
+ SUB_DIFF_16x8 q0
+ ST_16x8 r12, r2
+
+ LD_16x8 r1, r2
+ SUB_DIFF_16x8 q0
+ ST_16x8 r12, r2
+ cmp r0, #2
+ moveq r1, r3
+ moveq r12, r3
+ cmp r0, #0
+ bne diff_negative_32_32_loop
+ pop {pc}
+
+diff_positive_32_32
+ usat r0, #8, r0
+ vdup.u8 q0, r0
+ mov r0, #4
+
+diff_positive_32_32_loop
+ sub r0, #1
+ LD_16x8 r1, r2
+ ADD_DIFF_16x8 q0
+ ST_16x8 r12, r2
+
+ LD_16x8 r1, r2
+ ADD_DIFF_16x8 q0
+ ST_16x8 r12, r2
+ cmp r0, #2
+ moveq r1, r3
+ moveq r12, r3
+ cmp r0, #0
+ bne diff_positive_32_32_loop
+ pop {pc}
+
+ ENDP ; |vp9_idct32x32_1_add_neon|
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm
@@ -1,0 +1,1299 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+;TODO(cd): adjust these constants to be able to use vqdmulh for faster
+; dct_const_round_shift(a * b) within butterfly calculations.
+cospi_1_64 EQU 16364
+cospi_2_64 EQU 16305
+cospi_3_64 EQU 16207
+cospi_4_64 EQU 16069
+cospi_5_64 EQU 15893
+cospi_6_64 EQU 15679
+cospi_7_64 EQU 15426
+cospi_8_64 EQU 15137
+cospi_9_64 EQU 14811
+cospi_10_64 EQU 14449
+cospi_11_64 EQU 14053
+cospi_12_64 EQU 13623
+cospi_13_64 EQU 13160
+cospi_14_64 EQU 12665
+cospi_15_64 EQU 12140
+cospi_16_64 EQU 11585
+cospi_17_64 EQU 11003
+cospi_18_64 EQU 10394
+cospi_19_64 EQU 9760
+cospi_20_64 EQU 9102
+cospi_21_64 EQU 8423
+cospi_22_64 EQU 7723
+cospi_23_64 EQU 7005
+cospi_24_64 EQU 6270
+cospi_25_64 EQU 5520
+cospi_26_64 EQU 4756
+cospi_27_64 EQU 3981
+cospi_28_64 EQU 3196
+cospi_29_64 EQU 2404
+cospi_30_64 EQU 1606
+cospi_31_64 EQU 804
+
+
+ EXPORT |vp9_idct32x32_1024_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+ AREA Block, CODE, READONLY
+
+ ; --------------------------------------------------------------------------
+ ; Load from transposed_buffer
+ ; q13 = transposed_buffer[first_offset]
+ ; q14 = transposed_buffer[second_offset]
+ ; for proper address calculation, the last offset used when manipulating
+ ; transposed_buffer must be passed in. Use 0 for the first use.
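+ ; Example: LOAD_FROM_TRANSPOSED 0, 1, 31 advances r0 by (1 - 0)*16 bytes
+ ; to row 1 of the 8-element-wide buffer, loads q14, then advances by
+ ; (31 - 1)*16 bytes and loads q13 from row 31.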
+ MACRO
+ LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset
+ ; address calculation with proper stride and loading
+ add r0, #($first_offset - $prev_offset )*8*2
+ vld1.s16 {q14}, [r0]
+ add r0, #($second_offset - $first_offset)*8*2
+ vld1.s16 {q13}, [r0]
+ ; (used) two registers (q14, q13)
+ MEND
+ ; --------------------------------------------------------------------------
+ ; Load from output (used as temporary storage)
+ ; reg1 = output[first_offset]
+ ; reg2 = output[second_offset]
+ ; for proper address calculation, the last offset used when manipulating
+ ; output (whether reading or storing) must be passed in. Use 0 for the
+ ; first use.
+ MACRO
+ LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
+ ; address calculation with proper stride and loading
+ add r1, #($first_offset - $prev_offset )*32*2
+ vld1.s16 {$reg1}, [r1]
+ add r1, #($second_offset - $first_offset)*32*2
+ vld1.s16 {$reg2}, [r1]
+ ; (used) two registers ($reg1, $reg2)
+ MEND
+ ; --------------------------------------------------------------------------
+ ; Store into output (sometimes used as temporary storage)
+ ; output[first_offset] = reg1
+ ; output[second_offset] = reg2
+ ; for proper address calculation, the last offset used when manipulating
+ ; output (whether reading or storing) must be passed in. Use 0 for the
+ ; first use.
+ MACRO
+ STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
+ ; address calculation with proper stride and storing
+ add r1, #($first_offset - $prev_offset )*32*2
+ vst1.16 {$reg1}, [r1]
+ add r1, #($second_offset - $first_offset)*32*2
+ vst1.16 {$reg2}, [r1]
+ MEND
+ ; --------------------------------------------------------------------------
+ ; Combine-add results with current destination content
+ ; q6-q9 contain the results (out[j * 32 + 0-31])
+ MACRO
+ STORE_COMBINE_CENTER_RESULTS
+ ; load dest[j * dest_stride + 0-31]
+ vld1.s16 {d8}, [r10], r2
+ vld1.s16 {d11}, [r9], r11
+ vld1.s16 {d9}, [r10]
+ vld1.s16 {d10}, [r9]
+ ; ROUND_POWER_OF_TWO
+ vrshr.s16 q7, q7, #6
+ vrshr.s16 q8, q8, #6
+ vrshr.s16 q9, q9, #6
+ vrshr.s16 q6, q6, #6
+ ; add to dest[j * dest_stride + 0-31]
+ vaddw.u8 q7, q7, d9
+ vaddw.u8 q8, q8, d10
+ vaddw.u8 q9, q9, d11
+ vaddw.u8 q6, q6, d8
+ ; clip pixel
+ vqmovun.s16 d9, q7
+ vqmovun.s16 d10, q8
+ vqmovun.s16 d11, q9
+ vqmovun.s16 d8, q6
+ ; store back into dest[j * dest_stride + 0-31]
+ vst1.16 {d9}, [r10], r11
+ vst1.16 {d10}, [r9], r2
+ vst1.16 {d8}, [r10]
+ vst1.16 {d11}, [r9]
+ ; update pointers (by dest_stride * 2)
+ sub r9, r9, r2, lsl #1
+ add r10, r10, r2, lsl #1
+ MEND
+ ; --------------------------------------------------------------------------
+ ; Combine-add results with current destination content
+ ; q6-q9 contain the results (out[j * 32 + 0-31])
+ MACRO
+ STORE_COMBINE_CENTER_RESULTS_LAST
+ ; load dest[j * dest_stride + 0-31]
+ vld1.s16 {d8}, [r10], r2
+ vld1.s16 {d11}, [r9], r11
+ vld1.s16 {d9}, [r10]
+ vld1.s16 {d10}, [r9]
+ ; ROUND_POWER_OF_TWO
+ vrshr.s16 q7, q7, #6
+ vrshr.s16 q8, q8, #6
+ vrshr.s16 q9, q9, #6
+ vrshr.s16 q6, q6, #6
+ ; add to dest[j * dest_stride + 0-31]
+ vaddw.u8 q7, q7, d9
+ vaddw.u8 q8, q8, d10
+ vaddw.u8 q9, q9, d11
+ vaddw.u8 q6, q6, d8
+ ; clip pixel
+ vqmovun.s16 d9, q7
+ vqmovun.s16 d10, q8
+ vqmovun.s16 d11, q9
+ vqmovun.s16 d8, q6
+ ; store back into dest[j * dest_stride + 0-31]
+ vst1.16 {d9}, [r10], r11
+ vst1.16 {d10}, [r9], r2
+ vst1.16 {d8}, [r10]!
+ vst1.16 {d11}, [r9]!
+ ; update pointers (by dest_stride * 2)
+ sub r9, r9, r2, lsl #1
+ add r10, r10, r2, lsl #1
+ MEND
+ ; --------------------------------------------------------------------------
+ ; Combine-add results with current destination content
+ ; q4-q7 contain the results (out[j * 32 + 0-31])
+ MACRO
+ STORE_COMBINE_EXTREME_RESULTS
+ ; load dest[j * dest_stride + 0-31]
+ vld1.s16 {d4}, [r7], r2
+ vld1.s16 {d7}, [r6], r11
+ vld1.s16 {d5}, [r7]
+ vld1.s16 {d6}, [r6]
+ ; ROUND_POWER_OF_TWO
+ vrshr.s16 q5, q5, #6
+ vrshr.s16 q6, q6, #6
+ vrshr.s16 q7, q7, #6
+ vrshr.s16 q4, q4, #6
+ ; add to dest[j * dest_stride + 0-31]
+ vaddw.u8 q5, q5, d5
+ vaddw.u8 q6, q6, d6
+ vaddw.u8 q7, q7, d7
+ vaddw.u8 q4, q4, d4
+ ; clip pixel
+ vqmovun.s16 d5, q5
+ vqmovun.s16 d6, q6
+ vqmovun.s16 d7, q7
+ vqmovun.s16 d4, q4
+ ; store back into dest[j * dest_stride + 0-31]
+ vst1.16 {d5}, [r7], r11
+ vst1.16 {d6}, [r6], r2
+ vst1.16 {d7}, [r6]
+ vst1.16 {d4}, [r7]
+ ; update pointers (by dest_stride * 2)
+ sub r6, r6, r2, lsl #1
+ add r7, r7, r2, lsl #1
+ MEND
+ ; --------------------------------------------------------------------------
+ ; Combine-add results with current destination content
+ ; q4-q7 contain the results (out[j * 32 + 0-31])
+ MACRO
+ STORE_COMBINE_EXTREME_RESULTS_LAST
+ ; load dest[j * dest_stride + 0-31]
+ vld1.s16 {d4}, [r7], r2
+ vld1.s16 {d7}, [r6], r11
+ vld1.s16 {d5}, [r7]
+ vld1.s16 {d6}, [r6]
+ ; ROUND_POWER_OF_TWO
+ vrshr.s16 q5, q5, #6
+ vrshr.s16 q6, q6, #6
+ vrshr.s16 q7, q7, #6
+ vrshr.s16 q4, q4, #6
+ ; add to dest[j * dest_stride + 0-31]
+ vaddw.u8 q5, q5, d5
+ vaddw.u8 q6, q6, d6
+ vaddw.u8 q7, q7, d7
+ vaddw.u8 q4, q4, d4
+ ; clip pixel
+ vqmovun.s16 d5, q5
+ vqmovun.s16 d6, q6
+ vqmovun.s16 d7, q7
+ vqmovun.s16 d4, q4
+ ; store back into dest[j * dest_stride + 0-31]
+ vst1.16 {d5}, [r7], r11
+ vst1.16 {d6}, [r6], r2
+ vst1.16 {d7}, [r6]!
+ vst1.16 {d4}, [r7]!
+ ; update pointers (by dest_stride * 2)
+ sub r6, r6, r2, lsl #1
+ add r7, r7, r2, lsl #1
+ MEND
+ ; --------------------------------------------------------------------------
+ ; Touches q8-q12, q15 (q13-q14 are preserved)
+ ; valid output registers are anything but q8-q11
+ MACRO
+ DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
+ ; TODO(cd): have special case to re-use constants when they are similar for
+ ; consecutive butterflies
+ ; TODO(cd): have special case when both constants are the same, do the
+ ; additions/subtractions before the multiplies.
+ ; generate the constants
+ ; generate scalar constants
+ mov r8, #$first_constant & 0xFF00
+ mov r12, #$second_constant & 0xFF00
+ add r8, #$first_constant & 0x00FF
+ add r12, #$second_constant & 0x00FF
+ ; generate vector constants
+ vdup.16 d30, r8
+ vdup.16 d31, r12
+ ; (used) two for inputs (regA-regD), one for constants (q15)
+ ; do some multiplications (ordered for maximum latency hiding)
+ vmull.s16 q8, $regC, d30
+ vmull.s16 q10, $regA, d31
+ vmull.s16 q9, $regD, d30
+ vmull.s16 q11, $regB, d31
+ vmull.s16 q12, $regC, d31
+ ; (used) five for intermediate (q8-q12), one for constants (q15)
+ ; do some additions/subtractions (to get back to two registers)
+ vsub.s32 q8, q8, q10
+ vsub.s32 q9, q9, q11
+ ; do more multiplications (ordered for maximum latency hiding)
+ vmull.s16 q10, $regD, d31
+ vmull.s16 q11, $regA, d30
+ vmull.s16 q15, $regB, d30
+ ; (used) six for intermediate (q8-q12, q15)
+ ; do more additions/subtractions
+ vadd.s32 q11, q12, q11
+ vadd.s32 q10, q10, q15
+ ; (used) four for intermediate (q8-q11)
+ ; dct_const_round_shift
+ vqrshrn.s32 $reg1, q8, #14
+ vqrshrn.s32 $reg2, q9, #14
+ vqrshrn.s32 $reg3, q11, #14
+ vqrshrn.s32 $reg4, q10, #14
+ ; (used) two q registers for the results, i.e. four d registers
+ MEND
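+ ; In C terms, DO_BUTTERFLY computes a standard butterfly (a sketch; c1/c2
+ ; are $first_constant/$second_constant, C is the q register formed by
+ ; $regC:$regD and A the one formed by $regA:$regB):
+ ;   $reg1:$reg2 = dct_const_round_shift(C * c1 - A * c2);
+ ;   $reg3:$reg4 = dct_const_round_shift(A * c1 + C * c2);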
+ ; --------------------------------------------------------------------------
+ ; Touches q8-q12, q15 (q13-q14 are preserved)
+ ; valid output registers are anything but q8-q11
+ MACRO
+ DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
+ DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
+ MEND
+ ; --------------------------------------------------------------------------
+
+;void vp9_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
+;
+; r0 int16_t *input,
+; r1 uint8_t *dest,
+; r2 int dest_stride)
+; loop counters
+; r4 bands loop counter
+; r5 pass loop counter
+; r8 transpose loop counter
+; combine-add pointers
+; r6 dest + 31 * dest_stride, descending (30, 29, 28, ...)
+; r7 dest + 0 * dest_stride, ascending (1, 2, 3, ...)
+; r9 dest + 15 * dest_stride, descending (14, 13, 12, ...)
+; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...)
+
+|vp9_idct32x32_1024_add_neon| PROC
+ ; This function does one pass of idct32x32 transform.
+ ;
+ ; This is done by transposing the input and then doing a 1d transform on
+ ; columns. In the first pass, the transposed columns are the original
+ ; rows. In the second pass, after the transposition, the columns are the
+ ; original columns.
+ ; The 1d transform is done by looping over bands of eight columns (the
+ ; idct32_bands loop). For each band, the transform input transposition
+ ; is done on demand, one band of four 8x8 matrices at a time. The four
+ ; matrices are transposed by pairs (the idct32_transpose_pair loop).
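+ ;
+ ; Rough loop structure (a C sketch of the counters initialized below;
+ ; the helper names are hypothetical):
+ ;   for (pass = 0; pass < 2; pass++)        /* r5 */
+ ;     for (band = 0; band < 4; band++) {    /* r4, 8 columns per band */
+ ;       for (pair = 0; pair < 2; pair++)    /* r8 */
+ ;         transpose_8x8_pair();
+ ;       idct32_1d_band();
+ ;     }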
+ push {r4-r11}
+ vpush {d8-d15}
+ ; stack operation
+ ; internal buffer used to transpose 8 lines into before transforming them
+ ; int16_t transpose_buffer[32 * 8];
+ ; at sp + [4096, 4607]
+ ; results of the first pass (transpose and transform rows)
+ ; int16_t pass1[32 * 32];
+ ; at sp + [0, 2047]
+ ; results of the second pass (transpose and transform columns)
+ ; int16_t pass2[32 * 32];
+ ; at sp + [2048, 4095]
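+ ; equivalently, as a C sketch of the scratch area carved out below:
+ ;   int16_t pass1[32 * 32];             ; at sp + 0
+ ;   int16_t pass2[32 * 32];             ; at sp + 2048
+ ;   int16_t transpose_buffer[32 * 8];   ; at sp + 4096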
+ sub sp, sp, #512+2048+2048
+
+ ; r6 = dest + 31 * dest_stride
+ ; r7 = dest + 0 * dest_stride
+ ; r9 = dest + 15 * dest_stride
+ ; r10 = dest + 16 * dest_stride
+ rsb r6, r2, r2, lsl #5
+ rsb r9, r2, r2, lsl #4
+ add r10, r1, r2, lsl #4
+ mov r7, r1
+ add r6, r6, r1
+ add r9, r9, r1
+ ; r11 = -dest_stride
+ neg r11, r2
+ ; r3 = input
+ mov r3, r0
+ ; parameters for first pass
+ ; r0 = transpose_buffer[32 * 8]
+ add r0, sp, #4096
+ ; r1 = pass1[32 * 32]
+ mov r1, sp
+
+ mov r5, #0 ; initialize pass loop counter
+idct32_pass_loop
+ mov r4, #4 ; initialize bands loop counter
+idct32_bands_loop
+ mov r8, #2 ; initialize transpose loop counter
+idct32_transpose_pair_loop
+ ; Load two horizontally consecutive 8x8 16-bit data matrices. The first
+ ; one into q8-q15 and the second one into q0-q7. There is a stride of 64,
+ ; adjusted to 32 because of the two post-increments.
+ vld1.s16 {q8}, [r3]!
+ vld1.s16 {q0}, [r3]!
+ add r3, #32
+ vld1.s16 {q9}, [r3]!
+ vld1.s16 {q1}, [r3]!
+ add r3, #32
+ vld1.s16 {q10}, [r3]!
+ vld1.s16 {q2}, [r3]!
+ add r3, #32
+ vld1.s16 {q11}, [r3]!
+ vld1.s16 {q3}, [r3]!
+ add r3, #32
+ vld1.s16 {q12}, [r3]!
+ vld1.s16 {q4}, [r3]!
+ add r3, #32
+ vld1.s16 {q13}, [r3]!
+ vld1.s16 {q5}, [r3]!
+ add r3, #32
+ vld1.s16 {q14}, [r3]!
+ vld1.s16 {q6}, [r3]!
+ add r3, #32
+ vld1.s16 {q15}, [r3]!
+ vld1.s16 {q7}, [r3]!
+
+ ; Transpose the two 8x8 16-bit data matrices.
+ vswp d17, d24
+ vswp d23, d30
+ vswp d21, d28
+ vswp d19, d26
+ vswp d1, d8
+ vswp d7, d14
+ vswp d5, d12
+ vswp d3, d10
+ vtrn.32 q8, q10
+ vtrn.32 q9, q11
+ vtrn.32 q12, q14
+ vtrn.32 q13, q15
+ vtrn.32 q0, q2
+ vtrn.32 q1, q3
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vtrn.16 q8, q9
+ vtrn.16 q10, q11
+ vtrn.16 q12, q13
+ vtrn.16 q14, q15
+ vtrn.16 q0, q1
+ vtrn.16 q2, q3
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+
+ ; Store both matrices one after the other. The stride of 32 is fully
+ ; covered by the post-increments, so no extra adjustment is needed.
+ vst1.16 {q8}, [r0]!
+ vst1.16 {q9}, [r0]!
+ vst1.16 {q10}, [r0]!
+ vst1.16 {q11}, [r0]!
+ vst1.16 {q12}, [r0]!
+ vst1.16 {q13}, [r0]!
+ vst1.16 {q14}, [r0]!
+ vst1.16 {q15}, [r0]!
+ vst1.16 {q0}, [r0]!
+ vst1.16 {q1}, [r0]!
+ vst1.16 {q2}, [r0]!
+ vst1.16 {q3}, [r0]!
+ vst1.16 {q4}, [r0]!
+ vst1.16 {q5}, [r0]!
+ vst1.16 {q6}, [r0]!
+ vst1.16 {q7}, [r0]!
+
+ ; increment pointers by adjusted stride (not necessary for r0/out)
+ ; go back by 7*32 for the seven lines fully advanced by the loads and adds
+ ; go back by 32 for the eighth line, which was only read
+ ; advance by 16*2 to go to the next pair
+ sub r3, r3, #7*32*2 + 32 - 16*2
+ ; transpose pair loop processing
+ subs r8, r8, #1
+ bne idct32_transpose_pair_loop
+
+ ; restore r0/input to its original value
+ sub r0, r0, #32*8*2
+
+ ; Instead of doing the transforms stage by stage, it is done by loading
+ ; some input values and doing as many stages as possible to minimize the
+ ; storing/loading of intermediate results. To fit within registers, the
+ ; final coefficients are cut into four blocks:
+ ; BLOCK A: 16-19,28-31
+ ; BLOCK B: 20-23,24-27
+ ; BLOCK C: 8-10,11-15
+ ; BLOCK D: 0-3,4-7
+ ; Blocks A and C are straight calculations through the various stages. In
+ ; block B, further calculations are performed using the results from
+ ; block A. In block D, further calculations are performed using the
+ ; results from block C, and the final calculations are then done using
+ ; the results from blocks A and B, which were combined at the end of
+ ; block B.
+
+ ; --------------------------------------------------------------------------
+ ; BLOCK A: 16-19,28-31
+ ; --------------------------------------------------------------------------
+ ; generate 16,17,30,31
+ ; --------------------------------------------------------------------------
+ ; part of stage 1
+ ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] * cospi_1_64;
+ ;temp2 = input[1 * 32] * cospi_1_64 + input[31 * 32] * cospi_31_64;
+ ;step1b[16][i] = dct_const_round_shift(temp1);
+ ;step1b[31][i] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 0, 1, 31
+ DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5
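+ ; For reference, dct_const_round_shift() in the C code is, with
+ ; DCT_CONST_BITS == 14:
+ ;   dct_const_round_shift(x) = (x + (1 << 13)) >> 14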
+ ; --------------------------------------------------------------------------
+ ; part of stage 1
+ ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64;
+ ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64;
+ ;step1b[17][i] = dct_const_round_shift(temp1);
+ ;step1b[30][i] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 31, 17, 15
+ DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7
+ ; --------------------------------------------------------------------------
+ ; part of stage 2
+ ;step2[16] = step1b[16][i] + step1b[17][i];
+ ;step2[17] = step1b[16][i] - step1b[17][i];
+ ;step2[30] = -step1b[30][i] + step1b[31][i];
+ ;step2[31] = step1b[30][i] + step1b[31][i];
+ vadd.s16 q4, q0, q1
+ vsub.s16 q13, q0, q1
+ vadd.s16 q6, q2, q3
+ vsub.s16 q14, q2, q3
+ ; --------------------------------------------------------------------------
+ ; part of stage 3
+ ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64;
+ ;temp2 = step1b[30][i] * cospi_4_64 + step1b[17][i] * cospi_28_64;
+ ;step3[17] = dct_const_round_shift(temp1);
+ ;step3[30] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15
+ ; --------------------------------------------------------------------------
+ ; generate 18,19,28,29
+ ; --------------------------------------------------------------------------
+ ; part of stage 1
+ ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64;
+ ;temp2 = input[9 * 32] * cospi_9_64 + input[23 * 32] * cospi_23_64;
+ ;step1b[18][i] = dct_const_round_shift(temp1);
+ ;step1b[29][i] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 15, 9, 23
+ DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5
+ ; --------------------------------------------------------------------------
+ ; part of stage 1
+ ;temp1 = input[25 * 32] * cospi_7_64 - input[7 * 32] * cospi_25_64;
+ ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64;
+ ;step1b[19][i] = dct_const_round_shift(temp1);
+ ;step1b[28][i] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 23, 25, 7
+ DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7
+ ; --------------------------------------------------------------------------
+ ; part of stage 2
+ ;step2[18] = -step1b[18][i] + step1b[19][i];
+ ;step2[19] = step1b[18][i] + step1b[19][i];
+ ;step2[28] = step1b[28][i] + step1b[29][i];
+ ;step2[29] = step1b[28][i] - step1b[29][i];
+ vsub.s16 q13, q3, q2
+ vadd.s16 q3, q3, q2
+ vsub.s16 q14, q1, q0
+ vadd.s16 q2, q1, q0
+ ; --------------------------------------------------------------------------
+ ; part of stage 3
+ ;temp1 = step1b[18][i] * (-cospi_4_64) - step1b[29][i] * (-cospi_28_64);
+ ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64);
+ ;step3[29] = dct_const_round_shift(temp1);
+ ;step3[18] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1
+ ; --------------------------------------------------------------------------
+ ; combine 16-19,28-31
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;step1[16] = step1b[16][i] + step1b[19][i];
+ ;step1[17] = step1b[17][i] + step1b[18][i];
+ ;step1[18] = step1b[17][i] - step1b[18][i];
+ ;step1[29] = step1b[30][i] - step1b[29][i];
+ ;step1[30] = step1b[30][i] + step1b[29][i];
+ ;step1[31] = step1b[31][i] + step1b[28][i];
+ vadd.s16 q8, q4, q2
+ vadd.s16 q9, q5, q0
+ vadd.s16 q10, q7, q1
+ vadd.s16 q15, q6, q3
+ vsub.s16 q13, q5, q0
+ vsub.s16 q14, q7, q1
+ STORE_IN_OUTPUT 0, 16, 31, q8, q15
+ STORE_IN_OUTPUT 31, 17, 30, q9, q10
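+ ; (STORE_IN_OUTPUT prev, first, second, qA, qB -- like LOAD_FROM_OUTPUT,
+ ; both defined earlier in this file -- takes the row index r1 currently
+ ; points at as 'prev', advances r1 by the row difference times 32*2
+ ; bytes for each access, and stores/loads rows 'first' and 'second',
+ ; leaving r1 at row 'second'. The pointer-restore arithmetic at the end
+ ; of the bands loop relies on this.)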
+ ; --------------------------------------------------------------------------
+ ; part of stage 5
+ ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64;
+ ;temp2 = step1b[29][i] * cospi_8_64 + step1b[18][i] * cospi_24_64;
+ ;step2[18] = dct_const_round_shift(temp1);
+ ;step2[29] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3
+ STORE_IN_OUTPUT 30, 29, 18, q1, q0
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;step1[19] = step1b[16][i] - step1b[19][i];
+ ;step1[28] = step1b[31][i] - step1b[28][i];
+ vsub.s16 q13, q4, q2
+ vsub.s16 q14, q6, q3
+ ; --------------------------------------------------------------------------
+ ; part of stage 5
+ ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64;
+ ;temp2 = step1b[28][i] * cospi_8_64 + step1b[19][i] * cospi_24_64;
+ ;step2[19] = dct_const_round_shift(temp1);
+ ;step2[28] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13
+ STORE_IN_OUTPUT 18, 19, 28, q4, q6
+ ; --------------------------------------------------------------------------
+
+
+ ; --------------------------------------------------------------------------
+ ; BLOCK B: 20-23,24-27
+ ; --------------------------------------------------------------------------
+ ; generate 20,21,26,27
+ ; --------------------------------------------------------------------------
+ ; part of stage 1
+ ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64;
+ ;temp2 = input[5 * 32] * cospi_5_64 + input[27 * 32] * cospi_27_64;
+ ;step1b[20][i] = dct_const_round_shift(temp1);
+ ;step1b[27][i] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 7, 5, 27
+ DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5
+ ; --------------------------------------------------------------------------
+ ; part of stage 1
+ ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64;
+ ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64;
+ ;step1b[21][i] = dct_const_round_shift(temp1);
+ ;step1b[26][i] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 27, 21, 11
+ DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7
+ ; --------------------------------------------------------------------------
+ ; part of stage 2
+ ;step2[20] = step1b[20][i] + step1b[21][i];
+ ;step2[21] = step1b[20][i] - step1b[21][i];
+ ;step2[26] = -step1b[26][i] + step1b[27][i];
+ ;step2[27] = step1b[26][i] + step1b[27][i];
+ vsub.s16 q13, q0, q1
+ vadd.s16 q0, q0, q1
+ vsub.s16 q14, q2, q3
+ vadd.s16 q2, q2, q3
+ ; --------------------------------------------------------------------------
+ ; part of stage 3
+ ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64;
+ ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64;
+ ;step3[21] = dct_const_round_shift(temp1);
+ ;step3[26] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
+ ; --------------------------------------------------------------------------
+ ; generate 22,23,24,25
+ ; --------------------------------------------------------------------------
+ ; part of stage 1
+ ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64;
+ ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64;
+ ;step1b[22][i] = dct_const_round_shift(temp1);
+ ;step1b[25][i] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 11, 13, 19
+ DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15
+ ; --------------------------------------------------------------------------
+ ; part of stage 1
+ ;temp1 = input[29 * 32] * cospi_3_64 - input[3 * 32] * cospi_29_64;
+ ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64;
+ ;step1b[23][i] = dct_const_round_shift(temp1);
+ ;step1b[24][i] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 19, 29, 3
+ DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13
+ ; --------------------------------------------------------------------------
+ ; part of stage 2
+ ;step2[22] = -step1b[22][i] + step1b[23][i];
+ ;step2[23] = step1b[22][i] + step1b[23][i];
+ ;step2[24] = step1b[24][i] + step1b[25][i];
+ ;step2[25] = step1b[24][i] - step1b[25][i];
+ vsub.s16 q14, q4, q5
+ vadd.s16 q5, q4, q5
+ vsub.s16 q13, q6, q7
+ vadd.s16 q6, q6, q7
+ ; --------------------------------------------------------------------------
+ ; part of stage 3
+ ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64);
+ ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64);
+ ;step3[25] = dct_const_round_shift(temp1);
+ ;step3[22] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15
+ ; --------------------------------------------------------------------------
+ ; combine 20-23,24-27
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;step1[22] = step1b[22][i] + step1b[21][i];
+ ;step1[23] = step1b[23][i] + step1b[20][i];
+ vadd.s16 q10, q7, q1
+ vadd.s16 q11, q5, q0
+ ;step1[24] = step1b[24][i] + step1b[27][i];
+ ;step1[25] = step1b[25][i] + step1b[26][i];
+ vadd.s16 q12, q6, q2
+ vadd.s16 q15, q4, q3
+ ; --------------------------------------------------------------------------
+ ; part of stage 6
+ ;step3[16] = step1b[16][i] + step1b[23][i];
+ ;step3[17] = step1b[17][i] + step1b[22][i];
+ ;step3[22] = step1b[17][i] - step1b[22][i];
+ ;step3[23] = step1b[16][i] - step1b[23][i];
+ LOAD_FROM_OUTPUT 28, 16, 17, q14, q13
+ vadd.s16 q8, q14, q11
+ vadd.s16 q9, q13, q10
+ vsub.s16 q13, q13, q10
+ vsub.s16 q11, q14, q11
+ STORE_IN_OUTPUT 17, 17, 16, q9, q8
+ ; --------------------------------------------------------------------------
+ ; part of stage 6
+ ;step3[24] = step1b[31][i] - step1b[24][i];
+ ;step3[25] = step1b[30][i] - step1b[25][i];
+ ;step3[30] = step1b[30][i] + step1b[25][i];
+ ;step3[31] = step1b[31][i] + step1b[24][i];
+ LOAD_FROM_OUTPUT 16, 30, 31, q14, q9
+ vsub.s16 q8, q9, q12
+ vadd.s16 q10, q14, q15
+ vsub.s16 q14, q14, q15
+ vadd.s16 q12, q9, q12
+ STORE_IN_OUTPUT 31, 30, 31, q10, q12
+ ; --------------------------------------------------------------------------
+ ; TODO(cd) do some register allocation change to remove these push/pop
+ vpush {q8} ; [24]
+ vpush {q11} ; [23]
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64;
+ ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64;
+ ;step1[22] = dct_const_round_shift(temp1);
+ ;step1[25] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
+ STORE_IN_OUTPUT 31, 25, 22, q14, q13
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64;
+ ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64;
+ ;step1[23] = dct_const_round_shift(temp1);
+ ;step1[24] = dct_const_round_shift(temp2);
+ ; TODO(cd) do some register allocation change to remove these push/pop
+ vpop {q13} ; [23]
+ vpop {q14} ; [24]
+ DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
+ STORE_IN_OUTPUT 22, 24, 23, q14, q13
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;step1[20] = step1b[23][i] - step1b[20][i];
+ ;step1[27] = step1b[24][i] - step1b[27][i];
+ vsub.s16 q14, q5, q0
+ vsub.s16 q13, q6, q2
+ ; --------------------------------------------------------------------------
+ ; part of stage 5
+ ;temp1 = step1b[20][i] * (-cospi_8_64) - step1b[27][i] * (-cospi_24_64);
+ ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64);
+ ;step2[27] = dct_const_round_shift(temp1);
+ ;step2[20] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;step1[21] = step1b[22][i] - step1b[21][i];
+ ;step1[26] = step1b[25][i] - step1b[26][i];
+ vsub.s16 q14, q7, q1
+ vsub.s16 q13, q4, q3
+ ; --------------------------------------------------------------------------
+ ; part of stage 5
+ ;temp1 = step1b[21][i] * (-cospi_8_64) - step1b[26][i] * (-cospi_24_64);
+ ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64);
+ ;step2[26] = dct_const_round_shift(temp1);
+ ;step2[21] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3
+ ; --------------------------------------------------------------------------
+ ; part of stage 6
+ ;step3[18] = step1b[18][i] + step1b[21][i];
+ ;step3[19] = step1b[19][i] + step1b[20][i];
+ ;step3[20] = step1b[19][i] - step1b[20][i];
+ ;step3[21] = step1b[18][i] - step1b[21][i];
+ LOAD_FROM_OUTPUT 23, 18, 19, q14, q13
+ vadd.s16 q8, q14, q1
+ vadd.s16 q9, q13, q6
+ vsub.s16 q13, q13, q6
+ vsub.s16 q1, q14, q1
+ STORE_IN_OUTPUT 19, 18, 19, q8, q9
+ ; --------------------------------------------------------------------------
+ ; part of stage 6
+ ;step3[27] = step1b[28][i] - step1b[27][i];
+ ;step3[28] = step1b[28][i] + step1b[27][i];
+ ;step3[29] = step1b[29][i] + step1b[26][i];
+ ;step3[26] = step1b[29][i] - step1b[26][i];
+ LOAD_FROM_OUTPUT 19, 28, 29, q8, q9
+ vsub.s16 q14, q8, q5
+ vadd.s16 q10, q8, q5
+ vadd.s16 q11, q9, q0
+ vsub.s16 q0, q9, q0
+ STORE_IN_OUTPUT 29, 28, 29, q10, q11
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64;
+ ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64;
+ ;step1[20] = dct_const_round_shift(temp1);
+ ;step1[27] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
+ STORE_IN_OUTPUT 29, 20, 27, q13, q14
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64;
+ ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64;
+ ;step1[21] = dct_const_round_shift(temp1);
+ ;step1[26] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1
+ STORE_IN_OUTPUT 27, 21, 26, q1, q0
+ ; --------------------------------------------------------------------------
+
+
+ ; --------------------------------------------------------------------------
+ ; BLOCK C: 8-11,12-15
+ ; --------------------------------------------------------------------------
+ ; generate 8,9,14,15
+ ; --------------------------------------------------------------------------
+ ; part of stage 2
+ ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64;
+ ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64;
+ ;step2[8] = dct_const_round_shift(temp1);
+ ;step2[15] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 3, 2, 30
+ DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5
+ ; --------------------------------------------------------------------------
+ ; part of stage 2
+ ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64;
+ ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64;
+ ;step2[9] = dct_const_round_shift(temp1);
+ ;step2[14] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 30, 18, 14
+ DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7
+ ; --------------------------------------------------------------------------
+ ; part of stage 3
+ ;step3[8] = step1b[8][i] + step1b[9][i];
+ ;step3[9] = step1b[8][i] - step1b[9][i];
+ ;step3[14] = step1b[15][i] - step1b[14][i];
+ ;step3[15] = step1b[15][i] + step1b[14][i];
+ vsub.s16 q13, q0, q1
+ vadd.s16 q0, q0, q1
+ vsub.s16 q14, q2, q3
+ vadd.s16 q2, q2, q3
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64;
+ ;temp2 = step1b[14][i] * cospi_8_64 + step1b[9][i] * cospi_24_64;
+ ;step1[9] = dct_const_round_shift(temp1);
+ ;step1[14] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7
+ ; --------------------------------------------------------------------------
+ ; generate 10,11,12,13
+ ; --------------------------------------------------------------------------
+ ; part of stage 2
+ ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64;
+ ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64;
+ ;step2[10] = dct_const_round_shift(temp1);
+ ;step2[13] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 14, 10, 22
+ DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15
+ ; --------------------------------------------------------------------------
+ ; part of stage 2
+ ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64;
+ ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64;
+ ;step2[11] = dct_const_round_shift(temp1);
+ ;step2[12] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 22, 26, 6
+ DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13
+ ; --------------------------------------------------------------------------
+ ; part of stage 3
+ ;step3[10] = step1b[11][i] - step1b[10][i];
+ ;step3[11] = step1b[11][i] + step1b[10][i];
+ ;step3[12] = step1b[12][i] + step1b[13][i];
+ ;step3[13] = step1b[12][i] - step1b[13][i];
+ vsub.s16 q14, q4, q5
+ vadd.s16 q5, q4, q5
+ vsub.s16 q13, q6, q7
+ vadd.s16 q6, q6, q7
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;temp1 = step1b[10][i] * (-cospi_8_64) - step1b[13][i] * (-cospi_24_64);
+ ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64);
+ ;step1[13] = dct_const_round_shift(temp1);
+ ;step1[10] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15
+ ; --------------------------------------------------------------------------
+ ; combine 8-11,12-15
+ ; --------------------------------------------------------------------------
+ ; part of stage 5
+ ;step2[8] = step1b[8][i] + step1b[11][i];
+ ;step2[9] = step1b[9][i] + step1b[10][i];
+ ;step2[10] = step1b[9][i] - step1b[10][i];
+ vadd.s16 q8, q0, q5
+ vadd.s16 q9, q1, q7
+ vsub.s16 q13, q1, q7
+ ;step2[13] = step1b[14][i] - step1b[13][i];
+ ;step2[14] = step1b[14][i] + step1b[13][i];
+ ;step2[15] = step1b[15][i] + step1b[12][i];
+ vsub.s16 q14, q3, q4
+ vadd.s16 q10, q3, q4
+ vadd.s16 q15, q2, q6
+ STORE_IN_OUTPUT 26, 8, 15, q8, q15
+ STORE_IN_OUTPUT 15, 9, 14, q9, q10
+ ; --------------------------------------------------------------------------
+ ; part of stage 6
+ ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64;
+ ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64;
+ ;step3[10] = dct_const_round_shift(temp1);
+ ;step3[13] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
+ STORE_IN_OUTPUT 14, 13, 10, q3, q1
+ ; --------------------------------------------------------------------------
+ ; part of stage 5
+ ;step2[11] = step1b[8][i] - step1b[11][i];
+ ;step2[12] = step1b[15][i] - step1b[12][i];
+ vsub.s16 q13, q0, q5
+ vsub.s16 q14, q2, q6
+ ; --------------------------------------------------------------------------
+ ; part of stage 6
+ ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64;
+ ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64;
+ ;step3[11] = dct_const_round_shift(temp1);
+ ;step3[12] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
+ STORE_IN_OUTPUT 10, 11, 12, q1, q3
+ ; --------------------------------------------------------------------------
+
+
+ ; --------------------------------------------------------------------------
+ ; BLOCK D: 0-3,4-7
+ ; --------------------------------------------------------------------------
+ ; generate 4,5,6,7
+ ; --------------------------------------------------------------------------
+ ; part of stage 3
+ ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64;
+ ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64;
+ ;step3[4] = dct_const_round_shift(temp1);
+ ;step3[7] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 6, 4, 28
+ DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5
+ ; --------------------------------------------------------------------------
+ ; part of stage 3
+ ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64;
+ ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64;
+ ;step3[5] = dct_const_round_shift(temp1);
+ ;step3[6] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 28, 20, 12
+ DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;step1[4] = step1b[4][i] + step1b[5][i];
+ ;step1[5] = step1b[4][i] - step1b[5][i];
+ ;step1[6] = step1b[7][i] - step1b[6][i];
+ ;step1[7] = step1b[7][i] + step1b[6][i];
+ vsub.s16 q13, q0, q1
+ vadd.s16 q0, q0, q1
+ vsub.s16 q14, q2, q3
+ vadd.s16 q2, q2, q3
+ ; --------------------------------------------------------------------------
+ ; part of stage 5
+ ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64;
+ ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64;
+ ;step2[5] = dct_const_round_shift(temp1);
+ ;step2[6] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
+ ; --------------------------------------------------------------------------
+ ; generate 0,1,2,3
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64;
+ ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64;
+ ;step1[1] = dct_const_round_shift(temp1);
+ ;step1[0] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 12, 0, 16
+ DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64;
+ ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64;
+ ;step1[2] = dct_const_round_shift(temp1);
+ ;step1[3] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 16, 8, 24
+ DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13
+ ; --------------------------------------------------------------------------
+ ; part of stage 5
+ ;step2[0] = step1b[0][i] + step1b[3][i];
+ ;step2[1] = step1b[1][i] + step1b[2][i];
+ ;step2[2] = step1b[1][i] - step1b[2][i];
+ ;step2[3] = step1b[0][i] - step1b[3][i];
+ vadd.s16 q4, q7, q6
+ vsub.s16 q7, q7, q6
+ vsub.s16 q6, q5, q14
+ vadd.s16 q5, q5, q14
+ ; --------------------------------------------------------------------------
+ ; combine 0-3,4-7
+ ; --------------------------------------------------------------------------
+ ; part of stage 6
+ ;step3[0] = step1b[0][i] + step1b[7][i];
+ ;step3[1] = step1b[1][i] + step1b[6][i];
+ ;step3[2] = step1b[2][i] + step1b[5][i];
+ ;step3[3] = step1b[3][i] + step1b[4][i];
+ vadd.s16 q8, q4, q2
+ vadd.s16 q9, q5, q3
+ vadd.s16 q10, q6, q1
+ vadd.s16 q11, q7, q0
+ ;step3[4] = step1b[3][i] - step1b[4][i];
+ ;step3[5] = step1b[2][i] - step1b[5][i];
+ ;step3[6] = step1b[1][i] - step1b[6][i];
+ ;step3[7] = step1b[0][i] - step1b[7][i];
+ vsub.s16 q12, q7, q0
+ vsub.s16 q13, q6, q1
+ vsub.s16 q14, q5, q3
+ vsub.s16 q15, q4, q2
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;step1[0] = step1b[0][i] + step1b[15][i];
+ ;step1[1] = step1b[1][i] + step1b[14][i];
+ ;step1[14] = step1b[1][i] - step1b[14][i];
+ ;step1[15] = step1b[0][i] - step1b[15][i];
+ LOAD_FROM_OUTPUT 12, 14, 15, q0, q1
+ vadd.s16 q2, q8, q1
+ vadd.s16 q3, q9, q0
+ vsub.s16 q4, q9, q0
+ vsub.s16 q5, q8, q1
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[14 * 32] = step1b[14][i] + step1b[17][i];
+ ;output[15 * 32] = step1b[15][i] + step1b[16][i];
+ ;output[16 * 32] = step1b[15][i] - step1b[16][i];
+ ;output[17 * 32] = step1b[14][i] - step1b[17][i];
+ LOAD_FROM_OUTPUT 15, 16, 17, q0, q1
+ vadd.s16 q8, q4, q1
+ vadd.s16 q9, q5, q0
+ vsub.s16 q6, q5, q0
+ vsub.s16 q7, q4, q1
+
+ cmp r5, #0
+ bgt idct32_bands_end_2nd_pass
+
+idct32_bands_end_1st_pass
+ STORE_IN_OUTPUT 17, 16, 17, q6, q7
+ STORE_IN_OUTPUT 17, 14, 15, q8, q9
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
+ ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
+ ;output[30 * 32] = step1b[1][i] - step1b[30][i];
+ ;output[31 * 32] = step1b[0][i] - step1b[31][i];
+ LOAD_FROM_OUTPUT 15, 30, 31, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_IN_OUTPUT 31, 30, 31, q6, q7
+ STORE_IN_OUTPUT 31, 0, 1, q4, q5
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;step1[2] = step1b[2][i] + step1b[13][i];
+ ;step1[3] = step1b[3][i] + step1b[12][i];
+ ;step1[12] = step1b[3][i] - step1b[12][i];
+ ;step1[13] = step1b[2][i] - step1b[13][i];
+ LOAD_FROM_OUTPUT 1, 12, 13, q0, q1
+ vadd.s16 q2, q10, q1
+ vadd.s16 q3, q11, q0
+ vsub.s16 q4, q11, q0
+ vsub.s16 q5, q10, q1
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[12 * 32] = step1b[12][i] + step1b[19][i];
+ ;output[13 * 32] = step1b[13][i] + step1b[18][i];
+ ;output[18 * 32] = step1b[13][i] - step1b[18][i];
+ ;output[19 * 32] = step1b[12][i] - step1b[19][i];
+ LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
+ vadd.s16 q8, q4, q1
+ vadd.s16 q9, q5, q0
+ vsub.s16 q6, q5, q0
+ vsub.s16 q7, q4, q1
+ STORE_IN_OUTPUT 19, 18, 19, q6, q7
+ STORE_IN_OUTPUT 19, 12, 13, q8, q9
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
+ ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
+ ;output[28 * 32] = step1b[3][i] - step1b[28][i];
+ ;output[29 * 32] = step1b[2][i] - step1b[29][i];
+ LOAD_FROM_OUTPUT 13, 28, 29, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_IN_OUTPUT 29, 28, 29, q6, q7
+ STORE_IN_OUTPUT 29, 2, 3, q4, q5
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;step1[4] = step1b[4][i] + step1b[11][i];
+ ;step1[5] = step1b[5][i] + step1b[10][i];
+ ;step1[10] = step1b[5][i] - step1b[10][i];
+ ;step1[11] = step1b[4][i] - step1b[11][i];
+ LOAD_FROM_OUTPUT 3, 10, 11, q0, q1
+ vadd.s16 q2, q12, q1
+ vadd.s16 q3, q13, q0
+ vsub.s16 q4, q13, q0
+ vsub.s16 q5, q12, q1
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[10 * 32] = step1b[10][i] + step1b[21][i];
+ ;output[11 * 32] = step1b[11][i] + step1b[20][i];
+ ;output[20 * 32] = step1b[11][i] - step1b[20][i];
+ ;output[21 * 32] = step1b[10][i] - step1b[21][i];
+ LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
+ vadd.s16 q8, q4, q1
+ vadd.s16 q9, q5, q0
+ vsub.s16 q6, q5, q0
+ vsub.s16 q7, q4, q1
+ STORE_IN_OUTPUT 21, 20, 21, q6, q7
+ STORE_IN_OUTPUT 21, 10, 11, q8, q9
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
+ ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
+ ;output[26 * 32] = step1b[5][i] - step1b[26][i];
+ ;output[27 * 32] = step1b[4][i] - step1b[27][i];
+ LOAD_FROM_OUTPUT 11, 26, 27, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_IN_OUTPUT 27, 26, 27, q6, q7
+ STORE_IN_OUTPUT 27, 4, 5, q4, q5
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;step1[6] = step1b[6][i] + step1b[9][i];
+ ;step1[7] = step1b[7][i] + step1b[8][i];
+ ;step1[8] = step1b[7][i] - step1b[8][i];
+ ;step1[9] = step1b[6][i] - step1b[9][i];
+ LOAD_FROM_OUTPUT 5, 8, 9, q0, q1
+ vadd.s16 q2, q14, q1
+ vadd.s16 q3, q15, q0
+ vsub.s16 q4, q15, q0
+ vsub.s16 q5, q14, q1
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
+ ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
+ ;output[22 * 32] = step1b[9][i] - step1b[22][i];
+ ;output[23 * 32] = step1b[8][i] - step1b[23][i];
+ LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
+ vadd.s16 q8, q4, q1
+ vadd.s16 q9, q5, q0
+ vsub.s16 q6, q5, q0
+ vsub.s16 q7, q4, q1
+ STORE_IN_OUTPUT 23, 22, 23, q6, q7
+ STORE_IN_OUTPUT 23, 8, 9, q8, q9
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
+ ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
+ ;output[24 * 32] = step1b[7][i] - step1b[24][i];
+ ;output[25 * 32] = step1b[6][i] - step1b[25][i];
+ LOAD_FROM_OUTPUT 9, 24, 25, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_IN_OUTPUT 25, 24, 25, q6, q7
+ STORE_IN_OUTPUT 25, 6, 7, q4, q5
+
+ ; restore r0 by removing the last offset from the last
+ ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
+ sub r0, r0, #24*8*2
+ ; restore r1 by removing the last offset from the last
+ ; operation (STORE_IN_OUTPUT 25, 6, 7) => 7*32*2
+ ; advance by 8 columns => 8*2
+ sub r1, r1, #7*32*2 - 8*2
+ ; advance by 8 lines (8*32*2)
+ ; go back by the two pairs from the loop (32*2)
+ add r3, r3, #8*32*2 - 32*2
+
+ ; bands loop processing
+ subs r4, r4, #1
+ bne idct32_bands_loop
+
+ ; parameters for second pass
+ ; The input of pass 2 is the result of pass 1. We have to remove the
+ ; offset of 32 columns induced by the above idct32_bands_loop.
+ sub r3, r1, #32*2
+ ; r1 = pass2[32 * 32]
+ add r1, sp, #2048
+
+ ; pass loop processing
+ add r5, r5, #1
+ b idct32_pass_loop
+
+idct32_bands_end_2nd_pass
+ STORE_COMBINE_CENTER_RESULTS
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
+ ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
+ ;output[30 * 32] = step1b[1][i] - step1b[30][i];
+ ;output[31 * 32] = step1b[0][i] - step1b[31][i];
+ LOAD_FROM_OUTPUT 17, 30, 31, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_COMBINE_EXTREME_RESULTS
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;step1[2] = step1b[2][i] + step1b[13][i];
+ ;step1[3] = step1b[3][i] + step1b[12][i];
+ ;step1[12] = step1b[3][i] - step1b[12][i];
+ ;step1[13] = step1b[2][i] - step1b[13][i];
+ LOAD_FROM_OUTPUT 31, 12, 13, q0, q1
+ vadd.s16 q2, q10, q1
+ vadd.s16 q3, q11, q0
+ vsub.s16 q4, q11, q0
+ vsub.s16 q5, q10, q1
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[12 * 32] = step1b[12][i] + step1b[19][i];
+ ;output[13 * 32] = step1b[13][i] + step1b[18][i];
+ ;output[18 * 32] = step1b[13][i] - step1b[18][i];
+ ;output[19 * 32] = step1b[12][i] - step1b[19][i];
+ LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
+ vadd.s16 q8, q4, q1
+ vadd.s16 q9, q5, q0
+ vsub.s16 q6, q5, q0
+ vsub.s16 q7, q4, q1
+ STORE_COMBINE_CENTER_RESULTS
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
+ ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
+ ;output[28 * 32] = step1b[3][i] - step1b[28][i];
+ ;output[29 * 32] = step1b[2][i] - step1b[29][i];
+ LOAD_FROM_OUTPUT 19, 28, 29, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_COMBINE_EXTREME_RESULTS
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;step1[4] = step1b[4][i] + step1b[11][i];
+ ;step1[5] = step1b[5][i] + step1b[10][i];
+ ;step1[10] = step1b[5][i] - step1b[10][i];
+ ;step1[11] = step1b[4][i] - step1b[11][i];
+ LOAD_FROM_OUTPUT 29, 10, 11, q0, q1
+ vadd.s16 q2, q12, q1
+ vadd.s16 q3, q13, q0
+ vsub.s16 q4, q13, q0
+ vsub.s16 q5, q12, q1
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[10 * 32] = step1b[10][i] + step1b[21][i];
+ ;output[11 * 32] = step1b[11][i] + step1b[20][i];
+ ;output[20 * 32] = step1b[11][i] - step1b[20][i];
+ ;output[21 * 32] = step1b[10][i] - step1b[21][i];
+ LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
+ vadd.s16 q8, q4, q1
+ vadd.s16 q9, q5, q0
+ vsub.s16 q6, q5, q0
+ vsub.s16 q7, q4, q1
+ STORE_COMBINE_CENTER_RESULTS
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
+ ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
+ ;output[26 * 32] = step1b[5][i] - step1b[26][i];
+ ;output[27 * 32] = step1b[4][i] - step1b[27][i];
+ LOAD_FROM_OUTPUT 21, 26, 27, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_COMBINE_EXTREME_RESULTS
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;step1[6] = step1b[6][i] + step1b[9][i];
+ ;step1[7] = step1b[7][i] + step1b[8][i];
+ ;step1[8] = step1b[7][i] - step1b[8][i];
+ ;step1[9] = step1b[6][i] - step1b[9][i];
+ LOAD_FROM_OUTPUT 27, 8, 9, q0, q1
+ vadd.s16 q2, q14, q1
+ vadd.s16 q3, q15, q0
+ vsub.s16 q4, q15, q0
+ vsub.s16 q5, q14, q1
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
+ ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
+ ;output[22 * 32] = step1b[9][i] - step1b[22][i];
+ ;output[23 * 32] = step1b[8][i] - step1b[23][i];
+ LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
+ vadd.s16 q8, q4, q1
+ vadd.s16 q9, q5, q0
+ vsub.s16 q6, q5, q0
+ vsub.s16 q7, q4, q1
+ STORE_COMBINE_CENTER_RESULTS_LAST
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
+ ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
+ ;output[24 * 32] = step1b[7][i] - step1b[24][i];
+ ;output[25 * 32] = step1b[6][i] - step1b[25][i];
+ LOAD_FROM_OUTPUT 23, 24, 25, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_COMBINE_EXTREME_RESULTS_LAST
+ ; --------------------------------------------------------------------------
+ ; Restore the dest pointers to their initial positions for the next band
+ ; by removing/adding dest_stride * 8. The actual advance by eight lines
+ ; is taken care of within the _LAST macros.
+ add r6, r6, r2, lsl #3
+ add r9, r9, r2, lsl #3
+ sub r7, r7, r2, lsl #3
+ sub r10, r10, r2, lsl #3
+
+ ; restore r0 by removing the last offset from the last
+ ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
+ sub r0, r0, #24*8*2
+ ; restore r1 by removing the last offset from the last
+ ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2
+ ; advance by 8 columns => 8*2
+ sub r1, r1, #25*32*2 - 8*2
+ ; advance by 8 lines (8*32*2)
+ ; go back by the two pairs from the loop (32*2)
+ add r3, r3, #8*32*2 - 32*2
+
+ ; bands loop processing
+ subs r4, r4, #1
+ bne idct32_bands_loop
+
+ ; release the stack buffers
+ add sp, sp, #512+2048+2048
+ vpop {d8-d15}
+ pop {r4-r11}
+ bx lr
+ ENDP ; |vp9_idct32x32_1024_add_neon|
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm
@@ -1,0 +1,68 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_idct4x4_1_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp9_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
+; int dest_stride)
+;
+; r0 int16_t input
+; r1 uint8_t *dest
+; r2 int dest_stride)
+
+|vp9_idct4x4_1_add_neon| PROC
+ ldrsh r0, [r0]
+
+ ; generate cospi_16_64 = 11585
+ mov r12, #0x2d00
+ add r12, #0x41
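+ ; (A mov/add pair is used because a classic ARM data-processing
+ ; immediate is an 8-bit value with an even rotation, so a 16-bit
+ ; constant such as 0x2d41 cannot be encoded in one instruction.)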
+
+ ; out = dct_const_round_shift(input[0] * cospi_16_64)
+ mul r0, r0, r12 ; input[0] * cospi_16_64
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; out = dct_const_round_shift(out * cospi_16_64)
+ mul r0, r0, r12 ; out * cospi_16_64
+ mov r12, r1 ; save dest
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; a1 = ROUND_POWER_OF_TWO(out, 4)
+ add r0, r0, #8 ; + (1 <<((4) - 1))
+ asr r0, r0, #4 ; >> 4
+
+ vdup.s16 q0, r0 ; duplicate a1
+
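+ ; The remainder implements, in C terms (a sketch matching the comments
+ ; above, not the reference code itself):
+ ;   for (i = 0; i < 4; i++)
+ ;     for (j = 0; j < 4; j++)
+ ;       dest[i * dest_stride + j] =
+ ;           clip_pixel(dest[i * dest_stride + j] + a1);
+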
+ vld1.32 {d2[0]}, [r1], r2
+ vld1.32 {d2[1]}, [r1], r2
+ vld1.32 {d4[0]}, [r1], r2
+ vld1.32 {d4[1]}, [r1]
+
+ vaddw.u8 q8, q0, d2 ; dest[x] + a1
+ vaddw.u8 q9, q0, d4 ; dest[x] + a1
+
+ vqmovun.s16 d6, q8 ; clip_pixel
+ vqmovun.s16 d7, q9 ; clip_pixel
+
+ vst1.32 {d6[0]}, [r12], r2
+ vst1.32 {d6[1]}, [r12], r2
+ vst1.32 {d7[0]}, [r12], r2
+ vst1.32 {d7[1]}, [r12]
+
+ bx lr
+ ENDP ; |vp9_idct4x4_1_add_neon|
+
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_idct4x4_add_neon.asm
@@ -1,0 +1,190 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp9_idct4x4_16_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void vp9_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;
+; r0 int16_t input
+; r1 uint8_t *dest
+; r2 int dest_stride)
+
+|vp9_idct4x4_16_add_neon| PROC
+
+ ; The 2D transform is done with two passes which are actually pretty
+ ; similar. We first transform the rows. This is done by transposing
+ ; the inputs, doing a SIMD column transform (the columns are the
+ ; transposed rows) and then transposing the results (so that they go
+ ; back to their normal/row positions). Then, we transform the columns
+ ; by doing another SIMD column transform.
+ ; So, each pass is a transpose followed by a column transform.
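+ ;
+ ; In C terms the flow is roughly (illustrative pseudocode; the helper
+ ; names are not from the reference implementation):
+ ;   transpose4x4(data); idct4_columns(data); // pass 1: the rows
+ ;   transpose4x4(data); idct4_columns(data); // pass 2: the columns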
+
+ ; load the inputs into q8-q9, d16-d19
+ vld1.s16 {q8,q9}, [r0]!
+
+ ; generate scalar constants
+ ; cospi_8_64 = 15137 = 0x3b21
+ mov r0, #0x3b00
+ add r0, #0x21
+ ; cospi_16_64 = 11585 = 0x2d41
+ mov r3, #0x2d00
+ add r3, #0x41
+ ; cospi_24_64 = 6270 = 0x187e
+ mov r12, #0x1800
+ add r12, #0x7e
+
+ ; transpose the input data
+ ; 00 01 02 03 d16
+ ; 10 11 12 13 d17
+ ; 20 21 22 23 d18
+ ; 30 31 32 33 d19
+ vtrn.16 d16, d17
+ vtrn.16 d18, d19
+
+ ; generate constant vectors
+ vdup.16 d20, r0 ; replicate cospi_8_64
+ vdup.16 d21, r3 ; replicate cospi_16_64
+
+ ; 00 10 02 12 d16
+ ; 01 11 03 13 d17
+ ; 20 30 22 32 d18
+ ; 21 31 23 33 d19
+ vtrn.32 q8, q9
+ ; 00 10 20 30 d16
+ ; 01 11 21 31 d17
+ ; 02 12 22 32 d18
+ ; 03 13 23 33 d19
+
+ vdup.16 d22, r12 ; replicate cospi_24_64
+
+ ; do the transform on transposed rows
+
+ ; stage 1
+ vadd.s16 d23, d16, d18 ; (input[0] + input[2])
+ vsub.s16 d24, d16, d18 ; (input[0] - input[2])
+
+ vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64
+ vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64
+
+ ; (input[0] + input[2]) * cospi_16_64;
+ ; (input[0] - input[2]) * cospi_16_64;
+ vmull.s16 q13, d23, d21
+ vmull.s16 q14, d24, d21
+
+ ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ ; input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ vmlsl.s16 q15, d19, d20
+ vmlal.s16 q1, d19, d22
+
+ ; dct_const_round_shift
+ vqrshrn.s32 d26, q13, #14
+ vqrshrn.s32 d27, q14, #14
+ vqrshrn.s32 d29, q15, #14
+ vqrshrn.s32 d28, q1, #14
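+ ; (vqrshrn.s32 ..., #14 performs the add of 1 << 13 and the arithmetic
+ ; shift right by 14 of dct_const_round_shift in a single
+ ; saturating-narrow instruction.)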
+
+ ; stage 2
+ ; output[0] = step[0] + step[3];
+ ; output[1] = step[1] + step[2];
+ ; output[3] = step[0] - step[3];
+ ; output[2] = step[1] - step[2];
+ vadd.s16 q8, q13, q14
+ vsub.s16 q9, q13, q14
+ vswp d18, d19
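+ ; (d28/d29 hold step[3]/step[2], so the vsub leaves q9 as
+ ; {output[3], output[2]}; the vswp restores the {output[2], output[3]}
+ ; order.)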
+
+ ; transpose the results
+ ; 00 01 02 03 d16
+ ; 10 11 12 13 d17
+ ; 20 21 22 23 d18
+ ; 30 31 32 33 d19
+ vtrn.16 d16, d17
+ vtrn.16 d18, d19
+ ; 00 10 02 12 d16
+ ; 01 11 03 13 d17
+ ; 20 30 22 32 d18
+ ; 21 31 23 33 d19
+ vtrn.32 q8, q9
+ ; 00 10 20 30 d16
+ ; 01 11 21 31 d17
+ ; 02 12 22 32 d18
+ ; 03 13 23 33 d19
+
+ ; do the transform on columns
+
+ ; stage 1
+ vadd.s16 d23, d16, d18 ; (input[0] + input[2])
+ vsub.s16 d24, d16, d18 ; (input[0] - input[2])
+
+ vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64
+ vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64
+
+ ; (input[0] + input[2]) * cospi_16_64;
+ ; (input[0] - input[2]) * cospi_16_64;
+ vmull.s16 q13, d23, d21
+ vmull.s16 q14, d24, d21
+
+ ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ ; input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ vmlsl.s16 q15, d19, d20
+ vmlal.s16 q1, d19, d22
+
+ ; dct_const_round_shift
+ vqrshrn.s32 d26, q13, #14
+ vqrshrn.s32 d27, q14, #14
+ vqrshrn.s32 d29, q15, #14
+ vqrshrn.s32 d28, q1, #14
+
+ ; stage 2
+ ; output[0] = step[0] + step[3];
+ ; output[1] = step[1] + step[2];
+ ; output[3] = step[0] - step[3];
+ ; output[2] = step[1] - step[2];
+ vadd.s16 q8, q13, q14
+ vsub.s16 q9, q13, q14
+
+ ; The results are in two registers, one of them with its halves swapped.
+ ; This is taken care of by loading the 'dest' values in the same swapped
+ ; fashion and also storing them back in that swapped fashion.
+ ; temp_out[0, 1] = d16, d17 = q8
+ ; temp_out[2, 3] = d19, d18 = q9 swapped
+
+ ; ROUND_POWER_OF_TWO(temp_out[j], 4)
+ vrshr.s16 q8, q8, #4
+ vrshr.s16 q9, q9, #4
+
+ vld1.32 {d26[0]}, [r1], r2
+ vld1.32 {d26[1]}, [r1], r2
+ vld1.32 {d27[1]}, [r1], r2
+ vld1.32 {d27[0]}, [r1] ; no post-increment
+
+ ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
+ vaddw.u8 q8, q8, d26
+ vaddw.u8 q9, q9, d27
+
+ ; clip_pixel
+ vqmovun.s16 d26, q8
+ vqmovun.s16 d27, q9
+
+ ; do the stores in reverse order with negative post-increment, by changing
+ ; the sign of the stride
+ rsb r2, r2, #0
+ vst1.32 {d27[0]}, [r1], r2
+ vst1.32 {d27[1]}, [r1], r2
+ vst1.32 {d26[1]}, [r1], r2
+ vst1.32 {d26[0]}, [r1] ; no post-increment
+ bx lr
+ ENDP ; |vp9_idct4x4_16_add_neon|
+
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm
@@ -1,0 +1,88 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_idct8x8_1_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp9_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
+; int dest_stride)
+;
+; r0 int16_t input
+; r1 uint8_t *dest
+; r2 int dest_stride)
+
+|vp9_idct8x8_1_add_neon| PROC
+ ldrsh r0, [r0]
+
+ ; generate cospi_16_64 = 11585
+ mov r12, #0x2d00
+ add r12, #0x41
+
+ ; out = dct_const_round_shift(input[0] * cospi_16_64)
+ mul r0, r0, r12 ; input[0] * cospi_16_64
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; out = dct_const_round_shift(out * cospi_16_64)
+ mul r0, r0, r12 ; out * cospi_16_64
+ mov r12, r1 ; save dest
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; a1 = ROUND_POWER_OF_TWO(out, 5)
+ add r0, r0, #16 ; + (1 <<((5) - 1))
+ asr r0, r0, #5 ; >> 5
+
+ vdup.s16 q0, r0 ; duplicate a1
+
+ ; load destination data
+ vld1.64 {d2}, [r1], r2
+ vld1.64 {d3}, [r1], r2
+ vld1.64 {d4}, [r1], r2
+ vld1.64 {d5}, [r1], r2
+ vld1.64 {d6}, [r1], r2
+ vld1.64 {d7}, [r1], r2
+ vld1.64 {d16}, [r1], r2
+ vld1.64 {d17}, [r1]
+
+ vaddw.u8 q9, q0, d2 ; dest[x] + a1
+ vaddw.u8 q10, q0, d3 ; dest[x] + a1
+ vaddw.u8 q11, q0, d4 ; dest[x] + a1
+ vaddw.u8 q12, q0, d5 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r2
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r2
+ vst1.64 {d31}, [r12], r2
+
+ vaddw.u8 q9, q0, d6 ; dest[x] + a1
+ vaddw.u8 q10, q0, d7 ; dest[x] + a1
+ vaddw.u8 q11, q0, d16 ; dest[x] + a1
+ vaddw.u8 q12, q0, d17 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r2
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r2
+ vst1.64 {d31}, [r12], r2
+
+ bx lr
+ ENDP ; |vp9_idct8x8_1_add_neon|
+
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm
@@ -1,0 +1,519 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp9_idct8x8_64_add_neon|
+ EXPORT |vp9_idct8x8_10_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+ ; Parallel 1D IDCT on all the columns of an 8x8 16-bit data matrix which
+ ; is loaded in q8-q15. The output will be stored back into q8-q15. This
+ ; macro will touch the q0-q7 registers and use them as a buffer during
+ ; the calculation.
+ MACRO
+ IDCT8x8_1D
+ ; stage 1
+ vdup.16 d0, r3 ; duplicate cospi_28_64
+ vdup.16 d1, r4 ; duplicate cospi_4_64
+ vdup.16 d2, r5 ; duplicate cospi_12_64
+ vdup.16 d3, r6 ; duplicate cospi_20_64
+
+ ; input[1] * cospi_28_64
+ vmull.s16 q2, d18, d0
+ vmull.s16 q3, d19, d0
+
+ ; input[5] * cospi_12_64
+ vmull.s16 q5, d26, d2
+ vmull.s16 q6, d27, d2
+
+ ; input[1]*cospi_28_64-input[7]*cospi_4_64
+ vmlsl.s16 q2, d30, d1
+ vmlsl.s16 q3, d31, d1
+
+ ; input[5] * cospi_12_64 - input[3] * cospi_20_64
+ vmlsl.s16 q5, d22, d3
+ vmlsl.s16 q6, d23, d3
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d8, q2, #14 ; >> 14
+ vqrshrn.s32 d9, q3, #14 ; >> 14
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d10, q5, #14 ; >> 14
+ vqrshrn.s32 d11, q6, #14 ; >> 14
+
+ ; input[1] * cospi_4_64
+ vmull.s16 q2, d18, d1
+ vmull.s16 q3, d19, d1
+
+ ; input[5] * cospi_20_64
+ vmull.s16 q9, d26, d3
+ vmull.s16 q13, d27, d3
+
+ ; input[1]*cospi_4_64+input[7]*cospi_28_64
+ vmlal.s16 q2, d30, d0
+ vmlal.s16 q3, d31, d0
+
+ ; input[5] * cospi_20_64 + input[3] * cospi_12_64
+ vmlal.s16 q9, d22, d2
+ vmlal.s16 q13, d23, d2
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d14, q2, #14 ; >> 14
+ vqrshrn.s32 d15, q3, #14 ; >> 14
+
+ ; stage 2 & stage 3 - even half
+ vdup.16 d0, r7 ; duplicate cospi_16_64
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d12, q9, #14 ; >> 14
+ vqrshrn.s32 d13, q13, #14 ; >> 14
+
+ ; input[0] * cospi_16_64
+ vmull.s16 q2, d16, d0
+ vmull.s16 q3, d17, d0
+
+ ; input[0] * cospi_16_64
+ vmull.s16 q13, d16, d0
+ vmull.s16 q15, d17, d0
+
+ ; (input[0] + input[4]) * cospi_16_64
+ vmlal.s16 q2, d24, d0
+ vmlal.s16 q3, d25, d0
+
+ ; (input[0] - input[4]) * cospi_16_64
+ vmlsl.s16 q13, d24, d0
+ vmlsl.s16 q15, d25, d0
+
+ vdup.16 d0, r8 ; duplicate cospi_24_64
+ vdup.16 d1, r9 ; duplicate cospi_8_64
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d18, q2, #14 ; >> 14
+ vqrshrn.s32 d19, q3, #14 ; >> 14
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d22, q13, #14 ; >> 14
+ vqrshrn.s32 d23, q15, #14 ; >> 14
+
+ ; input[2] * cospi_24_64 - input[6] * cospi_8_64
+ ; input[2] * cospi_24_64
+ vmull.s16 q2, d20, d0
+ vmull.s16 q3, d21, d0
+
+ ; input[2] * cospi_8_64
+ vmull.s16 q8, d20, d1
+ vmull.s16 q12, d21, d1
+
+ ; input[2] * cospi_24_64 - input[6] * cospi_8_64
+ vmlsl.s16 q2, d28, d1
+ vmlsl.s16 q3, d29, d1
+
+ ; input[2] * cospi_8_64 + input[6] * cospi_24_64
+ vmlal.s16 q8, d28, d0
+ vmlal.s16 q12, d29, d0
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d26, q2, #14 ; >> 14
+ vqrshrn.s32 d27, q3, #14 ; >> 14
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d30, q8, #14 ; >> 14
+ vqrshrn.s32 d31, q12, #14 ; >> 14
+
+ vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3]
+ vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2]
+ vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2]
+ vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3]
+
+ ; stage 3 - odd half
+ vdup.16 d16, r7 ; duplicate cospi_16_64
+
+ ; stage 2 - odd half
+ vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]
+ vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]
+ vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]
+ vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7]
+
+ ; step2[6] * cospi_16_64
+ vmull.s16 q9, d28, d16
+ vmull.s16 q10, d29, d16
+
+ ; step2[6] * cospi_16_64
+ vmull.s16 q11, d28, d16
+ vmull.s16 q12, d29, d16
+
+ ; (step2[6] - step2[5]) * cospi_16_64
+ vmlsl.s16 q9, d26, d16
+ vmlsl.s16 q10, d27, d16
+
+ ; (step2[5] + step2[6]) * cospi_16_64
+ vmlal.s16 q11, d26, d16
+ vmlal.s16 q12, d27, d16
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d10, q9, #14 ; >> 14
+ vqrshrn.s32 d11, q10, #14 ; >> 14
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d12, q11, #14 ; >> 14
+ vqrshrn.s32 d13, q12, #14 ; >> 14
+
+ ; stage 4
+ vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];
+ vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6];
+ vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5];
+ vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4];
+ vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4];
+ vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5];
+ vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6];
+ vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7];
+ MEND
+
+ ; Transpose an 8x8 16-bit data matrix. The data is loaded in q8-q15.
+ MACRO
+ TRANSPOSE8X8
+ vswp d17, d24 ; swap the off-diagonal 4x4 quarters (64-bit halves)
+ vswp d23, d30
+ vswp d21, d28
+ vswp d19, d26
+ vtrn.32 q8, q10 ; transpose the 2x2 blocks of 32-bit element pairs
+ vtrn.32 q9, q11
+ vtrn.32 q12, q14
+ vtrn.32 q13, q15
+ vtrn.16 q8, q9 ; transpose the remaining 16-bit element pairs
+ vtrn.16 q10, q11
+ vtrn.16 q12, q13
+ vtrn.16 q14, q15
+ MEND
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void vp9_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;
+; r0 int16_t input
+; r1 uint8_t *dest
+; r2 int dest_stride)
+
+|vp9_idct8x8_64_add_neon| PROC
+ push {r4-r9}
+ vpush {d8-d15}
+ vld1.s16 {q8,q9}, [r0]!
+ vld1.s16 {q10,q11}, [r0]!
+ vld1.s16 {q12,q13}, [r0]!
+ vld1.s16 {q14,q15}, [r0]!
+
+ ; transpose the input data
+ TRANSPOSE8X8
+
+ ; generate cospi_28_64 = 3196
+ mov r3, #0x0c00
+ add r3, #0x7c
+
+ ; generate cospi_4_64 = 16069
+ mov r4, #0x3e00
+ add r4, #0xc5
+
+ ; generate cospi_12_64 = 13623
+ mov r5, #0x3500
+ add r5, #0x37
+
+ ; generate cospi_20_64 = 9102
+ mov r6, #0x2300
+ add r6, #0x8e
+
+ ; generate cospi_16_64 = 11585
+ mov r7, #0x2d00
+ add r7, #0x41
+
+ ; generate cospi_24_64 = 6270
+ mov r8, #0x1800
+ add r8, #0x7e
+
+ ; generate cospi_8_64 = 15137
+ mov r9, #0x3b00
+ add r9, #0x21
+
+ ; First transform rows
+ IDCT8x8_1D
+
+ ; Transpose the matrix
+ TRANSPOSE8X8
+
+ ; Then transform columns
+ IDCT8x8_1D
+
+ ; ROUND_POWER_OF_TWO(temp_out[j], 5)
+ vrshr.s16 q8, q8, #5
+ vrshr.s16 q9, q9, #5
+ vrshr.s16 q10, q10, #5
+ vrshr.s16 q11, q11, #5
+ vrshr.s16 q12, q12, #5
+ vrshr.s16 q13, q13, #5
+ vrshr.s16 q14, q14, #5
+ vrshr.s16 q15, q15, #5
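+ ; (vrshr.s16 ..., #5 is exactly ROUND_POWER_OF_TWO(x, 5): it adds
+ ; 1 << 4 and then shifts right arithmetically by 5.)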
+
+ ; save dest pointer
+ mov r0, r1
+
+ ; load destination data
+ vld1.64 {d0}, [r1], r2
+ vld1.64 {d1}, [r1], r2
+ vld1.64 {d2}, [r1], r2
+ vld1.64 {d3}, [r1], r2
+ vld1.64 {d4}, [r1], r2
+ vld1.64 {d5}, [r1], r2
+ vld1.64 {d6}, [r1], r2
+ vld1.64 {d7}, [r1]
+
+ ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+ vaddw.u8 q8, q8, d0
+ vaddw.u8 q9, q9, d1
+ vaddw.u8 q10, q10, d2
+ vaddw.u8 q11, q11, d3
+ vaddw.u8 q12, q12, d4
+ vaddw.u8 q13, q13, d5
+ vaddw.u8 q14, q14, d6
+ vaddw.u8 q15, q15, d7
+
+ ; clip_pixel
+ vqmovun.s16 d0, q8
+ vqmovun.s16 d1, q9
+ vqmovun.s16 d2, q10
+ vqmovun.s16 d3, q11
+ vqmovun.s16 d4, q12
+ vqmovun.s16 d5, q13
+ vqmovun.s16 d6, q14
+ vqmovun.s16 d7, q15
+
+ ; store the data
+ vst1.64 {d0}, [r0], r2
+ vst1.64 {d1}, [r0], r2
+ vst1.64 {d2}, [r0], r2
+ vst1.64 {d3}, [r0], r2
+ vst1.64 {d4}, [r0], r2
+ vst1.64 {d5}, [r0], r2
+ vst1.64 {d6}, [r0], r2
+ vst1.64 {d7}, [r0], r2
+
+ vpop {d8-d15}
+ pop {r4-r9}
+ bx lr
+ ENDP ; |vp9_idct8x8_64_add_neon|
+
+;void vp9_idct8x8_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;
+; r0 int16_t input
+; r1 uint8_t *dest
+; r2 int dest_stride)
+
+|vp9_idct8x8_10_add_neon| PROC
+ push {r4-r9}
+ vpush {d8-d15}
+ vld1.s16 {q8,q9}, [r0]!
+ vld1.s16 {q10,q11}, [r0]!
+ vld1.s16 {q12,q13}, [r0]!
+ vld1.s16 {q14,q15}, [r0]!
+
+ ; transpose the input data
+ TRANSPOSE8X8
+
+ ; generate cospi_28_64 = 3196
+ mov r3, #0x0c00
+ add r3, #0x7c
+
+ ; generate cospi_4_64 = 16069
+ mov r4, #0x3e00
+ add r4, #0xc5
+
+ ; generate cospi_12_64 = 13623
+ mov r5, #0x3500
+ add r5, #0x37
+
+ ; generate cospi_20_64 = 9102
+ mov r6, #0x2300
+ add r6, #0x8e
+
+ ; generate cospi_16_64 = 11585
+ mov r7, #0x2d00
+ add r7, #0x41
+
+ ; generate cospi_24_64 = 6270
+ mov r8, #0x1800
+ add r8, #0x7e
+
+ ; generate cospi_8_64 = 15137
+ mov r9, #0x3b00
+ add r9, #0x21
+
+ ; First transform rows
+ ; stage 1
+ ; The following instructions use vqrdmulh to do the
+ ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh does a doubling
+ ; multiply and shifts the result right by 16 bits instead of 14, so the
+ ; constants are doubled before multiplying to compensate for this.
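+ ; That is, vqrdmulh.s16 computes sat((2 * a * b + (1 << 15)) >> 16), so
+ ; with b = 2 * c it yields (a * c + (1 << 13)) >> 14, which matches
+ ; dct_const_round_shift(a * c) up to saturation.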
+ mov r12, r3, lsl #1
+ vdup.16 q0, r12 ; duplicate cospi_28_64*2
+ mov r12, r4, lsl #1
+ vdup.16 q1, r12 ; duplicate cospi_4_64*2
+
+ ; dct_const_round_shift(input[1] * cospi_28_64)
+ vqrdmulh.s16 q4, q9, q0
+
+ mov r12, r6, lsl #1
+ rsb r12, #0
+ vdup.16 q0, r12 ; duplicate -cospi_20_64*2
+
+ ; dct_const_round_shift(input[1] * cospi_4_64)
+ vqrdmulh.s16 q7, q9, q1
+
+ mov r12, r5, lsl #1
+ vdup.16 q1, r12 ; duplicate cospi_12_64*2
+
+ ; dct_const_round_shift(- input[3] * cospi_20_64)
+ vqrdmulh.s16 q5, q11, q0
+
+ mov r12, r7, lsl #1
+ vdup.16 q0, r12 ; duplicate cospi_16_64*2
+
+ ; dct_const_round_shift(input[3] * cospi_12_64)
+ vqrdmulh.s16 q6, q11, q1
+
+ ; stage 2 & stage 3 - even half
+ mov r12, r8, lsl #1
+ vdup.16 q1, r12 ; duplicate cospi_24_64*2
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrdmulh.s16 q9, q8, q0
+
+ mov r12, r9, lsl #1
+ vdup.16 q0, r12 ; duplicate cospi_8_64*2
+
+ ; dct_const_round_shift(input[2] * cospi_24_64)
+ vqrdmulh.s16 q13, q10, q1
+
+ ; dct_const_round_shift(input[2] * cospi_8_64)
+ vqrdmulh.s16 q15, q10, q0
+
+ ; stage 3 - odd half
+ vdup.16 d16, r7 ; duplicate cospi_16_64
+
+ vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3]
+ vadd.s16 q1, q9, q13 ; output[1] = step[1] + step[2]
+ vsub.s16 q2, q9, q13 ; output[2] = step[1] - step[2]
+ vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3]
+
+ ; stage 2 - odd half
+ vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]
+ vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]
+ vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]
+ vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7]
+
+ ; step2[6] * cospi_16_64
+ vmull.s16 q9, d28, d16
+ vmull.s16 q10, d29, d16
+
+ ; step2[6] * cospi_16_64
+ vmull.s16 q11, d28, d16
+ vmull.s16 q12, d29, d16
+
+ ; (step2[6] - step2[5]) * cospi_16_64
+ vmlsl.s16 q9, d26, d16
+ vmlsl.s16 q10, d27, d16
+
+ ; (step2[5] + step2[6]) * cospi_16_64
+ vmlal.s16 q11, d26, d16
+ vmlal.s16 q12, d27, d16
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d10, q9, #14 ; >> 14
+ vqrshrn.s32 d11, q10, #14 ; >> 14
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d12, q11, #14 ; >> 14
+ vqrshrn.s32 d13, q12, #14 ; >> 14
+
+ ; stage 4
+ vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];
+ vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6];
+ vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5];
+ vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4];
+ vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4];
+ vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5];
+ vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6];
+ vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7];
+
+ ; Transpose the matrix
+ TRANSPOSE8X8
+
+ ; Then transform columns
+ IDCT8x8_1D
+
+ ; ROUND_POWER_OF_TWO(temp_out[j], 5)
+ vrshr.s16 q8, q8, #5
+ vrshr.s16 q9, q9, #5
+ vrshr.s16 q10, q10, #5
+ vrshr.s16 q11, q11, #5
+ vrshr.s16 q12, q12, #5
+ vrshr.s16 q13, q13, #5
+ vrshr.s16 q14, q14, #5
+ vrshr.s16 q15, q15, #5
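+
+ ; (For reference, vp9's C macro is
+ ; ROUND_POWER_OF_TWO(value, n) == (((value) + (1 << ((n) - 1))) >> (n)),
+ ; so vrshr.s16 #5 computes (value + 16) >> 5 per lane.)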
+
+ ; save dest pointer
+ mov r0, r1
+
+ ; load destination data
+ vld1.64 {d0}, [r1], r2
+ vld1.64 {d1}, [r1], r2
+ vld1.64 {d2}, [r1], r2
+ vld1.64 {d3}, [r1], r2
+ vld1.64 {d4}, [r1], r2
+ vld1.64 {d5}, [r1], r2
+ vld1.64 {d6}, [r1], r2
+ vld1.64 {d7}, [r1]
+
+ ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+ vaddw.u8 q8, q8, d0
+ vaddw.u8 q9, q9, d1
+ vaddw.u8 q10, q10, d2
+ vaddw.u8 q11, q11, d3
+ vaddw.u8 q12, q12, d4
+ vaddw.u8 q13, q13, d5
+ vaddw.u8 q14, q14, d6
+ vaddw.u8 q15, q15, d7
+
+ ; clip_pixel
+ vqmovun.s16 d0, q8
+ vqmovun.s16 d1, q9
+ vqmovun.s16 d2, q10
+ vqmovun.s16 d3, q11
+ vqmovun.s16 d4, q12
+ vqmovun.s16 d5, q13
+ vqmovun.s16 d6, q14
+ vqmovun.s16 d7, q15
+
+ ; store the data
+ vst1.64 {d0}, [r0], r2
+ vst1.64 {d1}, [r0], r2
+ vst1.64 {d2}, [r0], r2
+ vst1.64 {d3}, [r0], r2
+ vst1.64 {d4}, [r0], r2
+ vst1.64 {d5}, [r0], r2
+ vst1.64 {d6}, [r0], r2
+ vst1.64 {d7}, [r0], r2
+
+ vpop {d8-d15}
+ pop {r4-r9}
+ bx lr
+ ENDP ; |vp9_idct8x8_10_add_neon|
+
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm
@@ -1,0 +1,237 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp9_iht4x4_16_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+ ; Parallel 1D IDCT on all the columns of a 4x4 16-bit data matrix, which
+ ; is loaded in d16-d19. d0 must contain cospi_8_64, d1 cospi_16_64 and
+ ; d2 cospi_24_64. The output will be stored back into d16-d19. This
+ ; macro will touch q10-q15 and use them as scratch registers during the
+ ; calculation.
+ MACRO
+ IDCT4x4_1D
+ ; stage 1
+ vadd.s16 d23, d16, d18 ; (input[0] + input[2])
+ vsub.s16 d24, d16, d18 ; (input[0] - input[2])
+
+ vmull.s16 q15, d17, d2 ; input[1] * cospi_24_64
+ vmull.s16 q10, d17, d0 ; input[1] * cospi_8_64
+ vmull.s16 q13, d23, d1 ; (input[0] + input[2]) * cospi_16_64
+ vmull.s16 q14, d24, d1 ; (input[0] - input[2]) * cospi_16_64
+ vmlsl.s16 q15, d19, d0 ; input[1] * cospi_24_64 - input[3] * cospi_8_64
+ vmlal.s16 q10, d19, d2 ; input[1] * cospi_8_64 + input[3] * cospi_24_64
+
+ ; dct_const_round_shift
+ vqrshrn.s32 d26, q13, #14
+ vqrshrn.s32 d27, q14, #14
+ vqrshrn.s32 d29, q15, #14
+ vqrshrn.s32 d28, q10, #14
+
+ ; stage 2
+ ; output[0] = step[0] + step[3];
+ ; output[1] = step[1] + step[2];
+ ; output[3] = step[0] - step[3];
+ ; output[2] = step[1] - step[2];
+ vadd.s16 q8, q13, q14
+ vsub.s16 q9, q13, q14
+ vswp d18, d19
+ MEND
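+
+ ; For reference, a sketch of the scalar idct4 this macro parallelizes
+ ; (paraphrasing vp9's C code):
+ ;   temp1 = (input[0] + input[2]) * cospi_16_64;
+ ;   temp2 = (input[0] - input[2]) * cospi_16_64;
+ ;   step[0] = dct_const_round_shift(temp1);
+ ;   step[1] = dct_const_round_shift(temp2);
+ ;   temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ ;   temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ ;   step[2] = dct_const_round_shift(temp1);
+ ;   step[3] = dct_const_round_shift(temp2);
+ ;   output[0] = step[0] + step[3];
+ ;   output[1] = step[1] + step[2];
+ ;   output[2] = step[1] - step[2];
+ ;   output[3] = step[0] - step[3];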
+
+ ; Parallel 1D IADST on all the columns of a 4x4 16-bit data matrix, which
+ ; is loaded in d16-d19. d3 must contain sinpi_1_9, d4 sinpi_2_9, d5
+ ; sinpi_4_9 and d6 sinpi_3_9; r0 must also still hold sinpi_3_9, as it
+ ; is re-broadcast as a 32-bit value mid-macro. The output will be stored
+ ; back into d16-d19. This macro will touch q10-q15 and use them as
+ ; scratch registers during the calculation.
+ MACRO
+ IADST4x4_1D
+ vmull.s16 q10, d3, d16 ; s0 = sinpi_1_9 * x0
+ vmull.s16 q11, d4, d16 ; s1 = sinpi_2_9 * x0
+ vmull.s16 q12, d6, d17 ; s2 = sinpi_3_9 * x1
+ vmull.s16 q13, d5, d18 ; s3 = sinpi_4_9 * x2
+ vmull.s16 q14, d3, d18 ; s4 = sinpi_1_9 * x2
+ vmovl.s16 q15, d16 ; expand x0 from 16 bit to 32 bit
+ vaddw.s16 q15, q15, d19 ; x0 + x3
+ vmull.s16 q8, d4, d19 ; s5 = sinpi_2_9 * x3
+ vsubw.s16 q15, q15, d18 ; s7 = x0 + x3 - x2
+ vmull.s16 q9, d5, d19 ; s6 = sinpi_4_9 * x3
+
+ vadd.s32 q10, q10, q13 ; x0 = s0 + s3 + s5
+ vadd.s32 q10, q10, q8
+ vsub.s32 q11, q11, q14 ; x1 = s1 - s4 - s6
+ vdup.32 q8, r0 ; duplicate sinpi_3_9
+ vsub.s32 q11, q11, q9
+ vmul.s32 q15, q15, q8 ; x2 = sinpi_3_9 * s7
+
+ vadd.s32 q13, q10, q12 ; s0 = x0 + x3
+ vadd.s32 q10, q10, q11 ; x0 + x1
+ vadd.s32 q14, q11, q12 ; s1 = x1 + x3
+ vsub.s32 q10, q10, q12 ; s3 = x0 + x1 - x3
+
+ ; dct_const_round_shift
+ vqrshrn.s32 d16, q13, #14
+ vqrshrn.s32 d17, q14, #14
+ vqrshrn.s32 d18, q15, #14
+ vqrshrn.s32 d19, q10, #14
+ MEND
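+
+ ; For reference, a sketch of the scalar iadst4 this macro parallelizes
+ ; (paraphrasing vp9's C code; x0-x3 are the four inputs):
+ ;   s0 = sinpi_1_9 * x0;  s1 = sinpi_2_9 * x0;  s2 = sinpi_3_9 * x1;
+ ;   s3 = sinpi_4_9 * x2;  s4 = sinpi_1_9 * x2;  s5 = sinpi_2_9 * x3;
+ ;   s6 = sinpi_4_9 * x3;  s7 = x0 + x3 - x2;
+ ;   x0 = s0 + s3 + s5;  x1 = s1 - s4 - s6;  x2 = sinpi_3_9 * s7;  x3 = s2;
+ ;   output[0] = dct_const_round_shift(x0 + x3);
+ ;   output[1] = dct_const_round_shift(x1 + x3);
+ ;   output[2] = dct_const_round_shift(x2);
+ ;   output[3] = dct_const_round_shift(x0 + x1 - x3);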
+
+ ; Generate cosine constants in d0 - d2 for the IDCT.
+ MACRO
+ GENERATE_COSINE_CONSTANTS
+ ; cospi_8_64 = 15137 = 0x3b21
+ mov r0, #0x3b00
+ add r0, #0x21
+ ; cospi_16_64 = 11585 = 0x2d41
+ mov r3, #0x2d00
+ add r3, #0x41
+ ; cospi_24_64 = 6270 = 0x187e
+ mov r12, #0x1800
+ add r12, #0x7e
+
+ ; generate constant vectors
+ vdup.16 d0, r0 ; duplicate cospi_8_64
+ vdup.16 d1, r3 ; duplicate cospi_16_64
+ vdup.16 d2, r12 ; duplicate cospi_24_64
+ MEND
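+
+ ; (These are vp9's Q14 fixed-point cosines: cospi_N_64 is approximately
+ ; round(16384 * cos(N * PI / 64)), e.g. 16384 * cos(8 * PI / 64) =
+ ; 15136.9, which rounds to the 15137 above.)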
+
+ ; Generate sine constants in d3 - d7 for the IADST.
+ MACRO
+ GENERATE_SINE_CONSTANTS
+ ; sinpi_1_9 = 5283 = 0x14A3
+ mov r0, #0x1400
+ add r0, #0xa3
+ ; sinpi_2_9 = 9929 = 0x26C9
+ mov r3, #0x2600
+ add r3, #0xc9
+ ; sinpi_4_9 = 15212 = 0x3B6C
+ mov r12, #0x3b00
+ add r12, #0x6c
+
+ ; generate constant vectors
+ vdup.16 d3, r0 ; duplicate sinpi_1_9
+
+ ; sinpi_3_9 = 13377 = 0x3441
+ mov r0, #0x3400
+ add r0, #0x41
+
+ vdup.16 d4, r3 ; duplicate sinpi_2_9
+ vdup.16 d5, r12 ; duplicate sinpi_4_9
+ vdup.16 q3, r0 ; duplicate sinpi_3_9
+ MEND
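+
+ ; (These are vp9's Q14 fixed-point sines, carrying the 2/3 * sqrt(2)
+ ; scaling of the 4-point ADST: sinpi_K_9 is approximately
+ ; round(16384 * (2.0 / 3.0) * sqrt(2.0) * sin(K * PI / 9)), e.g. K = 4
+ ; gives 15212.3, which rounds to the 15212 above.)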
+
+ ; Transpose a 4x4 16-bit data matrix. Data is loaded in d16-d19.
+ MACRO
+ TRANSPOSE4X4
+ vtrn.16 d16, d17
+ vtrn.16 d18, d19
+ vtrn.32 q8, q9
+ MEND
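+
+ ; The two vtrn.16 interleave 16-bit lanes between adjacent rows and the
+ ; vtrn.32 then exchanges 32-bit halves between the row pairs, giving the
+ ; usual two-level transpose:
+ ;   [ a0 a1 a2 a3 ]      [ a0 b0 c0 d0 ]
+ ;   [ b0 b1 b2 b3 ]  ->  [ a1 b1 c1 d1 ]
+ ;   [ c0 c1 c2 c3 ]      [ a2 b2 c2 d2 ]
+ ;   [ d0 d1 d2 d3 ]      [ a3 b3 c3 d3 ]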
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void vp9_iht4x4_16_add_neon(int16_t *input, uint8_t *dest,
+; int dest_stride, int tx_type)
+;
+; r0 int16_t input
+; r1 uint8_t *dest
+; r2 int dest_stride
+; r3 int tx_type)
+; This function only handles tx_type values 1, 2 and 3.
+|vp9_iht4x4_16_add_neon| PROC
+
+ ; load the inputs into d16-d19
+ vld1.s16 {q8,q9}, [r0]!
+
+ ; transpose the input data
+ TRANSPOSE4X4
+
+ ; decide the type of transform
+ cmp r3, #2
+ beq idct_iadst
+ cmp r3, #3
+ beq iadst_iadst
+
+iadst_idct
+ ; generate constants
+ GENERATE_COSINE_CONSTANTS
+ GENERATE_SINE_CONSTANTS
+
+ ; first transform rows
+ IDCT4x4_1D
+
+ ; transpose the matrix
+ TRANSPOSE4X4
+
+ ; then transform columns
+ IADST4x4_1D
+
+ b end_vp9_iht4x4_16_add_neon
+
+idct_iadst
+ ; generate constants
+ GENERATE_COSINE_CONSTANTS
+ GENERATE_SINE_CONSTANTS
+
+ ; first transform rows
+ IADST4x4_1D
+
+ ; transpose the matrix
+ TRANSPOSE4X4
+
+ ; then transform columns
+ IDCT4x4_1D
+
+ b end_vp9_iht4x4_16_add_neon
+
+iadst_iadst
+ ; generate constants
+ GENERATE_SINE_CONSTANTS
+
+ ; first transform rows
+ IADST4x4_1D
+
+ ; transpose the matrix
+ TRANSPOSE4X4
+
+ ; then transform columns
+ IADST4x4_1D
+
+end_vp9_iht4x4_16_add_neon
+ ; ROUND_POWER_OF_TWO(temp_out[j], 4)
+ vrshr.s16 q8, q8, #4
+ vrshr.s16 q9, q9, #4
+
+ vld1.32 {d26[0]}, [r1], r2
+ vld1.32 {d26[1]}, [r1], r2
+ vld1.32 {d27[0]}, [r1], r2
+ vld1.32 {d27[1]}, [r1]
+
+ ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
+ vaddw.u8 q8, q8, d26
+ vaddw.u8 q9, q9, d27
+
+ ; clip_pixel
+ vqmovun.s16 d26, q8
+ vqmovun.s16 d27, q9
+
+ ; do the stores in reverse order with negative post-increment, by changing
+ ; the sign of the stride
+ rsb r2, r2, #0
+ vst1.32 {d27[1]}, [r1], r2
+ vst1.32 {d27[0]}, [r1], r2
+ vst1.32 {d26[1]}, [r1], r2
+ vst1.32 {d26[0]}, [r1] ; no post-increment
+ bx lr
+ ENDP ; |vp9_iht4x4_16_add_neon|
+
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm
@@ -1,0 +1,696 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp9_iht8x8_64_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+ ; Generate IADST constants in r0 - r12 for the IADST.
+ MACRO
+ GENERATE_IADST_CONSTANTS
+ ; generate cospi_2_64 = 16305
+ mov r0, #0x3f00
+ add r0, #0xb1
+
+ ; generate cospi_30_64 = 1606
+ mov r1, #0x600
+ add r1, #0x46
+
+ ; generate cospi_10_64 = 14449
+ mov r2, #0x3800
+ add r2, #0x71
+
+ ; generate cospi_22_64 = 7723
+ mov r3, #0x1e00
+ add r3, #0x2b
+
+ ; generate cospi_18_64 = 10394
+ mov r4, #0x2800
+ add r4, #0x9a
+
+ ; generate cospi_14_64 = 12665
+ mov r5, #0x3100
+ add r5, #0x79
+
+ ; generate cospi_26_64 = 4756
+ mov r6, #0x1200
+ add r6, #0x94
+
+ ; generate cospi_6_64 = 15679
+ mov r7, #0x3d00
+ add r7, #0x3f
+
+ ; generate cospi_8_64 = 15137
+ mov r8, #0x3b00
+ add r8, #0x21
+
+ ; generate cospi_24_64 = 6270
+ mov r9, #0x1800
+ add r9, #0x7e
+
+ ; generate 0
+ mov r10, #0
+
+ ; generate cospi_16_64 = 11585
+ mov r12, #0x2d00
+ add r12, #0x41
+ MEND
+
+ ; Generate IDCT constants in r3 - r9 for the IDCT.
+ MACRO
+ GENERATE_IDCT_CONSTANTS
+ ; generate cospi_28_64 = 3196
+ mov r3, #0x0c00
+ add r3, #0x7c
+
+ ; generate cospi_4_64 = 16069
+ mov r4, #0x3e00
+ add r4, #0xc5
+
+ ; generate cospi_12_64 = 13623
+ mov r5, #0x3500
+ add r5, #0x37
+
+ ; generate cospi_20_64 = 9102
+ mov r6, #0x2300
+ add r6, #0x8e
+
+ ; generate cospi_16_64 = 11585
+ mov r7, #0x2d00
+ add r7, #0x41
+
+ ; generate cospi_24_64 = 6270
+ mov r8, #0x1800
+ add r8, #0x7e
+
+ ; generate cospi_8_64 = 15137
+ mov r9, #0x3b00
+ add r9, #0x21
+ MEND
+
+ ; Transpose an 8x8 16-bit data matrix. Data is loaded in q8-q15.
+ MACRO
+ TRANSPOSE8X8
+ vswp d17, d24
+ vswp d23, d30
+ vswp d21, d28
+ vswp d19, d26
+ vtrn.32 q8, q10
+ vtrn.32 q9, q11
+ vtrn.32 q12, q14
+ vtrn.32 q13, q15
+ vtrn.16 q8, q9
+ vtrn.16 q10, q11
+ vtrn.16 q12, q13
+ vtrn.16 q14, q15
+ MEND
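+
+ ; The four vswp exchange the off-diagonal 64-bit quadrants of the 8x8
+ ; block; vtrn.32 then transposes the 2x2 grids of 32-bit pairs inside
+ ; each quadrant, and vtrn.16 finishes within the pairs, so element
+ ; (i, j) ends up at (j, i).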
+
+ ; Parallel 1D IDCT on all the columns of an 8x8 16-bit data matrix, which
+ ; is loaded in q8-q15. The IDCT constants are loaded in r3 - r9. The
+ ; output will be stored back into q8-q15. This macro will touch q0-q7
+ ; and use them as scratch registers during the calculation.
+ MACRO
+ IDCT8x8_1D
+ ; stage 1
+ vdup.16 d0, r3 ; duplicate cospi_28_64
+ vdup.16 d1, r4 ; duplicate cospi_4_64
+ vdup.16 d2, r5 ; duplicate cospi_12_64
+ vdup.16 d3, r6 ; duplicate cospi_20_64
+
+ ; input[1] * cospi_28_64
+ vmull.s16 q2, d18, d0
+ vmull.s16 q3, d19, d0
+
+ ; input[5] * cospi_12_64
+ vmull.s16 q5, d26, d2
+ vmull.s16 q6, d27, d2
+
+ ; input[1] * cospi_28_64 - input[7] * cospi_4_64
+ vmlsl.s16 q2, d30, d1
+ vmlsl.s16 q3, d31, d1
+
+ ; input[5] * cospi_12_64 - input[3] * cospi_20_64
+ vmlsl.s16 q5, d22, d3
+ vmlsl.s16 q6, d23, d3
+
+ ; step1[4] = dct_const_round_shift(temp1)
+ vqrshrn.s32 d8, q2, #14 ; >> 14
+ vqrshrn.s32 d9, q3, #14 ; >> 14
+
+ ; step1[5] = dct_const_round_shift(temp1)
+ vqrshrn.s32 d10, q5, #14 ; >> 14
+ vqrshrn.s32 d11, q6, #14 ; >> 14
+
+ ; input[1] * cospi_4_64
+ vmull.s16 q2, d18, d1
+ vmull.s16 q3, d19, d1
+
+ ; input[5] * cospi_20_64
+ vmull.s16 q9, d26, d3
+ vmull.s16 q13, d27, d3
+
+ ; input[1] * cospi_4_64 + input[7] * cospi_28_64
+ vmlal.s16 q2, d30, d0
+ vmlal.s16 q3, d31, d0
+
+ ; input[5] * cospi_20_64 + input[3] * cospi_12_64
+ vmlal.s16 q9, d22, d2
+ vmlal.s16 q13, d23, d2
+
+ ; step1[7] = dct_const_round_shift(temp2)
+ vqrshrn.s32 d14, q2, #14 ; >> 14
+ vqrshrn.s32 d15, q3, #14 ; >> 14
+
+ ; stage 2 & stage 3 - even half
+ vdup.16 d0, r7 ; duplicate cospi_16_64
+
+ ; step1[6] = dct_const_round_shift(temp2)
+ vqrshrn.s32 d12, q9, #14 ; >> 14
+ vqrshrn.s32 d13, q13, #14 ; >> 14
+
+ ; input[0] * cospi_16_64
+ vmull.s16 q2, d16, d0
+ vmull.s16 q3, d17, d0
+
+ ; input[0] * cospi_16_64
+ vmull.s16 q13, d16, d0
+ vmull.s16 q15, d17, d0
+
+ ; (input[0] + input[4]) * cospi_16_64
+ vmlal.s16 q2, d24, d0
+ vmlal.s16 q3, d25, d0
+
+ ; (input[0] - input[4]) * cospi_16_64
+ vmlsl.s16 q13, d24, d0
+ vmlsl.s16 q15, d25, d0
+
+ vdup.16 d0, r8 ; duplicate cospi_24_64
+ vdup.16 d1, r9 ; duplicate cospi_8_64
+
+ ; step2[0] = dct_const_round_shift(temp1)
+ vqrshrn.s32 d18, q2, #14 ; >> 14
+ vqrshrn.s32 d19, q3, #14 ; >> 14
+
+ ; step2[1] = dct_const_round_shift(temp2)
+ vqrshrn.s32 d22, q13, #14 ; >> 14
+ vqrshrn.s32 d23, q15, #14 ; >> 14
+
+ ; input[2] * cospi_24_64
+ vmull.s16 q2, d20, d0
+ vmull.s16 q3, d21, d0
+
+ ; input[2] * cospi_8_64
+ vmull.s16 q8, d20, d1
+ vmull.s16 q12, d21, d1
+
+ ; input[2] * cospi_24_64 - input[6] * cospi_8_64
+ vmlsl.s16 q2, d28, d1
+ vmlsl.s16 q3, d29, d1
+
+ ; input[2] * cospi_8_64 + input[6] * cospi_24_64
+ vmlal.s16 q8, d28, d0
+ vmlal.s16 q12, d29, d0
+
+ ; step2[2] = dct_const_round_shift(temp1)
+ vqrshrn.s32 d26, q2, #14 ; >> 14
+ vqrshrn.s32 d27, q3, #14 ; >> 14
+
+ ; step2[3] = dct_const_round_shift(temp2)
+ vqrshrn.s32 d30, q8, #14 ; >> 14
+ vqrshrn.s32 d31, q12, #14 ; >> 14
+
+ vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3]
+ vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2]
+ vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2]
+ vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3]
+
+ ; stage 3 - odd half
+ vdup.16 d16, r7 ; duplicate cospi_16_64
+
+ ; stage 2 - odd half
+ vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]
+ vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]
+ vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]
+ vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7]
+
+ ; step2[6] * cospi_16_64
+ vmull.s16 q9, d28, d16
+ vmull.s16 q10, d29, d16
+
+ ; step2[6] * cospi_16_64
+ vmull.s16 q11, d28, d16
+ vmull.s16 q12, d29, d16
+
+ ; (step2[6] - step2[5]) * cospi_16_64
+ vmlsl.s16 q9, d26, d16
+ vmlsl.s16 q10, d27, d16
+
+ ; (step2[5] + step2[6]) * cospi_16_64
+ vmlal.s16 q11, d26, d16
+ vmlal.s16 q12, d27, d16
+
+ ; step1[5] = dct_const_round_shift(temp1)
+ vqrshrn.s32 d10, q9, #14 ; >> 14
+ vqrshrn.s32 d11, q10, #14 ; >> 14
+
+ ; step1[6] = dct_const_round_shift(temp2)
+ vqrshrn.s32 d12, q11, #14 ; >> 14
+ vqrshrn.s32 d13, q12, #14 ; >> 14
+
+ ; stage 4
+ vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];
+ vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6];
+ vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5];
+ vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4];
+ vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4];
+ vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5];
+ vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6];
+ vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7];
+ MEND
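+
+ ; For orientation, a sketch of the scalar idct8 stages this macro
+ ; parallelizes (paraphrasing vp9's C code; _64 suffixes omitted,
+ ; round = dct_const_round_shift):
+ ;   stage 1: step1[4] = round(in[1]*cospi_28 - in[7]*cospi_4), plus the
+ ;            analogous lines for step1[5], step1[6] and step1[7]
+ ;   stage 2: even: step2[0..1] = round((in[0] +/- in[4]) * cospi_16),
+ ;            step2[2..3] from in[2]/in[6] via cospi_24 and cospi_8;
+ ;            odd: step2[4..7] = sums/differences of step1[4..7]
+ ;   stage 3: even: step1[0..3] = sums/differences of step2[0..3];
+ ;            odd: step1[5..6] = round((step2[6] -/+ step2[5]) * cospi_16)
+ ;   stage 4: output[i] = step1[i] +/- step1[7 - i]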
+
+ ; Parallel 1D IADST on all the columns of an 8x8 16-bit data matrix,
+ ; which is loaded in q8-q15. The IADST constants are loaded in the
+ ; r0 - r12 registers. The output will be stored back into q8-q15. This
+ ; macro will touch q0-q7 and use them as scratch registers during the
+ ; calculation.
+ MACRO
+ IADST8X8_1D
+ vdup.16 d14, r0 ; duplicate cospi_2_64
+ vdup.16 d15, r1 ; duplicate cospi_30_64
+
+ ; cospi_2_64 * x0
+ vmull.s16 q1, d30, d14
+ vmull.s16 q2, d31, d14
+
+ ; cospi_30_64 * x0
+ vmull.s16 q3, d30, d15
+ vmull.s16 q4, d31, d15
+
+ vdup.16 d30, r4 ; duplicate cospi_18_64
+ vdup.16 d31, r5 ; duplicate cospi_14_64
+
+ ; s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+ vmlal.s16 q1, d16, d15
+ vmlal.s16 q2, d17, d15
+
+ ; s1 = cospi_30_64 * x0 - cospi_2_64 * x1
+ vmlsl.s16 q3, d16, d14
+ vmlsl.s16 q4, d17, d14
+
+ ; cospi_18_64 * x4
+ vmull.s16 q5, d22, d30
+ vmull.s16 q6, d23, d30
+
+ ; cospi_14_64 * x4
+ vmull.s16 q7, d22, d31
+ vmull.s16 q8, d23, d31
+
+ ; s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+ vmlal.s16 q5, d24, d31
+ vmlal.s16 q6, d25, d31
+
+ ; s5 = cospi_14_64 * x4 - cospi_18_64 * x5
+ vmlsl.s16 q7, d24, d30
+ vmlsl.s16 q8, d25, d30
+
+ ; (s0 + s4)
+ vadd.s32 q11, q1, q5
+ vadd.s32 q12, q2, q6
+
+ vdup.16 d0, r2 ; duplicate cospi_10_64
+ vdup.16 d1, r3 ; duplicate cospi_22_64
+
+ ; (s0 - s4)
+ vsub.s32 q1, q1, q5
+ vsub.s32 q2, q2, q6
+
+ ; x0 = dct_const_round_shift(s0 + s4);
+ vqrshrn.s32 d22, q11, #14 ; >> 14
+ vqrshrn.s32 d23, q12, #14 ; >> 14
+
+ ; (s1 + s5)
+ vadd.s32 q12, q3, q7
+ vadd.s32 q15, q4, q8
+
+ ; (s1 - s5)
+ vsub.s32 q3, q3, q7
+ vsub.s32 q4, q4, q8
+
+ ; x4 = dct_const_round_shift(s0 - s4);
+ vqrshrn.s32 d2, q1, #14 ; >> 14
+ vqrshrn.s32 d3, q2, #14 ; >> 14
+
+ ; x1 = dct_const_round_shift(s1 + s5);
+ vqrshrn.s32 d24, q12, #14 ; >> 14
+ vqrshrn.s32 d25, q15, #14 ; >> 14
+
+ ; x5 = dct_const_round_shift(s1 - s5);
+ vqrshrn.s32 d6, q3, #14 ; >> 14
+ vqrshrn.s32 d7, q4, #14 ; >> 14
+
+ ; cospi_10_64 * x2
+ vmull.s16 q4, d26, d0
+ vmull.s16 q5, d27, d0
+
+ ; cospi_22_64 * x2
+ vmull.s16 q2, d26, d1
+ vmull.s16 q6, d27, d1
+
+ vdup.16 d30, r6 ; duplicate cospi_26_64
+ vdup.16 d31, r7 ; duplicate cospi_6_64
+
+ ; s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+ vmlal.s16 q4, d20, d1
+ vmlal.s16 q5, d21, d1
+
+ ; s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+ vmlsl.s16 q2, d20, d0
+ vmlsl.s16 q6, d21, d0
+
+ ; cospi_26_64 * x6
+ vmull.s16 q0, d18, d30
+ vmull.s16 q13, d19, d30
+
+ ; s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+ vmlal.s16 q0, d28, d31
+ vmlal.s16 q13, d29, d31
+
+ ; cospi_6_64 * x6
+ vmull.s16 q10, d18, d31
+ vmull.s16 q9, d19, d31
+
+ ; s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+ vmlsl.s16 q10, d28, d30
+ vmlsl.s16 q9, d29, d30
+
+ ; (s3 + s7)
+ vadd.s32 q14, q2, q10
+ vadd.s32 q15, q6, q9
+
+ ; (s3 - s7)
+ vsub.s32 q2, q2, q10
+ vsub.s32 q6, q6, q9
+
+ ; x3 = dct_const_round_shift(s3 + s7);
+ vqrshrn.s32 d28, q14, #14 ; >> 14
+ vqrshrn.s32 d29, q15, #14 ; >> 14
+
+ ; x7 = dct_const_round_shift(s3 - s7);
+ vqrshrn.s32 d4, q2, #14 ; >> 14
+ vqrshrn.s32 d5, q6, #14 ; >> 14
+
+ ; (s2 + s6)
+ vadd.s32 q9, q4, q0
+ vadd.s32 q10, q5, q13
+
+ ; (s2 - s6)
+ vsub.s32 q4, q4, q0
+ vsub.s32 q5, q5, q13
+
+ vdup.16 d30, r8 ; duplicate cospi_8_64
+ vdup.16 d31, r9 ; duplicate cospi_24_64
+
+ ; x2 = dct_const_round_shift(s2 + s6);
+ vqrshrn.s32 d18, q9, #14 ; >> 14
+ vqrshrn.s32 d19, q10, #14 ; >> 14
+
+ ; x6 = dct_const_round_shift(s2 - s6);
+ vqrshrn.s32 d8, q4, #14 ; >> 14
+ vqrshrn.s32 d9, q5, #14 ; >> 14
+
+ ; cospi_8_64 * x4
+ vmull.s16 q5, d2, d30
+ vmull.s16 q6, d3, d30
+
+ ; cospi_24_64 * x4
+ vmull.s16 q7, d2, d31
+ vmull.s16 q0, d3, d31
+
+ ; s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+ vmlal.s16 q5, d6, d31
+ vmlal.s16 q6, d7, d31
+
+ ; s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+ vmlsl.s16 q7, d6, d30
+ vmlsl.s16 q0, d7, d30
+
+ ; cospi_8_64 * x7
+ vmull.s16 q1, d4, d30
+ vmull.s16 q3, d5, d30
+
+ ; cospi_24_64 * x7
+ vmull.s16 q10, d4, d31
+ vmull.s16 q2, d5, d31
+
+ ; s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+ vmlsl.s16 q1, d8, d31
+ vmlsl.s16 q3, d9, d31
+
+ ; s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+ vmlal.s16 q10, d8, d30
+ vmlal.s16 q2, d9, d30
+
+ vadd.s16 q8, q11, q9 ; x0 = s0 + s2;
+
+ vsub.s16 q11, q11, q9 ; x2 = s0 - s2;
+
+ vadd.s16 q4, q12, q14 ; x1 = s1 + s3;
+
+ vsub.s16 q12, q12, q14 ; x3 = s1 - s3;
+
+ ; (s4 + s6)
+ vadd.s32 q14, q5, q1
+ vadd.s32 q15, q6, q3
+
+ ; (s4 - s6)
+ vsub.s32 q5, q5, q1
+ vsub.s32 q6, q6, q3
+
+ ; x4 = dct_const_round_shift(s4 + s6);
+ vqrshrn.s32 d18, q14, #14 ; >> 14
+ vqrshrn.s32 d19, q15, #14 ; >> 14
+
+ ; x6 = dct_const_round_shift(s4 - s6);
+ vqrshrn.s32 d10, q5, #14 ; >> 14
+ vqrshrn.s32 d11, q6, #14 ; >> 14
+
+ ; (s5 + s7)
+ vadd.s32 q1, q7, q10
+ vadd.s32 q3, q0, q2
+
+ ; (s5 - s7)
+ vsub.s32 q7, q7, q10
+ vsub.s32 q0, q0, q2
+
+ ; x5 = dct_const_round_shift(s5 + s7);
+ vqrshrn.s32 d28, q1, #14 ; >> 14
+ vqrshrn.s32 d29, q3, #14 ; >> 14
+
+ ; x7 = dct_const_round_shift(s5 - s7);
+ vqrshrn.s32 d14, q7, #14 ; >> 14
+ vqrshrn.s32 d15, q0, #14 ; >> 14
+
+ vdup.16 d30, r12 ; duplicate cospi_16_64
+
+ ; cospi_16_64 * x2
+ vmull.s16 q2, d22, d30
+ vmull.s16 q3, d23, d30
+
+ ; cospi_16_64 * x2
+ vmull.s16 q13, d22, d30
+ vmull.s16 q1, d23, d30
+
+ ; cospi_16_64 * x2 + cospi_16_64 * x3;
+ vmlal.s16 q2, d24, d30
+ vmlal.s16 q3, d25, d30
+
+ ; cospi_16_64 * x2 - cospi_16_64 * x3;
+ vmlsl.s16 q13, d24, d30
+ vmlsl.s16 q1, d25, d30
+
+ ; x2 = dct_const_round_shift(s2);
+ vqrshrn.s32 d4, q2, #14 ; >> 14
+ vqrshrn.s32 d5, q3, #14 ; >> 14
+
+ ; x3 = dct_const_round_shift(s3);
+ vqrshrn.s32 d24, q13, #14 ; >> 14
+ vqrshrn.s32 d25, q1, #14 ; >> 14
+
+ ; cospi_16_64 * x6
+ vmull.s16 q13, d10, d30
+ vmull.s16 q1, d11, d30
+
+ ; cospi_16_64 * x6
+ vmull.s16 q11, d10, d30
+ vmull.s16 q0, d11, d30
+
+ ; cospi_16_64 * x6 + cospi_16_64 * x7;
+ vmlal.s16 q13, d14, d30
+ vmlal.s16 q1, d15, d30
+
+ ; cospi_16_64 * x6 - cospi_16_64 * x7;
+ vmlsl.s16 q11, d14, d30
+ vmlsl.s16 q0, d15, d30
+
+ ; x6 = dct_const_round_shift(s6);
+ vqrshrn.s32 d20, q13, #14 ; >> 14
+ vqrshrn.s32 d21, q1, #14 ; >> 14
+
+ ; x7 = dct_const_round_shift(s7);
+ vqrshrn.s32 d12, q11, #14 ; >> 14
+ vqrshrn.s32 d13, q0, #14 ; >> 14
+
+ vdup.16 q5, r10 ; duplicate 0
+
+ vsub.s16 q9, q5, q9 ; output[1] = -x4;
+ vsub.s16 q11, q5, q2 ; output[3] = -x2;
+ vsub.s16 q13, q5, q6 ; output[5] = -x7;
+ vsub.s16 q15, q5, q4 ; output[7] = -x1;
+ MEND
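+
+ ; On exit q8-q15 hold output[0..7]. In vp9's C iadst8 the even outputs
+ ; pass through unchanged (output[0] = x0, output[2] = x6, output[4] = x3,
+ ; output[6] = x5) while the odd outputs are the negations formed above.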
+
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void vp9_iht8x8_64_add_neon(int16_t *input, uint8_t *dest,
+; int dest_stride, int tx_type)
+;
+; r0 int16_t input
+; r1 uint8_t *dest
+; r2 int dest_stride
+; r3 int tx_type)
+; This function only handles tx_type values 1, 2 and 3.
+|vp9_iht8x8_64_add_neon| PROC
+
+ ; load the inputs into q8-q15
+ vld1.s16 {q8,q9}, [r0]!
+ vld1.s16 {q10,q11}, [r0]!
+ vld1.s16 {q12,q13}, [r0]!
+ vld1.s16 {q14,q15}, [r0]!
+
+ push {r0-r10}
+
+ ; transpose the input data
+ TRANSPOSE8X8
+
+ ; decide the type of transform
+ cmp r3, #2
+ beq idct_iadst
+ cmp r3, #3
+ beq iadst_iadst
+
+iadst_idct
+ ; generate IDCT constants
+ GENERATE_IDCT_CONSTANTS
+
+ ; first transform rows
+ IDCT8x8_1D
+
+ ; transpose the matrix
+ TRANSPOSE8X8
+
+ ; generate IADST constants
+ GENERATE_IADST_CONSTANTS
+
+ ; then transform columns
+ IADST8X8_1D
+
+ b end_vp9_iht8x8_64_add_neon
+
+idct_iadst
+ ; generate IADST constants
+ GENERATE_IADST_CONSTANTS
+
+ ; first transform rows
+ IADST8X8_1D
+
+ ; transpose the matrix
+ TRANSPOSE8X8
+
+ ; generate IDCT constants
+ GENERATE_IDCT_CONSTANTS
+
+ ; then transform columns
+ IDCT8x8_1D
+
+ b end_vp9_iht8x8_64_add_neon
+
+iadst_iadst
+ ; generate IADST constants
+ GENERATE_IADST_CONSTANTS
+
+ ; first transform rows
+ IADST8X8_1D
+
+ ; transpose the matrix
+ TRANSPOSE8X8
+
+ ; then transform columns
+ IADST8X8_1D
+
+end_vp9_iht8x8_64_add_neon
+ pop {r0-r10}
+
+ ; ROUND_POWER_OF_TWO(temp_out[j], 5)
+ vrshr.s16 q8, q8, #5
+ vrshr.s16 q9, q9, #5
+ vrshr.s16 q10, q10, #5
+ vrshr.s16 q11, q11, #5
+ vrshr.s16 q12, q12, #5
+ vrshr.s16 q13, q13, #5
+ vrshr.s16 q14, q14, #5
+ vrshr.s16 q15, q15, #5
+
+ ; save dest pointer
+ mov r0, r1
+
+ ; load destination data
+ vld1.64 {d0}, [r1], r2
+ vld1.64 {d1}, [r1], r2
+ vld1.64 {d2}, [r1], r2
+ vld1.64 {d3}, [r1], r2
+ vld1.64 {d4}, [r1], r2
+ vld1.64 {d5}, [r1], r2
+ vld1.64 {d6}, [r1], r2
+ vld1.64 {d7}, [r1]
+
+ ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+ vaddw.u8 q8, q8, d0
+ vaddw.u8 q9, q9, d1
+ vaddw.u8 q10, q10, d2
+ vaddw.u8 q11, q11, d3
+ vaddw.u8 q12, q12, d4
+ vaddw.u8 q13, q13, d5
+ vaddw.u8 q14, q14, d6
+ vaddw.u8 q15, q15, d7
+
+ ; clip_pixel
+ vqmovun.s16 d0, q8
+ vqmovun.s16 d1, q9
+ vqmovun.s16 d2, q10
+ vqmovun.s16 d3, q11
+ vqmovun.s16 d4, q12
+ vqmovun.s16 d5, q13
+ vqmovun.s16 d6, q14
+ vqmovun.s16 d7, q15
+
+ ; store the data
+ vst1.64 {d0}, [r0], r2
+ vst1.64 {d1}, [r0], r2
+ vst1.64 {d2}, [r0], r2
+ vst1.64 {d3}, [r0], r2
+ vst1.64 {d4}, [r0], r2
+ vst1.64 {d5}, [r0], r2
+ vst1.64 {d6}, [r0], r2
+ vst1.64 {d7}, [r0], r2
+ bx lr
+ ENDP ; |vp9_iht8x8_64_add_neon|
+
+ END
--- a/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm
+++ /dev/null
@@ -1,198 +1,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
-
- EXPORT |vp9_idct16x16_1_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp9_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
-; int dest_stride)
-;
-; r0 int16_t input
-; r1 uint8_t *dest
-; r2 int dest_stride)
-
-|vp9_idct16x16_1_add_neon| PROC
- ldrsh r0, [r0]
-
- ; generate cospi_16_64 = 11585
- mov r12, #0x2d00
- add r12, #0x41
-
- ; out = dct_const_round_shift(input[0] * cospi_16_64)
- mul r0, r0, r12 ; input[0] * cospi_16_64
- add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
- asr r0, r0, #14 ; >> DCT_CONST_BITS
-
- ; out = dct_const_round_shift(out * cospi_16_64)
- mul r0, r0, r12 ; out * cospi_16_64
- mov r12, r1 ; save dest
- add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
- asr r0, r0, #14 ; >> DCT_CONST_BITS
-
- ; a1 = ROUND_POWER_OF_TWO(out, 6)
- add r0, r0, #32 ; + (1 <<((6) - 1))
- asr r0, r0, #6 ; >> 6
-
- vdup.s16 q0, r0 ; duplicate a1
- mov r0, #8
- sub r2, #8
-
- ; load destination data row0 - row3
- vld1.64 {d2}, [r1], r0
- vld1.64 {d3}, [r1], r2
- vld1.64 {d4}, [r1], r0
- vld1.64 {d5}, [r1], r2
- vld1.64 {d6}, [r1], r0
- vld1.64 {d7}, [r1], r2
- vld1.64 {d16}, [r1], r0
- vld1.64 {d17}, [r1], r2
-
- vaddw.u8 q9, q0, d2 ; dest[x] + a1
- vaddw.u8 q10, q0, d3 ; dest[x] + a1
- vaddw.u8 q11, q0, d4 ; dest[x] + a1
- vaddw.u8 q12, q0, d5 ; dest[x] + a1
- vqmovun.s16 d2, q9 ; clip_pixel
- vqmovun.s16 d3, q10 ; clip_pixel
- vqmovun.s16 d30, q11 ; clip_pixel
- vqmovun.s16 d31, q12 ; clip_pixel
- vst1.64 {d2}, [r12], r0
- vst1.64 {d3}, [r12], r2
- vst1.64 {d30}, [r12], r0
- vst1.64 {d31}, [r12], r2
-
- vaddw.u8 q9, q0, d6 ; dest[x] + a1
- vaddw.u8 q10, q0, d7 ; dest[x] + a1
- vaddw.u8 q11, q0, d16 ; dest[x] + a1
- vaddw.u8 q12, q0, d17 ; dest[x] + a1
- vqmovun.s16 d2, q9 ; clip_pixel
- vqmovun.s16 d3, q10 ; clip_pixel
- vqmovun.s16 d30, q11 ; clip_pixel
- vqmovun.s16 d31, q12 ; clip_pixel
- vst1.64 {d2}, [r12], r0
- vst1.64 {d3}, [r12], r2
- vst1.64 {d30}, [r12], r0
- vst1.64 {d31}, [r12], r2
-
- ; load destination data row4 - row7
- vld1.64 {d2}, [r1], r0
- vld1.64 {d3}, [r1], r2
- vld1.64 {d4}, [r1], r0
- vld1.64 {d5}, [r1], r2
- vld1.64 {d6}, [r1], r0
- vld1.64 {d7}, [r1], r2
- vld1.64 {d16}, [r1], r0
- vld1.64 {d17}, [r1], r2
-
- vaddw.u8 q9, q0, d2 ; dest[x] + a1
- vaddw.u8 q10, q0, d3 ; dest[x] + a1
- vaddw.u8 q11, q0, d4 ; dest[x] + a1
- vaddw.u8 q12, q0, d5 ; dest[x] + a1
- vqmovun.s16 d2, q9 ; clip_pixel
- vqmovun.s16 d3, q10 ; clip_pixel
- vqmovun.s16 d30, q11 ; clip_pixel
- vqmovun.s16 d31, q12 ; clip_pixel
- vst1.64 {d2}, [r12], r0
- vst1.64 {d3}, [r12], r2
- vst1.64 {d30}, [r12], r0
- vst1.64 {d31}, [r12], r2
-
- vaddw.u8 q9, q0, d6 ; dest[x] + a1
- vaddw.u8 q10, q0, d7 ; dest[x] + a1
- vaddw.u8 q11, q0, d16 ; dest[x] + a1
- vaddw.u8 q12, q0, d17 ; dest[x] + a1
- vqmovun.s16 d2, q9 ; clip_pixel
- vqmovun.s16 d3, q10 ; clip_pixel
- vqmovun.s16 d30, q11 ; clip_pixel
- vqmovun.s16 d31, q12 ; clip_pixel
- vst1.64 {d2}, [r12], r0
- vst1.64 {d3}, [r12], r2
- vst1.64 {d30}, [r12], r0
- vst1.64 {d31}, [r12], r2
-
- ; load destination data row8 - row11
- vld1.64 {d2}, [r1], r0
- vld1.64 {d3}, [r1], r2
- vld1.64 {d4}, [r1], r0
- vld1.64 {d5}, [r1], r2
- vld1.64 {d6}, [r1], r0
- vld1.64 {d7}, [r1], r2
- vld1.64 {d16}, [r1], r0
- vld1.64 {d17}, [r1], r2
-
- vaddw.u8 q9, q0, d2 ; dest[x] + a1
- vaddw.u8 q10, q0, d3 ; dest[x] + a1
- vaddw.u8 q11, q0, d4 ; dest[x] + a1
- vaddw.u8 q12, q0, d5 ; dest[x] + a1
- vqmovun.s16 d2, q9 ; clip_pixel
- vqmovun.s16 d3, q10 ; clip_pixel
- vqmovun.s16 d30, q11 ; clip_pixel
- vqmovun.s16 d31, q12 ; clip_pixel
- vst1.64 {d2}, [r12], r0
- vst1.64 {d3}, [r12], r2
- vst1.64 {d30}, [r12], r0
- vst1.64 {d31}, [r12], r2
-
- vaddw.u8 q9, q0, d6 ; dest[x] + a1
- vaddw.u8 q10, q0, d7 ; dest[x] + a1
- vaddw.u8 q11, q0, d16 ; dest[x] + a1
- vaddw.u8 q12, q0, d17 ; dest[x] + a1
- vqmovun.s16 d2, q9 ; clip_pixel
- vqmovun.s16 d3, q10 ; clip_pixel
- vqmovun.s16 d30, q11 ; clip_pixel
- vqmovun.s16 d31, q12 ; clip_pixel
- vst1.64 {d2}, [r12], r0
- vst1.64 {d3}, [r12], r2
- vst1.64 {d30}, [r12], r0
- vst1.64 {d31}, [r12], r2
-
- ; load destination data row12 - row15
- vld1.64 {d2}, [r1], r0
- vld1.64 {d3}, [r1], r2
- vld1.64 {d4}, [r1], r0
- vld1.64 {d5}, [r1], r2
- vld1.64 {d6}, [r1], r0
- vld1.64 {d7}, [r1], r2
- vld1.64 {d16}, [r1], r0
- vld1.64 {d17}, [r1], r2
-
- vaddw.u8 q9, q0, d2 ; dest[x] + a1
- vaddw.u8 q10, q0, d3 ; dest[x] + a1
- vaddw.u8 q11, q0, d4 ; dest[x] + a1
- vaddw.u8 q12, q0, d5 ; dest[x] + a1
- vqmovun.s16 d2, q9 ; clip_pixel
- vqmovun.s16 d3, q10 ; clip_pixel
- vqmovun.s16 d30, q11 ; clip_pixel
- vqmovun.s16 d31, q12 ; clip_pixel
- vst1.64 {d2}, [r12], r0
- vst1.64 {d3}, [r12], r2
- vst1.64 {d30}, [r12], r0
- vst1.64 {d31}, [r12], r2
-
- vaddw.u8 q9, q0, d6 ; dest[x] + a1
- vaddw.u8 q10, q0, d7 ; dest[x] + a1
- vaddw.u8 q11, q0, d16 ; dest[x] + a1
- vaddw.u8 q12, q0, d17 ; dest[x] + a1
- vqmovun.s16 d2, q9 ; clip_pixel
- vqmovun.s16 d3, q10 ; clip_pixel
- vqmovun.s16 d30, q11 ; clip_pixel
- vqmovun.s16 d31, q12 ; clip_pixel
- vst1.64 {d2}, [r12], r0
- vst1.64 {d3}, [r12], r2
- vst1.64 {d30}, [r12], r0
- vst1.64 {d31}, [r12], r2
-
- bx lr
- ENDP ; |vp9_idct16x16_1_add_neon|
-
- END
--- a/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
+++ /dev/null
@@ -1,1179 +1,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- EXPORT |vp9_idct16x16_256_add_neon_pass1|
- EXPORT |vp9_idct16x16_256_add_neon_pass2|
- EXPORT |vp9_idct16x16_10_add_neon_pass1|
- EXPORT |vp9_idct16x16_10_add_neon_pass2|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
- ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15.
- MACRO
- TRANSPOSE8X8
- vswp d17, d24
- vswp d23, d30
- vswp d21, d28
- vswp d19, d26
- vtrn.32 q8, q10
- vtrn.32 q9, q11
- vtrn.32 q12, q14
- vtrn.32 q13, q15
- vtrn.16 q8, q9
- vtrn.16 q10, q11
- vtrn.16 q12, q13
- vtrn.16 q14, q15
- MEND
-
- AREA Block, CODE, READONLY ; name this block of code
-;void |vp9_idct16x16_256_add_neon_pass1|(int16_t *input,
-; int16_t *output, int output_stride)
-;
-; r0 int16_t input
-; r1 int16_t *output
-; r2 int output_stride)
-
-; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
-; will be stored back into q8-q15 registers. This function will touch q0-q7
-; registers and use them as buffer during calculation.
-|vp9_idct16x16_256_add_neon_pass1| PROC
-
- ; TODO(hkuang): Find a better way to load the elements.
- ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
- vld2.s16 {q8,q9}, [r0]!
- vld2.s16 {q9,q10}, [r0]!
- vld2.s16 {q10,q11}, [r0]!
- vld2.s16 {q11,q12}, [r0]!
- vld2.s16 {q12,q13}, [r0]!
- vld2.s16 {q13,q14}, [r0]!
- vld2.s16 {q14,q15}, [r0]!
- vld2.s16 {q1,q2}, [r0]!
- vmov.s16 q15, q1
-
- ; generate cospi_28_64 = 3196
- mov r3, #0xc00
- add r3, #0x7c
-
- ; generate cospi_4_64 = 16069
- mov r12, #0x3e00
- add r12, #0xc5
-
- ; transpose the input data
- TRANSPOSE8X8
-
- ; stage 3
- vdup.16 d0, r3 ; duplicate cospi_28_64
- vdup.16 d1, r12 ; duplicate cospi_4_64
-
- ; preloading to avoid stall
- ; generate cospi_12_64 = 13623
- mov r3, #0x3500
- add r3, #0x37
-
- ; generate cospi_20_64 = 9102
- mov r12, #0x2300
- add r12, #0x8e
-
- ; step2[4] * cospi_28_64
- vmull.s16 q2, d18, d0
- vmull.s16 q3, d19, d0
-
- ; step2[4] * cospi_4_64
- vmull.s16 q5, d18, d1
- vmull.s16 q6, d19, d1
-
- ; temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64
- vmlsl.s16 q2, d30, d1
- vmlsl.s16 q3, d31, d1
-
- ; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64
- vmlal.s16 q5, d30, d0
- vmlal.s16 q6, d31, d0
-
- vdup.16 d2, r3 ; duplicate cospi_12_64
- vdup.16 d3, r12 ; duplicate cospi_20_64
-
- ; dct_const_round_shift(temp1)
- vqrshrn.s32 d8, q2, #14 ; >> 14
- vqrshrn.s32 d9, q3, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vqrshrn.s32 d14, q5, #14 ; >> 14
- vqrshrn.s32 d15, q6, #14 ; >> 14
-
- ; preloading to avoid stall
- ; generate cospi_16_64 = 11585
- mov r3, #0x2d00
- add r3, #0x41
-
- ; generate cospi_24_64 = 6270
- mov r12, #0x1800
- add r12, #0x7e
-
- ; step2[5] * cospi_12_64
- vmull.s16 q2, d26, d2
- vmull.s16 q3, d27, d2
-
- ; step2[5] * cospi_20_64
- vmull.s16 q9, d26, d3
- vmull.s16 q15, d27, d3
-
- ; temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64
- vmlsl.s16 q2, d22, d3
- vmlsl.s16 q3, d23, d3
-
- ; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64
- vmlal.s16 q9, d22, d2
- vmlal.s16 q15, d23, d2
-
- ; dct_const_round_shift(temp1)
- vqrshrn.s32 d10, q2, #14 ; >> 14
- vqrshrn.s32 d11, q3, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vqrshrn.s32 d12, q9, #14 ; >> 14
- vqrshrn.s32 d13, q15, #14 ; >> 14
-
- ; stage 4
- vdup.16 d30, r3 ; cospi_16_64
-
- ; step1[0] * cospi_16_64
- vmull.s16 q2, d16, d30
- vmull.s16 q11, d17, d30
-
- ; step1[1] * cospi_16_64
- vmull.s16 q0, d24, d30
- vmull.s16 q1, d25, d30
-
- ; generate cospi_8_64 = 15137
- mov r3, #0x3b00
- add r3, #0x21
-
- vdup.16 d30, r12 ; duplicate cospi_24_64
- vdup.16 d31, r3 ; duplicate cospi_8_64
-
- ; temp1 = (step1[0] + step1[1]) * cospi_16_64
- vadd.s32 q3, q2, q0
- vadd.s32 q12, q11, q1
-
- ; temp2 = (step1[0] - step1[1]) * cospi_16_64
- vsub.s32 q13, q2, q0
- vsub.s32 q1, q11, q1
-
- ; dct_const_round_shift(temp1)
- vqrshrn.s32 d16, q3, #14 ; >> 14
- vqrshrn.s32 d17, q12, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vqrshrn.s32 d18, q13, #14 ; >> 14
- vqrshrn.s32 d19, q1, #14 ; >> 14
-
- ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
- ; step1[2] * cospi_8_64
- vmull.s16 q0, d20, d31
- vmull.s16 q1, d21, d31
-
- ; step1[2] * cospi_24_64
- vmull.s16 q12, d20, d30
- vmull.s16 q13, d21, d30
-
- ; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64
- vmlal.s16 q0, d28, d30
- vmlal.s16 q1, d29, d30
-
- ; temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64
- vmlsl.s16 q12, d28, d31
- vmlsl.s16 q13, d29, d31
-
- ; dct_const_round_shift(temp2)
- vqrshrn.s32 d22, q0, #14 ; >> 14
- vqrshrn.s32 d23, q1, #14 ; >> 14
-
- ; dct_const_round_shift(temp1)
- vqrshrn.s32 d20, q12, #14 ; >> 14
- vqrshrn.s32 d21, q13, #14 ; >> 14
-
- vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5];
- vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5];
- vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7];
- vadd.s16 q15, q6, q7 ; step2[7] = step1[6] + step1[7];
-
- ; generate cospi_16_64 = 11585
- mov r3, #0x2d00
- add r3, #0x41
-
- ; stage 5
- vadd.s16 q0, q8, q11 ; step1[0] = step2[0] + step2[3];
- vadd.s16 q1, q9, q10 ; step1[1] = step2[1] + step2[2];
- vsub.s16 q2, q9, q10 ; step1[2] = step2[1] - step2[2];
- vsub.s16 q3, q8, q11 ; step1[3] = step2[0] - step2[3];
-
- vdup.16 d16, r3; ; duplicate cospi_16_64
-
- ; step2[5] * cospi_16_64
- vmull.s16 q11, d26, d16
- vmull.s16 q12, d27, d16
-
- ; step2[6] * cospi_16_64
- vmull.s16 q9, d28, d16
- vmull.s16 q10, d29, d16
-
- ; temp1 = (step2[6] - step2[5]) * cospi_16_64
- vsub.s32 q6, q9, q11
- vsub.s32 q13, q10, q12
-
- ; temp2 = (step2[5] + step2[6]) * cospi_16_64
- vadd.s32 q9, q9, q11
- vadd.s32 q10, q10, q12
-
- ; dct_const_round_shift(temp1)
- vqrshrn.s32 d10, q6, #14 ; >> 14
- vqrshrn.s32 d11, q13, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vqrshrn.s32 d12, q9, #14 ; >> 14
- vqrshrn.s32 d13, q10, #14 ; >> 14
-
- ; stage 6
- vadd.s16 q8, q0, q15 ; step2[0] = step1[0] + step1[7];
- vadd.s16 q9, q1, q6 ; step2[1] = step1[1] + step1[6];
- vadd.s16 q10, q2, q5 ; step2[2] = step1[2] + step1[5];
- vadd.s16 q11, q3, q4 ; step2[3] = step1[3] + step1[4];
- vsub.s16 q12, q3, q4 ; step2[4] = step1[3] - step1[4];
- vsub.s16 q13, q2, q5 ; step2[5] = step1[2] - step1[5];
- vsub.s16 q14, q1, q6 ; step2[6] = step1[1] - step1[6];
- vsub.s16 q15, q0, q15 ; step2[7] = step1[0] - step1[7];
-
- ; store the data
- vst1.64 {d16}, [r1], r2
- vst1.64 {d17}, [r1], r2
- vst1.64 {d18}, [r1], r2
- vst1.64 {d19}, [r1], r2
- vst1.64 {d20}, [r1], r2
- vst1.64 {d21}, [r1], r2
- vst1.64 {d22}, [r1], r2
- vst1.64 {d23}, [r1], r2
- vst1.64 {d24}, [r1], r2
- vst1.64 {d25}, [r1], r2
- vst1.64 {d26}, [r1], r2
- vst1.64 {d27}, [r1], r2
- vst1.64 {d28}, [r1], r2
- vst1.64 {d29}, [r1], r2
- vst1.64 {d30}, [r1], r2
- vst1.64 {d31}, [r1], r2
-
- bx lr
- ENDP ; |vp9_idct16x16_256_add_neon_pass1|
-
-;void vp9_idct16x16_256_add_neon_pass2(int16_t *src,
-; int16_t *output,
-; int16_t *pass1Output,
-; int16_t skip_adding,
-; uint8_t *dest,
-; int dest_stride)
-;
-; r0 int16_t *src
-; r1 int16_t *output,
-; r2 int16_t *pass1Output,
-; r3 int16_t skip_adding,
-; r4 uint8_t *dest,
-; r5 int dest_stride)
-
-; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
-; will be stored back into q8-q15 registers. This function will touch q0-q7
-; registers and use them as buffer during calculation.
-|vp9_idct16x16_256_add_neon_pass2| PROC
- push {r3-r9}
-
- ; TODO(hkuang): Find a better way to load the elements.
- ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15
- vld2.s16 {q8,q9}, [r0]!
- vld2.s16 {q9,q10}, [r0]!
- vld2.s16 {q10,q11}, [r0]!
- vld2.s16 {q11,q12}, [r0]!
- vld2.s16 {q12,q13}, [r0]!
- vld2.s16 {q13,q14}, [r0]!
- vld2.s16 {q14,q15}, [r0]!
- vld2.s16 {q0,q1}, [r0]!
- vmov.s16 q15, q0;
-
- ; generate cospi_30_64 = 1606
- mov r3, #0x0600
- add r3, #0x46
-
- ; generate cospi_2_64 = 16305
- mov r12, #0x3f00
- add r12, #0xb1
-
- ; transpose the input data
- TRANSPOSE8X8
-
- ; stage 3
- vdup.16 d12, r3 ; duplicate cospi_30_64
- vdup.16 d13, r12 ; duplicate cospi_2_64
-
- ; preloading to avoid stall
- ; generate cospi_14_64 = 12665
- mov r3, #0x3100
- add r3, #0x79
-
- ; generate cospi_18_64 = 10394
- mov r12, #0x2800
- add r12, #0x9a
-
- ; step1[8] * cospi_30_64
- vmull.s16 q2, d16, d12
- vmull.s16 q3, d17, d12
-
- ; step1[8] * cospi_2_64
- vmull.s16 q1, d16, d13
- vmull.s16 q4, d17, d13
-
- ; temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64
- vmlsl.s16 q2, d30, d13
- vmlsl.s16 q3, d31, d13
-
- ; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64
- vmlal.s16 q1, d30, d12
- vmlal.s16 q4, d31, d12
-
- vdup.16 d30, r3 ; duplicate cospi_14_64
- vdup.16 d31, r12 ; duplicate cospi_18_64
-
- ; dct_const_round_shift(temp1)
- vqrshrn.s32 d0, q2, #14 ; >> 14
- vqrshrn.s32 d1, q3, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vqrshrn.s32 d14, q1, #14 ; >> 14
- vqrshrn.s32 d15, q4, #14 ; >> 14
-
- ; preloading to avoid stall
- ; generate cospi_22_64 = 7723
- mov r3, #0x1e00
- add r3, #0x2b
-
- ; generate cospi_10_64 = 14449
- mov r12, #0x3800
- add r12, #0x71
-
- ; step1[9] * cospi_14_64
- vmull.s16 q2, d24, d30
- vmull.s16 q3, d25, d30
-
- ; step1[9] * cospi_18_64
- vmull.s16 q4, d24, d31
- vmull.s16 q5, d25, d31
-
- ; temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64
- vmlsl.s16 q2, d22, d31
- vmlsl.s16 q3, d23, d31
-
- ; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64
- vmlal.s16 q4, d22, d30
- vmlal.s16 q5, d23, d30
-
- vdup.16 d30, r3 ; duplicate cospi_22_64
- vdup.16 d31, r12 ; duplicate cospi_10_64
-
- ; dct_const_round_shift(temp1)
- vqrshrn.s32 d2, q2, #14 ; >> 14
- vqrshrn.s32 d3, q3, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vqrshrn.s32 d12, q4, #14 ; >> 14
- vqrshrn.s32 d13, q5, #14 ; >> 14
-
- ; step1[10] * cospi_22_64
- vmull.s16 q11, d20, d30
- vmull.s16 q12, d21, d30
-
- ; step1[10] * cospi_10_64
- vmull.s16 q4, d20, d31
- vmull.s16 q5, d21, d31
-
- ; temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64
- vmlsl.s16 q11, d26, d31
- vmlsl.s16 q12, d27, d31
-
- ; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64
- vmlal.s16 q4, d26, d30
- vmlal.s16 q5, d27, d30
-
- ; preloading to avoid stall
- ; generate cospi_6_64 = 15679
- mov r3, #0x3d00
- add r3, #0x3f
-
- ; generate cospi_26_64 = 4756
- mov r12, #0x1200
- add r12, #0x94
-
- vdup.16 d30, r3 ; duplicate cospi_6_64
- vdup.16 d31, r12 ; duplicate cospi_26_64
-
- ; dct_const_round_shift(temp1)
- vqrshrn.s32 d4, q11, #14 ; >> 14
- vqrshrn.s32 d5, q12, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vqrshrn.s32 d11, q5, #14 ; >> 14
- vqrshrn.s32 d10, q4, #14 ; >> 14
-
- ; step1[11] * cospi_6_64
- vmull.s16 q10, d28, d30
- vmull.s16 q11, d29, d30
-
- ; step1[11] * cospi_26_64
- vmull.s16 q12, d28, d31
- vmull.s16 q13, d29, d31
-
- ; temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64
- vmlsl.s16 q10, d18, d31
- vmlsl.s16 q11, d19, d31
-
- ; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64
- vmlal.s16 q12, d18, d30
- vmlal.s16 q13, d19, d30
-
- vsub.s16 q9, q0, q1 ; step1[9]=step2[8]-step2[9]
- vadd.s16 q0, q0, q1 ; step1[8]=step2[8]+step2[9]
-
- ; dct_const_round_shift(temp1)
- vqrshrn.s32 d6, q10, #14 ; >> 14
- vqrshrn.s32 d7, q11, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vqrshrn.s32 d8, q12, #14 ; >> 14
- vqrshrn.s32 d9, q13, #14 ; >> 14
-
- ; stage 3
- vsub.s16 q10, q3, q2 ; step1[10]=-step2[10]+step2[11]
- vadd.s16 q11, q2, q3 ; step1[11]=step2[10]+step2[11]
- vadd.s16 q12, q4, q5 ; step1[12]=step2[12]+step2[13]
- vsub.s16 q13, q4, q5 ; step1[13]=step2[12]-step2[13]
- vsub.s16 q14, q7, q6 ; step1[14]=-step2[14]+tep2[15]
- vadd.s16 q7, q6, q7 ; step1[15]=step2[14]+step2[15]
-
- ; stage 4
- ; generate cospi_24_64 = 6270
- mov r3, #0x1800
- add r3, #0x7e
-
- ; generate cospi_8_64 = 15137
- mov r12, #0x3b00
- add r12, #0x21
-
- ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
- vdup.16 d30, r12 ; duplicate cospi_8_64
- vdup.16 d31, r3 ; duplicate cospi_24_64
-
- ; step1[9] * cospi_24_64
- vmull.s16 q2, d18, d31
- vmull.s16 q3, d19, d31
-
- ; step1[14] * cospi_24_64
- vmull.s16 q4, d28, d31
- vmull.s16 q5, d29, d31
-
- ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64
- vmlal.s16 q2, d28, d30
- vmlal.s16 q3, d29, d30
-
- ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
- vmlsl.s16 q4, d18, d30
- vmlsl.s16 q5, d19, d30
-
- rsb r12, #0
- vdup.16 d30, r12 ; duplicate -cospi_8_64
-
- ; dct_const_round_shift(temp2)
- vqrshrn.s32 d12, q2, #14 ; >> 14
- vqrshrn.s32 d13, q3, #14 ; >> 14
-
- ; dct_const_round_shift(temp1)
- vqrshrn.s32 d2, q4, #14 ; >> 14
- vqrshrn.s32 d3, q5, #14 ; >> 14
-
- vmov.s16 q3, q11
- vmov.s16 q4, q12
-
- ; - step1[13] * cospi_8_64
- vmull.s16 q11, d26, d30
- vmull.s16 q12, d27, d30
-
- ; -step1[10] * cospi_8_64
- vmull.s16 q8, d20, d30
- vmull.s16 q9, d21, d30
-
- ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
- vmlsl.s16 q11, d20, d31
- vmlsl.s16 q12, d21, d31
-
- ; temp1 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
- vmlal.s16 q8, d26, d31
- vmlal.s16 q9, d27, d31
-
- ; dct_const_round_shift(temp2)
- vqrshrn.s32 d4, q11, #14 ; >> 14
- vqrshrn.s32 d5, q12, #14 ; >> 14
-
- ; dct_const_round_shift(temp1)
- vqrshrn.s32 d10, q8, #14 ; >> 14
- vqrshrn.s32 d11, q9, #14 ; >> 14
-
- ; stage 5
- vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11];
- vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10];
- vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10];
- vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11];
- vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15];
- vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14];
- vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14];
- vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15];
-
- ; stage 6.
- ; generate cospi_16_64 = 11585
- mov r12, #0x2d00
- add r12, #0x41
-
- vdup.16 d14, r12 ; duplicate cospi_16_64
-
- ; step1[13] * cospi_16_64
- vmull.s16 q3, d26, d14
- vmull.s16 q4, d27, d14
-
- ; step1[10] * cospi_16_64
- vmull.s16 q0, d20, d14
- vmull.s16 q1, d21, d14
-
- ; temp1 = (-step1[10] + step1[13]) * cospi_16_64
- vsub.s32 q5, q3, q0
- vsub.s32 q6, q4, q1
-
- ; temp2 = (step1[10] + step1[13]) * cospi_16_64
- vadd.s32 q10, q3, q0
- vadd.s32 q4, q4, q1
-
- ; dct_const_round_shift(temp1)
- vqrshrn.s32 d4, q5, #14 ; >> 14
- vqrshrn.s32 d5, q6, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vqrshrn.s32 d10, q10, #14 ; >> 14
- vqrshrn.s32 d11, q4, #14 ; >> 14
-
- ; step1[11] * cospi_16_64
- vmull.s16 q0, d22, d14
- vmull.s16 q1, d23, d14
-
- ; step1[12] * cospi_16_64
- vmull.s16 q13, d24, d14
- vmull.s16 q6, d25, d14
-
- ; temp1 = (-step1[11] + step1[12]) * cospi_16_64
- vsub.s32 q10, q13, q0
- vsub.s32 q4, q6, q1
-
- ; temp2 = (step1[11] + step1[12]) * cospi_16_64
- vadd.s32 q13, q13, q0
- vadd.s32 q6, q6, q1
-
- ; dct_const_round_shift(temp1)
- vqrshrn.s32 d6, q10, #14 ; >> 14
- vqrshrn.s32 d7, q4, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vqrshrn.s32 d8, q13, #14 ; >> 14
- vqrshrn.s32 d9, q6, #14 ; >> 14
-
- mov r4, #16 ; pass1Output stride
- ldr r3, [sp] ; load skip_adding
- cmp r3, #0 ; check if need adding dest data
- beq skip_adding_dest
-
- ldr r7, [sp, #28] ; dest used to save element 0-7
- mov r9, r7 ; save dest pointer for later use
- ldr r8, [sp, #32] ; load dest_stride
-
- ; stage 7
- ; load the data in pass1
- vld1.s16 {q0}, [r2], r4 ; load data step2[0]
- vld1.s16 {q1}, [r2], r4 ; load data step2[1]
- vld1.s16 {q10}, [r2], r4 ; load data step2[2]
- vld1.s16 {q11}, [r2], r4 ; load data step2[3]
- vld1.64 {d12}, [r7], r8 ; load destinatoin data
- vld1.64 {d13}, [r7], r8 ; load destinatoin data
- vadd.s16 q12, q0, q15 ; step2[0] + step2[15]
- vadd.s16 q13, q1, q14 ; step2[1] + step2[14]
- vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO
- vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO
- vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i]
- vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i]
- vqmovun.s16 d12, q12 ; clip pixel
- vqmovun.s16 d13, q13 ; clip pixel
- vst1.64 {d12}, [r9], r8 ; store the data
- vst1.64 {d13}, [r9], r8 ; store the data
- vsub.s16 q14, q1, q14 ; step2[1] - step2[14]
- vsub.s16 q15, q0, q15 ; step2[0] - step2[15]
- vld1.64 {d12}, [r7], r8 ; load destinatoin data
- vld1.64 {d13}, [r7], r8 ; load destinatoin data
- vadd.s16 q12, q10, q5 ; step2[2] + step2[13]
- vadd.s16 q13, q11, q4 ; step2[3] + step2[12]
- vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO
- vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO
- vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i]
- vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i]
- vqmovun.s16 d12, q12 ; clip pixel
- vqmovun.s16 d13, q13 ; clip pixel
- vst1.64 {d12}, [r9], r8 ; store the data
- vst1.64 {d13}, [r9], r8 ; store the data
- vsub.s16 q4, q11, q4 ; step2[3] - step2[12]
- vsub.s16 q5, q10, q5 ; step2[2] - step2[13]
- vld1.s16 {q0}, [r2], r4 ; load data step2[4]
- vld1.s16 {q1}, [r2], r4 ; load data step2[5]
- vld1.s16 {q10}, [r2], r4 ; load data step2[6]
- vld1.s16 {q11}, [r2], r4 ; load data step2[7]
- vld1.64 {d12}, [r7], r8 ; load destinatoin data
- vld1.64 {d13}, [r7], r8 ; load destinatoin data
- vadd.s16 q12, q0, q3 ; step2[4] + step2[11]
- vadd.s16 q13, q1, q2 ; step2[5] + step2[10]
- vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO
- vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO
- vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i]
- vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i]
- vqmovun.s16 d12, q12 ; clip pixel
- vqmovun.s16 d13, q13 ; clip pixel
- vst1.64 {d12}, [r9], r8 ; store the data
- vst1.64 {d13}, [r9], r8 ; store the data
- vsub.s16 q2, q1, q2 ; step2[5] - step2[10]
- vsub.s16 q3, q0, q3 ; step2[4] - step2[11]
- vld1.64 {d12}, [r7], r8 ; load destinatoin data
- vld1.64 {d13}, [r7], r8 ; load destinatoin data
- vadd.s16 q12, q10, q9 ; step2[6] + step2[9]
- vadd.s16 q13, q11, q8 ; step2[7] + step2[8]
- vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO
- vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO
- vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i]
- vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i]
- vqmovun.s16 d12, q12 ; clip pixel
- vqmovun.s16 d13, q13 ; clip pixel
- vst1.64 {d12}, [r9], r8 ; store the data
- vst1.64 {d13}, [r9], r8 ; store the data
- vld1.64 {d12}, [r7], r8 ; load destinatoin data
- vld1.64 {d13}, [r7], r8 ; load destinatoin data
- vsub.s16 q8, q11, q8 ; step2[7] - step2[8]
- vsub.s16 q9, q10, q9 ; step2[6] - step2[9]
-
- ; store the data output 8,9,10,11,12,13,14,15
- vrshr.s16 q8, q8, #6 ; ROUND_POWER_OF_TWO
- vaddw.u8 q8, q8, d12 ; + dest[j * dest_stride + i]
- vqmovun.s16 d12, q8 ; clip pixel
- vst1.64 {d12}, [r9], r8 ; store the data
- vld1.64 {d12}, [r7], r8 ; load destinatoin data
- vrshr.s16 q9, q9, #6
- vaddw.u8 q9, q9, d13 ; + dest[j * dest_stride + i]
- vqmovun.s16 d13, q9 ; clip pixel
- vst1.64 {d13}, [r9], r8 ; store the data
- vld1.64 {d13}, [r7], r8 ; load destinatoin data
- vrshr.s16 q2, q2, #6
- vaddw.u8 q2, q2, d12 ; + dest[j * dest_stride + i]
- vqmovun.s16 d12, q2 ; clip pixel
- vst1.64 {d12}, [r9], r8 ; store the data
- vld1.64 {d12}, [r7], r8 ; load destinatoin data
- vrshr.s16 q3, q3, #6
- vaddw.u8 q3, q3, d13 ; + dest[j * dest_stride + i]
- vqmovun.s16 d13, q3 ; clip pixel
- vst1.64 {d13}, [r9], r8 ; store the data
- vld1.64 {d13}, [r7], r8 ; load destinatoin data
- vrshr.s16 q4, q4, #6
- vaddw.u8 q4, q4, d12 ; + dest[j * dest_stride + i]
- vqmovun.s16 d12, q4 ; clip pixel
- vst1.64 {d12}, [r9], r8 ; store the data
- vld1.64 {d12}, [r7], r8 ; load destinatoin data
- vrshr.s16 q5, q5, #6
- vaddw.u8 q5, q5, d13 ; + dest[j * dest_stride + i]
- vqmovun.s16 d13, q5 ; clip pixel
- vst1.64 {d13}, [r9], r8 ; store the data
- vld1.64 {d13}, [r7], r8 ; load destinatoin data
- vrshr.s16 q14, q14, #6
- vaddw.u8 q14, q14, d12 ; + dest[j * dest_stride + i]
- vqmovun.s16 d12, q14 ; clip pixel
- vst1.64 {d12}, [r9], r8 ; store the data
- vld1.64 {d12}, [r7], r8 ; load destinatoin data
- vrshr.s16 q15, q15, #6
- vaddw.u8 q15, q15, d13 ; + dest[j * dest_stride + i]
- vqmovun.s16 d13, q15 ; clip pixel
- vst1.64 {d13}, [r9], r8 ; store the data
- b end_idct16x16_pass2
-
-skip_adding_dest
- ; stage 7
- ; load the data in pass1
- mov r5, #24
- mov r3, #8
-
- vld1.s16 {q0}, [r2], r4 ; load data step2[0]
- vld1.s16 {q1}, [r2], r4 ; load data step2[1]
- vadd.s16 q12, q0, q15 ; step2[0] + step2[15]
- vadd.s16 q13, q1, q14 ; step2[1] + step2[14]
- vld1.s16 {q10}, [r2], r4 ; load data step2[2]
- vld1.s16 {q11}, [r2], r4 ; load data step2[3]
- vst1.64 {d24}, [r1], r3 ; store output[0]
- vst1.64 {d25}, [r1], r5
- vst1.64 {d26}, [r1], r3 ; store output[1]
- vst1.64 {d27}, [r1], r5
- vadd.s16 q12, q10, q5 ; step2[2] + step2[13]
- vadd.s16 q13, q11, q4 ; step2[3] + step2[12]
- vsub.s16 q14, q1, q14 ; step2[1] - step2[14]
- vsub.s16 q15, q0, q15 ; step2[0] - step2[15]
- vst1.64 {d24}, [r1], r3 ; store output[2]
- vst1.64 {d25}, [r1], r5
- vst1.64 {d26}, [r1], r3 ; store output[3]
- vst1.64 {d27}, [r1], r5
- vsub.s16 q4, q11, q4 ; step2[3] - step2[12]
- vsub.s16 q5, q10, q5 ; step2[2] - step2[13]
- vld1.s16 {q0}, [r2], r4 ; load data step2[4]
- vld1.s16 {q1}, [r2], r4 ; load data step2[5]
- vadd.s16 q12, q0, q3 ; step2[4] + step2[11]
- vadd.s16 q13, q1, q2 ; step2[5] + step2[10]
- vld1.s16 {q10}, [r2], r4 ; load data step2[6]
- vld1.s16 {q11}, [r2], r4 ; load data step2[7]
- vst1.64 {d24}, [r1], r3 ; store output[4]
- vst1.64 {d25}, [r1], r5
- vst1.64 {d26}, [r1], r3 ; store output[5]
- vst1.64 {d27}, [r1], r5
- vadd.s16 q12, q10, q9 ; step2[6] + step2[9]
- vadd.s16 q13, q11, q8 ; step2[7] + step2[8]
- vsub.s16 q2, q1, q2 ; step2[5] - step2[10]
- vsub.s16 q3, q0, q3 ; step2[4] - step2[11]
- vsub.s16 q8, q11, q8 ; step2[7] - step2[8]
- vsub.s16 q9, q10, q9 ; step2[6] - step2[9]
- vst1.64 {d24}, [r1], r3 ; store output[6]
- vst1.64 {d25}, [r1], r5
- vst1.64 {d26}, [r1], r3 ; store output[7]
- vst1.64 {d27}, [r1], r5
-
- ; store the data output 8,9,10,11,12,13,14,15
- vst1.64 {d16}, [r1], r3
- vst1.64 {d17}, [r1], r5
- vst1.64 {d18}, [r1], r3
- vst1.64 {d19}, [r1], r5
- vst1.64 {d4}, [r1], r3
- vst1.64 {d5}, [r1], r5
- vst1.64 {d6}, [r1], r3
- vst1.64 {d7}, [r1], r5
- vst1.64 {d8}, [r1], r3
- vst1.64 {d9}, [r1], r5
- vst1.64 {d10}, [r1], r3
- vst1.64 {d11}, [r1], r5
- vst1.64 {d28}, [r1], r3
- vst1.64 {d29}, [r1], r5
- vst1.64 {d30}, [r1], r3
- vst1.64 {d31}, [r1], r5
-end_idct16x16_pass2
- pop {r3-r9}
- bx lr
- ENDP ; |vp9_idct16x16_256_add_neon_pass2|
-
-;void |vp9_idct16x16_10_add_neon_pass1|(int16_t *input,
-; int16_t *output, int output_stride)
-;
-; r0 int16_t input
-; r1 int16_t *output
-; r2 int output_stride)
-
-; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
-; will be stored back into q8-q15 registers. This function will touch q0-q7
-; registers and use them as buffer during calculation.
-|vp9_idct16x16_10_add_neon_pass1| PROC
-
- ; TODO(hkuang): Find a better way to load the elements.
- ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
- vld2.s16 {q8,q9}, [r0]!
- vld2.s16 {q9,q10}, [r0]!
- vld2.s16 {q10,q11}, [r0]!
- vld2.s16 {q11,q12}, [r0]!
- vld2.s16 {q12,q13}, [r0]!
- vld2.s16 {q13,q14}, [r0]!
- vld2.s16 {q14,q15}, [r0]!
- vld2.s16 {q1,q2}, [r0]!
- vmov.s16 q15, q1
-
- ; generate cospi_28_64*2 = 6392
- mov r3, #0x1800
- add r3, #0xf8
-
- ; generate cospi_4_64*2 = 32138
- mov r12, #0x7d00
- add r12, #0x8a
-
- ; transpose the input data
- TRANSPOSE8X8
-
- ; stage 3
- vdup.16 q0, r3 ; duplicate cospi_28_64*2
- vdup.16 q1, r12 ; duplicate cospi_4_64*2
-
- ; The following instructions use vqrdmulh to do the
- ; dct_const_round_shift(step2[4] * cospi_28_64). vvqrdmulh will multiply,
- ; double, and return the high 16 bits, effectively giving >> 15. Doubling
- ; the constant will change this to >> 14.
- ; dct_const_round_shift(step2[4] * cospi_28_64);
- vqrdmulh.s16 q4, q9, q0
-
- ; preloading to avoid stall
- ; generate cospi_16_64*2 = 23170
- mov r3, #0x5a00
- add r3, #0x82
-
- ; dct_const_round_shift(step2[4] * cospi_4_64);
- vqrdmulh.s16 q7, q9, q1
-
- ; stage 4
- vdup.16 q1, r3 ; cospi_16_64*2
-
- ; generate cospi_16_64 = 11585
- mov r3, #0x2d00
- add r3, #0x41
-
- vdup.16 d4, r3; ; duplicate cospi_16_64
-
- ; dct_const_round_shift(step1[0] * cospi_16_64)
- vqrdmulh.s16 q8, q8, q1
-
- ; step2[6] * cospi_16_64
- vmull.s16 q9, d14, d4
- vmull.s16 q10, d15, d4
-
- ; step2[5] * cospi_16_64
- vmull.s16 q12, d9, d4
- vmull.s16 q11, d8, d4
-
- ; temp1 = (step2[6] - step2[5]) * cospi_16_64
- vsub.s32 q15, q10, q12
- vsub.s32 q6, q9, q11
-
- ; temp2 = (step2[5] + step2[6]) * cospi_16_64
- vadd.s32 q9, q9, q11
- vadd.s32 q10, q10, q12
-
- ; dct_const_round_shift(temp1)
- vqrshrn.s32 d11, q15, #14 ; >> 14
- vqrshrn.s32 d10, q6, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vqrshrn.s32 d12, q9, #14 ; >> 14
- vqrshrn.s32 d13, q10, #14 ; >> 14
-
- ; stage 6
- vadd.s16 q2, q8, q7 ; step2[0] = step1[0] + step1[7];
- vadd.s16 q10, q8, q5 ; step2[2] = step1[2] + step1[5];
- vadd.s16 q11, q8, q4 ; step2[3] = step1[3] + step1[4];
- vadd.s16 q9, q8, q6 ; step2[1] = step1[1] + step1[6];
- vsub.s16 q12, q8, q4 ; step2[4] = step1[3] - step1[4];
- vsub.s16 q13, q8, q5 ; step2[5] = step1[2] - step1[5];
- vsub.s16 q14, q8, q6 ; step2[6] = step1[1] - step1[6];
- vsub.s16 q15, q8, q7 ; step2[7] = step1[0] - step1[7];
-
- ; store the data
- vst1.64 {d4}, [r1], r2
- vst1.64 {d5}, [r1], r2
- vst1.64 {d18}, [r1], r2
- vst1.64 {d19}, [r1], r2
- vst1.64 {d20}, [r1], r2
- vst1.64 {d21}, [r1], r2
- vst1.64 {d22}, [r1], r2
- vst1.64 {d23}, [r1], r2
- vst1.64 {d24}, [r1], r2
- vst1.64 {d25}, [r1], r2
- vst1.64 {d26}, [r1], r2
- vst1.64 {d27}, [r1], r2
- vst1.64 {d28}, [r1], r2
- vst1.64 {d29}, [r1], r2
- vst1.64 {d30}, [r1], r2
- vst1.64 {d31}, [r1], r2
-
- bx lr
- ENDP ; |vp9_idct16x16_10_add_neon_pass1|
-
-;void vp9_idct16x16_10_add_neon_pass2(int16_t *src,
-; int16_t *output,
-; int16_t *pass1Output,
-; int16_t skip_adding,
-; uint8_t *dest,
-; int dest_stride)
-;
-; r0 int16_t *src
-; r1 int16_t *output,
-; r2 int16_t *pass1Output,
-; r3 int16_t skip_adding,
-; r4 uint8_t *dest,
-; r5 int dest_stride)
-
-; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
-; will be stored back into q8-q15 registers. This function will touch q0-q7
-; registers and use them as buffer during calculation.
-|vp9_idct16x16_10_add_neon_pass2| PROC
- push {r3-r9}
-
- ; TODO(hkuang): Find a better way to load the elements.
- ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15
- vld2.s16 {q8,q9}, [r0]!
- vld2.s16 {q9,q10}, [r0]!
- vld2.s16 {q10,q11}, [r0]!
- vld2.s16 {q11,q12}, [r0]!
- vld2.s16 {q12,q13}, [r0]!
- vld2.s16 {q13,q14}, [r0]!
- vld2.s16 {q14,q15}, [r0]!
- vld2.s16 {q0,q1}, [r0]!
- vmov.s16 q15, q0;
-
- ; generate 2*cospi_30_64 = 3212
- mov r3, #0xc00
- add r3, #0x8c
-
- ; generate 2*cospi_2_64 = 32610
- mov r12, #0x7f00
- add r12, #0x62
-
- ; transpose the input data
- TRANSPOSE8X8
-
- ; stage 3
- vdup.16 q6, r3 ; duplicate 2*cospi_30_64
-
- ; dct_const_round_shift(step1[8] * cospi_30_64)
- vqrdmulh.s16 q0, q8, q6
-
- vdup.16 q6, r12 ; duplicate 2*cospi_2_64
-
- ; dct_const_round_shift(step1[8] * cospi_2_64)
- vqrdmulh.s16 q7, q8, q6
-
- ; preloading to avoid stall
- ; generate 2*cospi_26_64 = 9512
- mov r12, #0x2500
- add r12, #0x28
- rsb r12, #0
- vdup.16 q15, r12 ; duplicate -2*cospi_26_64
-
- ; generate 2*cospi_6_64 = 31358
- mov r3, #0x7a00
- add r3, #0x7e
- vdup.16 q14, r3 ; duplicate 2*cospi_6_64
-
- ; dct_const_round_shift(- step1[12] * cospi_26_64)
- vqrdmulh.s16 q3, q9, q15
-
- ; dct_const_round_shift(step1[12] * cospi_6_64)
- vqrdmulh.s16 q4, q9, q14
-
- ; stage 4
- ; generate cospi_24_64 = 6270
- mov r3, #0x1800
- add r3, #0x7e
- vdup.16 d31, r3 ; duplicate cospi_24_64
-
- ; generate cospi_8_64 = 15137
- mov r12, #0x3b00
- add r12, #0x21
- vdup.16 d30, r12 ; duplicate cospi_8_64
-
- ; step1[14] * cospi_24_64
- vmull.s16 q12, d14, d31
- vmull.s16 q5, d15, d31
-
- ; step1[9] * cospi_24_64
- vmull.s16 q2, d0, d31
- vmull.s16 q11, d1, d31
-
- ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
- vmlsl.s16 q12, d0, d30
- vmlsl.s16 q5, d1, d30
-
- ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64
- vmlal.s16 q2, d14, d30
- vmlal.s16 q11, d15, d30
-
- rsb r12, #0
- vdup.16 d30, r12 ; duplicate -cospi_8_64
-
- ; dct_const_round_shift(temp1)
- vqrshrn.s32 d2, q12, #14 ; >> 14
- vqrshrn.s32 d3, q5, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vqrshrn.s32 d12, q2, #14 ; >> 14
- vqrshrn.s32 d13, q11, #14 ; >> 14
-
- ; - step1[13] * cospi_8_64
- vmull.s16 q10, d8, d30
- vmull.s16 q13, d9, d30
-
- ; -step1[10] * cospi_8_64
- vmull.s16 q8, d6, d30
- vmull.s16 q9, d7, d30
-
- ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64
- vmlsl.s16 q10, d6, d31
- vmlsl.s16 q13, d7, d31
-
- ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
- vmlal.s16 q8, d8, d31
- vmlal.s16 q9, d9, d31
-
- ; dct_const_round_shift(temp1)
- vqrshrn.s32 d4, q10, #14 ; >> 14
- vqrshrn.s32 d5, q13, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vqrshrn.s32 d10, q8, #14 ; >> 14
- vqrshrn.s32 d11, q9, #14 ; >> 14
-
- ; stage 5
- vadd.s16 q8, q0, q3 ; step1[8] = step2[8] + step2[11];
- vadd.s16 q9, q1, q2 ; step1[9] = step2[9] + step2[10];
- vsub.s16 q10, q1, q2 ; step1[10] = step2[9] - step2[10];
- vsub.s16 q11, q0, q3 ; step1[11] = step2[8] - step2[11];
- vsub.s16 q12, q7, q4 ; step1[12] = -step2[12] + step2[15];
- vsub.s16 q13, q6, q5 ; step1[13] = -step2[13] + step2[14];
- vadd.s16 q14, q6, q5 ; step1[14] = step2[13] + step2[14];
- vadd.s16 q15, q7, q4 ; step1[15] = step2[12] + step2[15];
-
- ; stage 6.
- ; generate cospi_16_64 = 11585
- mov r12, #0x2d00
- add r12, #0x41
-
- vdup.16 d14, r12 ; duplicate cospi_16_64
-
- ; step1[13] * cospi_16_64
- vmull.s16 q3, d26, d14
- vmull.s16 q4, d27, d14
-
- ; step1[10] * cospi_16_64
- vmull.s16 q0, d20, d14
- vmull.s16 q1, d21, d14
-
- ; temp1 = (-step1[10] + step1[13]) * cospi_16_64
- vsub.s32 q5, q3, q0
- vsub.s32 q6, q4, q1
-
- ; temp2 = (step1[10] + step1[13]) * cospi_16_64
- vadd.s32 q0, q3, q0
- vadd.s32 q1, q4, q1
-
- ; dct_const_round_shift(temp1)
- vqrshrn.s32 d4, q5, #14 ; >> 14
- vqrshrn.s32 d5, q6, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vqrshrn.s32 d10, q0, #14 ; >> 14
- vqrshrn.s32 d11, q1, #14 ; >> 14
-
- ; step1[11] * cospi_16_64
- vmull.s16 q0, d22, d14
- vmull.s16 q1, d23, d14
-
- ; step1[12] * cospi_16_64
- vmull.s16 q13, d24, d14
- vmull.s16 q6, d25, d14
-
- ; temp1 = (-step1[11] + step1[12]) * cospi_16_64
- vsub.s32 q10, q13, q0
- vsub.s32 q4, q6, q1
-
- ; temp2 = (step1[11] + step1[12]) * cospi_16_64
- vadd.s32 q13, q13, q0
- vadd.s32 q6, q6, q1
-
- ; dct_const_round_shift(temp1)
- vqrshrn.s32 d6, q10, #14 ; >> 14
- vqrshrn.s32 d7, q4, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vqrshrn.s32 d8, q13, #14 ; >> 14
- vqrshrn.s32 d9, q6, #14 ; >> 14
-
- mov r4, #16 ; pass1Output stride
- ldr r3, [sp] ; load skip_adding (dead here: r3 is reassigned below)
-
- ; stage 7
- ; load the data in pass1
- mov r5, #24
- mov r3, #8
-
- vld1.s16 {q0}, [r2], r4 ; load data step2[0]
- vld1.s16 {q1}, [r2], r4 ; load data step2[1]
- vadd.s16 q12, q0, q15 ; step2[0] + step2[15]
- vadd.s16 q13, q1, q14 ; step2[1] + step2[14]
- vld1.s16 {q10}, [r2], r4 ; load data step2[2]
- vld1.s16 {q11}, [r2], r4 ; load data step2[3]
- vst1.64 {d24}, [r1], r3 ; store output[0]
- vst1.64 {d25}, [r1], r5
- vst1.64 {d26}, [r1], r3 ; store output[1]
- vst1.64 {d27}, [r1], r5
- vadd.s16 q12, q10, q5 ; step2[2] + step2[13]
- vadd.s16 q13, q11, q4 ; step2[3] + step2[12]
- vsub.s16 q14, q1, q14 ; step2[1] - step2[14]
- vsub.s16 q15, q0, q15 ; step2[0] - step2[15]
- vst1.64 {d24}, [r1], r3 ; store output[2]
- vst1.64 {d25}, [r1], r5
- vst1.64 {d26}, [r1], r3 ; store output[3]
- vst1.64 {d27}, [r1], r5
- vsub.s16 q4, q11, q4 ; step2[3] - step2[12]
- vsub.s16 q5, q10, q5 ; step2[2] - step2[13]
- vld1.s16 {q0}, [r2], r4 ; load data step2[4]
- vld1.s16 {q1}, [r2], r4 ; load data step2[5]
- vadd.s16 q12, q0, q3 ; step2[4] + step2[11]
- vadd.s16 q13, q1, q2 ; step2[5] + step2[10]
- vld1.s16 {q10}, [r2], r4 ; load data step2[6]
- vld1.s16 {q11}, [r2], r4 ; load data step2[7]
- vst1.64 {d24}, [r1], r3 ; store output[4]
- vst1.64 {d25}, [r1], r5
- vst1.64 {d26}, [r1], r3 ; store output[5]
- vst1.64 {d27}, [r1], r5
- vadd.s16 q12, q10, q9 ; step2[6] + step2[9]
- vadd.s16 q13, q11, q8 ; step2[7] + step2[8]
- vsub.s16 q2, q1, q2 ; step2[5] - step2[10]
- vsub.s16 q3, q0, q3 ; step2[4] - step2[11]
- vsub.s16 q8, q11, q8 ; step2[7] - step2[8]
- vsub.s16 q9, q10, q9 ; step2[6] - step2[9]
- vst1.64 {d24}, [r1], r3 ; store output[6]
- vst1.64 {d25}, [r1], r5
- vst1.64 {d26}, [r1], r3 ; store output[7]
- vst1.64 {d27}, [r1], r5
-
- ; store the data output 8,9,10,11,12,13,14,15
- vst1.64 {d16}, [r1], r3
- vst1.64 {d17}, [r1], r5
- vst1.64 {d18}, [r1], r3
- vst1.64 {d19}, [r1], r5
- vst1.64 {d4}, [r1], r3
- vst1.64 {d5}, [r1], r5
- vst1.64 {d6}, [r1], r3
- vst1.64 {d7}, [r1], r5
- vst1.64 {d8}, [r1], r3
- vst1.64 {d9}, [r1], r5
- vst1.64 {d10}, [r1], r3
- vst1.64 {d11}, [r1], r5
- vst1.64 {d28}, [r1], r3
- vst1.64 {d29}, [r1], r5
- vst1.64 {d30}, [r1], r3
- vst1.64 {d31}, [r1], r5
-end_idct10_16x16_pass2
- pop {r3-r9}
- bx lr
- ENDP ; |vp9_idct16x16_10_add_neon_pass2|
- END
--- a/vp9/common/arm/neon/vp9_short_idct32x32_1_add_neon.asm
+++ /dev/null
@@ -1,144 +1,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
- EXPORT |vp9_idct32x32_1_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
- ;TODO(hkuang): put the following macros in a separate
- ;file so other idct functions can also use them.
- MACRO
- LD_16x8 $src, $stride
- vld1.8 {q8}, [$src], $stride
- vld1.8 {q9}, [$src], $stride
- vld1.8 {q10}, [$src], $stride
- vld1.8 {q11}, [$src], $stride
- vld1.8 {q12}, [$src], $stride
- vld1.8 {q13}, [$src], $stride
- vld1.8 {q14}, [$src], $stride
- vld1.8 {q15}, [$src], $stride
- MEND
-
- MACRO
- ADD_DIFF_16x8 $diff
- vqadd.u8 q8, q8, $diff
- vqadd.u8 q9, q9, $diff
- vqadd.u8 q10, q10, $diff
- vqadd.u8 q11, q11, $diff
- vqadd.u8 q12, q12, $diff
- vqadd.u8 q13, q13, $diff
- vqadd.u8 q14, q14, $diff
- vqadd.u8 q15, q15, $diff
- MEND
-
- MACRO
- SUB_DIFF_16x8 $diff
- vqsub.u8 q8, q8, $diff
- vqsub.u8 q9, q9, $diff
- vqsub.u8 q10, q10, $diff
- vqsub.u8 q11, q11, $diff
- vqsub.u8 q12, q12, $diff
- vqsub.u8 q13, q13, $diff
- vqsub.u8 q14, q14, $diff
- vqsub.u8 q15, q15, $diff
- MEND
-
- MACRO
- ST_16x8 $dst, $stride
- vst1.8 {q8}, [$dst], $stride
- vst1.8 {q9}, [$dst], $stride
- vst1.8 {q10},[$dst], $stride
- vst1.8 {q11},[$dst], $stride
- vst1.8 {q12},[$dst], $stride
- vst1.8 {q13},[$dst], $stride
- vst1.8 {q14},[$dst], $stride
- vst1.8 {q15},[$dst], $stride
- MEND
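- ; One load/modify/store round over a 16x8 tile, in C terms (a sketch; the
- ; helper name is illustrative; the SUB_DIFF variant clamps at 0 instead):
- ;
- ;   static void add_diff_16x8(uint8_t *dst, int stride, uint8_t diff) {
- ;     int r, c;
- ;     for (r = 0; r < 8; r++, dst += stride)
- ;       for (c = 0; c < 16; c++) {
- ;         int v = dst[c] + diff;
- ;         dst[c] = (uint8_t)(v > 255 ? 255 : v);  /* vqadd.u8 saturation */
- ;       }
- ;   }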
-
-;void vp9_idct32x32_1_add_neon(int16_t *input, uint8_t *dest,
-; int dest_stride)
-;
-; r0 int16_t input
-; r1 uint8_t *dest
-; r2 int dest_stride
-
-|vp9_idct32x32_1_add_neon| PROC
- push {lr}
- pld [r1]
- add r3, r1, #16 ; r3 dest + 16 for second loop
- ldrsh r0, [r0]
-
- ; generate cospi_16_64 = 11585
- mov r12, #0x2d00
- add r12, #0x41
-
- ; out = dct_const_round_shift(input[0] * cospi_16_64)
- mul r0, r0, r12 ; input[0] * cospi_16_64
- add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
- asr r0, r0, #14 ; >> DCT_CONST_BITS
-
- ; out = dct_const_round_shift(out * cospi_16_64)
- mul r0, r0, r12 ; out * cospi_16_64
- mov r12, r1 ; save dest
- add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
- asr r0, r0, #14 ; >> DCT_CONST_BITS
-
- ; a1 = ROUND_POWER_OF_TWO(out, 6)
- add r0, r0, #32 ; + (1 <<((6) - 1))
- asrs r0, r0, #6 ; >> 6
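- ; In C terms the whole DC-only path corresponds to the sketch below (the
- ; function name is illustrative); the code that follows merely splits on
- ; the sign of a1 so it can use saturating u8 adds/subtracts:
- ;
- ;   static int32_t round14(int32_t x) { return (x + (1 << 13)) >> 14; }
- ;   void idct32x32_1_add_sketch(const int16_t *in, uint8_t *dest, int stride) {
- ;     int32_t out = round14(in[0] * 11585);      /* cospi_16_64 */
- ;     int a1, r, c;
- ;     out = round14(out * 11585);
- ;     a1 = (out + 32) >> 6;                      /* ROUND_POWER_OF_TWO(out, 6) */
- ;     for (r = 0; r < 32; r++, dest += stride)
- ;       for (c = 0; c < 32; c++) {
- ;         int v = dest[c] + a1;
- ;         dest[c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
- ;       }
- ;   }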
- bge diff_positive_32_32
-
-diff_negative_32_32
- neg r0, r0
- usat r0, #8, r0
- vdup.u8 q0, r0
- mov r0, #4
-
-diff_negative_32_32_loop
- sub r0, #1
- LD_16x8 r1, r2
- SUB_DIFF_16x8 q0
- ST_16x8 r12, r2
-
- LD_16x8 r1, r2
- SUB_DIFF_16x8 q0
- ST_16x8 r12, r2
- cmp r0, #2
- moveq r1, r3
- moveq r12, r3
- cmp r0, #0
- bne diff_negative_32_32_loop
- pop {pc}
-
-diff_positive_32_32
- usat r0, #8, r0
- vdup.u8 q0, r0
- mov r0, #4
-
-diff_positive_32_32_loop
- sub r0, #1
- LD_16x8 r1, r2
- ADD_DIFF_16x8 q0
- ST_16x8 r12, r2
-
- LD_16x8 r1, r2
- ADD_DIFF_16x8 q0
- ST_16x8 r12, r2
- cmp r0, #2
- moveq r1, r3
- moveq r12, r3
- cmp r0, #0
- bne diff_positive_32_32_loop
- pop {pc}
-
- ENDP ; |vp9_idct32x32_1_add_neon|
- END
--- a/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
+++ /dev/null
@@ -1,1299 +1,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- ;TODO(cd): adjust these constants to be able to use vqdmulh for faster
-; dct_const_round_shift(a * b) within butterfly calculations.
-cospi_1_64 EQU 16364
-cospi_2_64 EQU 16305
-cospi_3_64 EQU 16207
-cospi_4_64 EQU 16069
-cospi_5_64 EQU 15893
-cospi_6_64 EQU 15679
-cospi_7_64 EQU 15426
-cospi_8_64 EQU 15137
-cospi_9_64 EQU 14811
-cospi_10_64 EQU 14449
-cospi_11_64 EQU 14053
-cospi_12_64 EQU 13623
-cospi_13_64 EQU 13160
-cospi_14_64 EQU 12665
-cospi_15_64 EQU 12140
-cospi_16_64 EQU 11585
-cospi_17_64 EQU 11003
-cospi_18_64 EQU 10394
-cospi_19_64 EQU 9760
-cospi_20_64 EQU 9102
-cospi_21_64 EQU 8423
-cospi_22_64 EQU 7723
-cospi_23_64 EQU 7005
-cospi_24_64 EQU 6270
-cospi_25_64 EQU 5520
-cospi_26_64 EQU 4756
-cospi_27_64 EQU 3981
-cospi_28_64 EQU 3196
-cospi_29_64 EQU 2404
-cospi_30_64 EQU 1606
-cospi_31_64 EQU 804
-
-
- EXPORT |vp9_idct32x32_1024_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
- AREA Block, CODE, READONLY
-
- ; --------------------------------------------------------------------------
- ; Load from transposed_buffer
- ; q13 = transposed_buffer[first_offset]
- ; q14 = transposed_buffer[second_offset]
- ; for proper address calculation, the last offset used when manipulating
- ; transposed_buffer must be passed in. use 0 for first use.
- MACRO
- LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset
- ; address calculation with proper stride and loading
- add r0, #($first_offset - $prev_offset )*8*2
- vld1.s16 {q14}, [r0]
- add r0, #($second_offset - $first_offset)*8*2
- vld1.s16 {q13}, [r0]
- ; (used) two registers (q14, q13)
- MEND
- ; --------------------------------------------------------------------------
- ; Load from output (used as temporary storage)
- ; reg1 = output[first_offset]
- ; reg2 = output[second_offset]
- ; for proper address calculation, the last offset used when manipulating
- ; output (whether reading or storing) must be passed in. use 0 for first
- ; use.
- MACRO
- LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
- ; address calculation with proper stride and loading
- add r1, #($first_offset - $prev_offset )*32*2
- vld1.s16 {$reg1}, [r1]
- add r1, #($second_offset - $first_offset)*32*2
- vld1.s16 {$reg2}, [r1]
- ; (used) two registers ($reg1, $reg2)
- MEND
- ; --------------------------------------------------------------------------
- ; Store into output (sometimes as temporary storage)
- ; output[first_offset] = reg1
- ; output[second_offset] = reg2
- ; for proper address calculation, the last offset used when manipulating
- ; output (whether reading or storing) must be passed in. use 0 for first
- ; use.
- MACRO
- STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
- ; address calculation with proper stride and storing
- add r1, #($first_offset - $prev_offset )*32*2
- vst1.16 {$reg1}, [r1]
- add r1, #($second_offset - $first_offset)*32*2
- vst1.16 {$reg2}, [r1]
- MEND
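- ; The three macros above keep a rolling pointer and only add the delta
- ; between consecutive row indices. In C terms (a sketch; the name is
- ; illustrative; the row stride is 8 int16_t elements for transposed_buffer
- ; and 32 for output):
- ;
- ;   static int16_t *seek_row(int16_t *p, int prev, int next, int stride) {
- ;     return p + (next - prev) * stride;
- ;   }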
- ; --------------------------------------------------------------------------
- ; Combine-add results with current destination content
- ; q6-q9 contain the results (out[j * 32 + 0-31])
- MACRO
- STORE_COMBINE_CENTER_RESULTS
- ; load dest[j * dest_stride + 0-31]
- vld1.s16 {d8}, [r10], r2
- vld1.s16 {d11}, [r9], r11
- vld1.s16 {d9}, [r10]
- vld1.s16 {d10}, [r9]
- ; ROUND_POWER_OF_TWO
- vrshr.s16 q7, q7, #6
- vrshr.s16 q8, q8, #6
- vrshr.s16 q9, q9, #6
- vrshr.s16 q6, q6, #6
- ; add to dest[j * dest_stride + 0-31]
- vaddw.u8 q7, q7, d9
- vaddw.u8 q8, q8, d10
- vaddw.u8 q9, q9, d11
- vaddw.u8 q6, q6, d8
- ; clip pixel
- vqmovun.s16 d9, q7
- vqmovun.s16 d10, q8
- vqmovun.s16 d11, q9
- vqmovun.s16 d8, q6
- ; store back into dest[j * dest_stride + 0-31]
- vst1.16 {d9}, [r10], r11
- vst1.16 {d10}, [r9], r2
- vst1.16 {d8}, [r10]
- vst1.16 {d11}, [r9]
- ; update pointers (by dest_stride * 2)
- sub r9, r9, r2, lsl #1
- add r10, r10, r2, lsl #1
- MEND
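- ; The combine-add for one 8-pixel run, in C terms (a sketch; the helper
- ; name is illustrative):
- ;
- ;   static void combine_add_8(uint8_t *dest, const int16_t *out) {
- ;     int c;
- ;     for (c = 0; c < 8; c++) {
- ;       int v = dest[c] + ((out[c] + 32) >> 6);             /* vrshr.s16 #6 */
- ;       dest[c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); /* vqmovun.s16  */
- ;     }
- ;   }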
- ; --------------------------------------------------------------------------
- ; Combine-add results with current destination content
- ; q6-q9 contain the results (out[j * 32 + 0-31])
- MACRO
- STORE_COMBINE_CENTER_RESULTS_LAST
- ; load dest[j * dest_stride + 0-31]
- vld1.s16 {d8}, [r10], r2
- vld1.s16 {d11}, [r9], r11
- vld1.s16 {d9}, [r10]
- vld1.s16 {d10}, [r9]
- ; ROUND_POWER_OF_TWO
- vrshr.s16 q7, q7, #6
- vrshr.s16 q8, q8, #6
- vrshr.s16 q9, q9, #6
- vrshr.s16 q6, q6, #6
- ; add to dest[j * dest_stride + 0-31]
- vaddw.u8 q7, q7, d9
- vaddw.u8 q8, q8, d10
- vaddw.u8 q9, q9, d11
- vaddw.u8 q6, q6, d8
- ; clip pixel
- vqmovun.s16 d9, q7
- vqmovun.s16 d10, q8
- vqmovun.s16 d11, q9
- vqmovun.s16 d8, q6
- ; store back into dest[j * dest_stride + 0-31]
- vst1.16 {d9}, [r10], r11
- vst1.16 {d10}, [r9], r2
- vst1.16 {d8}, [r10]!
- vst1.16 {d11}, [r9]!
- ; update pointers (by dest_stride * 2)
- sub r9, r9, r2, lsl #1
- add r10, r10, r2, lsl #1
- MEND
- ; --------------------------------------------------------------------------
- ; Combine-add results with current destination content
- ; q4-q7 contain the results (out[j * 32 + 0-31])
- MACRO
- STORE_COMBINE_EXTREME_RESULTS
- ; load dest[j * dest_stride + 0-31]
- vld1.s16 {d4}, [r7], r2
- vld1.s16 {d7}, [r6], r11
- vld1.s16 {d5}, [r7]
- vld1.s16 {d6}, [r6]
- ; ROUND_POWER_OF_TWO
- vrshr.s16 q5, q5, #6
- vrshr.s16 q6, q6, #6
- vrshr.s16 q7, q7, #6
- vrshr.s16 q4, q4, #6
- ; add to dest[j * dest_stride + 0-31]
- vaddw.u8 q5, q5, d5
- vaddw.u8 q6, q6, d6
- vaddw.u8 q7, q7, d7
- vaddw.u8 q4, q4, d4
- ; clip pixel
- vqmovun.s16 d5, q5
- vqmovun.s16 d6, q6
- vqmovun.s16 d7, q7
- vqmovun.s16 d4, q4
- ; store back into dest[j * dest_stride + 0-31]
- vst1.16 {d5}, [r7], r11
- vst1.16 {d6}, [r6], r2
- vst1.16 {d7}, [r6]
- vst1.16 {d4}, [r7]
- ; update pointers (by dest_stride * 2)
- sub r6, r6, r2, lsl #1
- add r7, r7, r2, lsl #1
- MEND
- ; --------------------------------------------------------------------------
- ; Combine-add results with current destination content
- ; q4-q7 contain the results (out[j * 32 + 0-31])
- MACRO
- STORE_COMBINE_EXTREME_RESULTS_LAST
- ; load dest[j * dest_stride + 0-31]
- vld1.s16 {d4}, [r7], r2
- vld1.s16 {d7}, [r6], r11
- vld1.s16 {d5}, [r7]
- vld1.s16 {d6}, [r6]
- ; ROUND_POWER_OF_TWO
- vrshr.s16 q5, q5, #6
- vrshr.s16 q6, q6, #6
- vrshr.s16 q7, q7, #6
- vrshr.s16 q4, q4, #6
- ; add to dest[j * dest_stride + 0-31]
- vaddw.u8 q5, q5, d5
- vaddw.u8 q6, q6, d6
- vaddw.u8 q7, q7, d7
- vaddw.u8 q4, q4, d4
- ; clip pixel
- vqmovun.s16 d5, q5
- vqmovun.s16 d6, q6
- vqmovun.s16 d7, q7
- vqmovun.s16 d4, q4
- ; store back into dest[j * dest_stride + 0-31]
- vst1.16 {d5}, [r7], r11
- vst1.16 {d6}, [r6], r2
- vst1.16 {d7}, [r6]!
- vst1.16 {d4}, [r7]!
- ; update pointers (by dest_stride * 2)
- sub r6, r6, r2, lsl #1
- add r7, r7, r2, lsl #1
- MEND
- ; --------------------------------------------------------------------------
- ; Touches q8-q12, q15 (q13-q14 are preserved)
- ; valid output registers are anything but q8-q11
- MACRO
- DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
- ; TODO(cd): have a special case to re-use constants when they are similar
- ; for consecutive butterflies
- ; TODO(cd): have a special case when both constants are the same, doing the
- ; additions/subtractions before the multiplies.
- ; generate the constants
- ; generate scalar constants
- mov r8, #$first_constant & 0xFF00
- mov r12, #$second_constant & 0xFF00
- add r8, #$first_constant & 0x00FF
- add r12, #$second_constant & 0x00FF
- ; generate vector constants
- vdup.16 d30, r8
- vdup.16 d31, r12
- ; (used) two for inputs (regA-regD), one for constants (q15)
- ; do some multiplications (ordered for maximum latency hiding)
- vmull.s16 q8, $regC, d30
- vmull.s16 q10, $regA, d31
- vmull.s16 q9, $regD, d30
- vmull.s16 q11, $regB, d31
- vmull.s16 q12, $regC, d31
- ; (used) five for intermediate (q8-q12), one for constants (q15)
- ; do some additions/subtractions (to get back to two registers)
- vsub.s32 q8, q8, q10
- vsub.s32 q9, q9, q11
- ; do more multiplications (ordered for maximum latency hiding)
- vmull.s16 q10, $regD, d31
- vmull.s16 q11, $regA, d30
- vmull.s16 q15, $regB, d30
- ; (used) six for intermediate (q8-q12, q15)
- ; do more additions/subtractions
- vadd.s32 q11, q12, q11
- vadd.s32 q10, q10, q15
- ; (used) four for intermediate (q8-q11)
- ; dct_const_round_shift
- vqrshrn.s32 $reg1, q8, #14
- vqrshrn.s32 $reg2, q9, #14
- vqrshrn.s32 $reg3, q11, #14
- vqrshrn.s32 $reg4, q10, #14
- ; (used) two q registers for results, i.e. four d registers
- MEND
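- ; DO_BUTTERFLY in C terms, with c/a standing for the regC/regA inputs
- ; (q14/q13 in DO_BUTTERFLY_STD); a sketch with illustrative names:
- ;
- ;   static void butterfly(int16_t c, int16_t a, int16_t k1, int16_t k2,
- ;                         int16_t *out1, int16_t *out2) {
- ;     *out1 = (int16_t)(((int32_t)c * k1 - (int32_t)a * k2 + (1 << 13)) >> 14);
- ;     *out2 = (int16_t)(((int32_t)c * k2 + (int32_t)a * k1 + (1 << 13)) >> 14);
- ;   }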
- ; --------------------------------------------------------------------------
- ; Touches q8-q12, q15 (q13-q14 are preserved)
- ; valid output registers are anything but q8-q11
- MACRO
- DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
- DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
- MEND
- ; --------------------------------------------------------------------------
-
-;void vp9_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
-;
-; r0 int16_t *input,
-; r1 uint8_t *dest,
-; r2 int dest_stride)
-; loop counters
-; r4 bands loop counter
-; r5 pass loop counter
-; r8 transpose loop counter
-; combine-add pointers
-; r6 dest + 31 * dest_stride, descending (30, 29, 28, ...)
-; r7 dest + 0 * dest_stride, ascending (1, 2, 3, ...)
-; r9 dest + 15 * dest_stride, descending (14, 13, 12, ...)
-; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...)
-
-|vp9_idct32x32_1024_add_neon| PROC
- ; Each iteration of idct32_pass_loop below performs one pass of the
- ; idct32x32 transform.
- ;
- ; This is done by transposing the input and then doing a 1d transform on
- ; columns. In the first pass, the transposed columns are the original
- ; rows. In the second pass, after the transposition, the columns are the
- ; original columns.
- ; The 1d transform is done by looping over bands of eight columns (the
- ; idct32_bands loop). For each band, the transform input transposition
- ; is done on demand, one band of four 8x8 matrices at a time. The four
- ; matrices are transposed by pairs (the idct32_transpose_pair loop).
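- ; The resulting control flow, in C terms (a sketch of the loop structure
- ; only; the helper names are illustrative):
- ;
- ;   static void transpose_band(void);   /* 2 pairs of 8x8 tiles  */
- ;   static void idct32_1d_band(void);   /* blocks A, B, C, D     */
- ;   static void idct32x32(void) {
- ;     int pass, band;
- ;     for (pass = 0; pass < 2; pass++)      /* rows, then columns */
- ;       for (band = 0; band < 4; band++) {
- ;         transpose_band();
- ;         idct32_1d_band();
- ;       }
- ;   }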
- push {r4-r11}
- vpush {d8-d15}
- ; stack operation
- ; internal buffer into which 8 lines are transposed before being transformed
- ; int16_t transpose_buffer[32 * 8];
- ; at sp + [4096, 4607]
- ; results of the first pass (transpose and transform rows)
- ; int16_t pass1[32 * 32];
- ; at sp + [0, 2047]
- ; results of the second pass (transpose and transform columns)
- ; int16_t pass2[32 * 32];
- ; at sp + [2048, 4095]
- sub sp, sp, #512+2048+2048
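- ; An equivalent C view of this scratch area (a sketch; the struct and
- ; field names are illustrative):
- ;
- ;   struct scratch {
- ;     int16_t pass1[32 * 32];            /* sp + [0, 2047]    */
- ;     int16_t pass2[32 * 32];            /* sp + [2048, 4095] */
- ;     int16_t transpose_buffer[32 * 8];  /* sp + [4096, 4607] */
- ;   };                                   /* 2048 + 2048 + 512 = 4608 bytes */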
-
- ; r6 = dest + 31 * dest_stride
- ; r7 = dest + 0 * dest_stride
- ; r9 = dest + 15 * dest_stride
- ; r10 = dest + 16 * dest_stride
- rsb r6, r2, r2, lsl #5
- rsb r9, r2, r2, lsl #4
- add r10, r1, r2, lsl #4
- mov r7, r1
- add r6, r6, r1
- add r9, r9, r1
- ; r11 = -dest_stride
- neg r11, r2
- ; r3 = input
- mov r3, r0
- ; parameters for first pass
- ; r0 = transpose_buffer[32 * 8]
- add r0, sp, #4096
- ; r1 = pass1[32 * 32]
- mov r1, sp
-
- mov r5, #0 ; initialize pass loop counter
-idct32_pass_loop
- mov r4, #4 ; initialize bands loop counter
-idct32_bands_loop
- mov r8, #2 ; initialize transpose loop counter
-idct32_transpose_pair_loop
- ; Load two horizontally consecutive 8x8 16-bit data matrices, the first
- ; one into q8-q15 and the second one into q0-q7. The row stride is 64
- ; bytes, adjusted to 32 because of the two post-increments.
- vld1.s16 {q8}, [r3]!
- vld1.s16 {q0}, [r3]!
- add r3, #32
- vld1.s16 {q9}, [r3]!
- vld1.s16 {q1}, [r3]!
- add r3, #32
- vld1.s16 {q10}, [r3]!
- vld1.s16 {q2}, [r3]!
- add r3, #32
- vld1.s16 {q11}, [r3]!
- vld1.s16 {q3}, [r3]!
- add r3, #32
- vld1.s16 {q12}, [r3]!
- vld1.s16 {q4}, [r3]!
- add r3, #32
- vld1.s16 {q13}, [r3]!
- vld1.s16 {q5}, [r3]!
- add r3, #32
- vld1.s16 {q14}, [r3]!
- vld1.s16 {q6}, [r3]!
- add r3, #32
- vld1.s16 {q15}, [r3]!
- vld1.s16 {q7}, [r3]!
-
- ; Transpose the two 8x8 16-bit data matrices.
- vswp d17, d24
- vswp d23, d30
- vswp d21, d28
- vswp d19, d26
- vswp d1, d8
- vswp d7, d14
- vswp d5, d12
- vswp d3, d10
- vtrn.32 q8, q10
- vtrn.32 q9, q11
- vtrn.32 q12, q14
- vtrn.32 q13, q15
- vtrn.32 q0, q2
- vtrn.32 q1, q3
- vtrn.32 q4, q6
- vtrn.32 q5, q7
- vtrn.16 q8, q9
- vtrn.16 q10, q11
- vtrn.16 q12, q13
- vtrn.16 q14, q15
- vtrn.16 q0, q1
- vtrn.16 q2, q3
- vtrn.16 q4, q5
- vtrn.16 q6, q7
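- ; The vswp/vtrn ladder is a divide-and-conquer transpose: swap the
- ; off-diagonal 4x4 quadrants (vswp), then 2x2 sub-blocks (vtrn.32), then
- ; single elements (vtrn.16). A plain C equivalent for one 8x8 tile
- ; (a sketch):
- ;
- ;   static void transpose_8x8(int16_t m[8][8]) {
- ;     int r, c, t;
- ;     for (r = 0; r < 8; r++)
- ;       for (c = r + 1; c < 8; c++) {
- ;         t = m[r][c]; m[r][c] = m[c][r]; m[c][r] = t;
- ;       }
- ;   }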
-
- ; Store both matrices one after the other. The stride of 32 is covered
- ; entirely by the post-increments, so no explicit adjustment is needed.
- vst1.16 {q8}, [r0]!
- vst1.16 {q9}, [r0]!
- vst1.16 {q10}, [r0]!
- vst1.16 {q11}, [r0]!
- vst1.16 {q12}, [r0]!
- vst1.16 {q13}, [r0]!
- vst1.16 {q14}, [r0]!
- vst1.16 {q15}, [r0]!
- vst1.16 {q0}, [r0]!
- vst1.16 {q1}, [r0]!
- vst1.16 {q2}, [r0]!
- vst1.16 {q3}, [r0]!
- vst1.16 {q4}, [r0]!
- vst1.16 {q5}, [r0]!
- vst1.16 {q6}, [r0]!
- vst1.16 {q7}, [r0]!
-
- ; increment pointers by adjusted stride (not necessary for r0/out)
- ; go back by 7*32 for the seven lines moved fully by read and add
- ; go back by 32 for the eighth line, which was only read
- ; advance by 16*2 to go to the next pair
- sub r3, r3, #7*32*2 + 32 - 16*2
- ; transpose pair loop processing
- subs r8, r8, #1
- bne idct32_transpose_pair_loop
-
- ; restore r0/input to its original value
- sub r0, r0, #32*8*2
-
- ; Instead of doing the transform stage by stage, a few input values are
- ; loaded and as many stages as possible are computed on them, to minimize
- ; the storing/loading of intermediate results. To fit within registers, the
- ; final coefficients are cut into four blocks:
- ; BLOCK A: 16-19,28-31
- ; BLOCK B: 20-23,24-27
- ; BLOCK C: 8-10,11-15
- ; BLOCK D: 0-3,4-7
- ; Blocks A and C are straight calculation through the various stages. In
- ; block B, further calculations are performed using the results from
- ; block A. In block D, further calculations are performed using the results
- ; from block C, and the final calculations are done using results from
- ; blocks A and B, which were combined at the end of block B.
-
- ; --------------------------------------------------------------------------
- ; BLOCK A: 16-19,28-31
- ; --------------------------------------------------------------------------
- ; generate 16,17,30,31
- ; --------------------------------------------------------------------------
- ; part of stage 1
- ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] * cospi_1_64;
- ;temp2 = input[1 * 32] * cospi_1_64 + input[31 * 32] * cospi_31_64;
- ;step1b[16][i] = dct_const_round_shift(temp1);
- ;step1b[31][i] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 0, 1, 31
- DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5
- ; --------------------------------------------------------------------------
- ; part of stage 1
- ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64;
- ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64;
- ;step1b[17][i] = dct_const_round_shift(temp1);
- ;step1b[30][i] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 31, 17, 15
- DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7
- ; --------------------------------------------------------------------------
- ; part of stage 2
- ;step2[16] = step1b[16][i] + step1b[17][i];
- ;step2[17] = step1b[16][i] - step1b[17][i];
- ;step2[30] = -step1b[30][i] + step1b[31][i];
- ;step2[31] = step1b[30][i] + step1b[31][i];
- vadd.s16 q4, q0, q1
- vsub.s16 q13, q0, q1
- vadd.s16 q6, q2, q3
- vsub.s16 q14, q2, q3
- ; --------------------------------------------------------------------------
- ; part of stage 3
- ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64;
- ;temp2 = step1b[30][i] * cospi_4_64 + step1b[17][i] * cospi_28_64;
- ;step3[17] = dct_const_round_shift(temp1);
- ;step3[30] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15
- ; --------------------------------------------------------------------------
- ; generate 18,19,28,29
- ; --------------------------------------------------------------------------
- ; part of stage 1
- ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64;
- ;temp2 = input[9 * 32] * cospi_9_64 + input[23 * 32] * cospi_23_64;
- ;step1b[18][i] = dct_const_round_shift(temp1);
- ;step1b[29][i] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 15, 9, 23
- DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5
- ; --------------------------------------------------------------------------
- ; part of stage 1
- ;temp1 = input[25 * 32] * cospi_7_64 - input[7 * 32] * cospi_25_64;
- ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64;
- ;step1b[19][i] = dct_const_round_shift(temp1);
- ;step1b[28][i] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 23, 25, 7
- DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7
- ; --------------------------------------------------------------------------
- ; part of stage 2
- ;step2[18] = -step1b[18][i] + step1b[19][i];
- ;step2[19] = step1b[18][i] + step1b[19][i];
- ;step2[28] = step1b[28][i] + step1b[29][i];
- ;step2[29] = step1b[28][i] - step1b[29][i];
- vsub.s16 q13, q3, q2
- vadd.s16 q3, q3, q2
- vsub.s16 q14, q1, q0
- vadd.s16 q2, q1, q0
- ; --------------------------------------------------------------------------
- ; part of stage 3
- ;temp1 = step1b[18][i] * (-cospi_4_64) - step1b[29][i] * (-cospi_28_64);
- ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64);
- ;step3[29] = dct_const_round_shift(temp1);
- ;step3[18] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1
- ; --------------------------------------------------------------------------
- ; combine 16-19,28-31
- ; --------------------------------------------------------------------------
- ; part of stage 4
- ;step1[16] = step1b[16][i] + step1b[19][i];
- ;step1[17] = step1b[17][i] + step1b[18][i];
- ;step1[18] = step1b[17][i] - step1b[18][i];
- ;step1[29] = step1b[30][i] - step1b[29][i];
- ;step1[30] = step1b[30][i] + step1b[29][i];
- ;step1[31] = step1b[31][i] + step1b[28][i];
- vadd.s16 q8, q4, q2
- vadd.s16 q9, q5, q0
- vadd.s16 q10, q7, q1
- vadd.s16 q15, q6, q3
- vsub.s16 q13, q5, q0
- vsub.s16 q14, q7, q1
- STORE_IN_OUTPUT 0, 16, 31, q8, q15
- STORE_IN_OUTPUT 31, 17, 30, q9, q10
- ; --------------------------------------------------------------------------
- ; part of stage 5
- ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64;
- ;temp2 = step1b[29][i] * cospi_8_64 + step1b[18][i] * cospi_24_64;
- ;step2[18] = dct_const_round_shift(temp1);
- ;step2[29] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3
- STORE_IN_OUTPUT 30, 29, 18, q1, q0
- ; --------------------------------------------------------------------------
- ; part of stage 4
- ;step1[19] = step1b[16][i] - step1b[19][i];
- ;step1[28] = step1b[31][i] - step1b[28][i];
- vsub.s16 q13, q4, q2
- vsub.s16 q14, q6, q3
- ; --------------------------------------------------------------------------
- ; part of stage 5
- ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64;
- ;temp2 = step1b[28][i] * cospi_8_64 + step1b[19][i] * cospi_24_64;
- ;step2[19] = dct_const_round_shift(temp1);
- ;step2[28] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13
- STORE_IN_OUTPUT 18, 19, 28, q4, q6
- ; --------------------------------------------------------------------------
-
-
- ; --------------------------------------------------------------------------
- ; BLOCK B: 20-23,24-27
- ; --------------------------------------------------------------------------
- ; generate 20,21,26,27
- ; --------------------------------------------------------------------------
- ; part of stage 1
- ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64;
- ;temp2 = input[5 * 32] * cospi_5_64 + input[27 * 32] * cospi_27_64;
- ;step1b[20][i] = dct_const_round_shift(temp1);
- ;step1b[27][i] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 7, 5, 27
- DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5
- ; --------------------------------------------------------------------------
- ; part of stage 1
- ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64;
- ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64;
- ;step1b[21][i] = dct_const_round_shift(temp1);
- ;step1b[26][i] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 27, 21, 11
- DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7
- ; --------------------------------------------------------------------------
- ; part of stage 2
- ;step2[20] = step1b[20][i] + step1b[21][i];
- ;step2[21] = step1b[20][i] - step1b[21][i];
- ;step2[26] = -step1b[26][i] + step1b[27][i];
- ;step2[27] = step1b[26][i] + step1b[27][i];
- vsub.s16 q13, q0, q1
- vadd.s16 q0, q0, q1
- vsub.s16 q14, q2, q3
- vadd.s16 q2, q2, q3
- ; --------------------------------------------------------------------------
- ; part of stage 3
- ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64;
- ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64;
- ;step3[21] = dct_const_round_shift(temp1);
- ;step3[26] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
- ; --------------------------------------------------------------------------
- ; generate 22,23,24,25
- ; --------------------------------------------------------------------------
- ; part of stage 1
- ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64;
- ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64;
- ;step1b[22][i] = dct_const_round_shift(temp1);
- ;step1b[25][i] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 11, 13, 19
- DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15
- ; --------------------------------------------------------------------------
- ; part of stage 1
- ;temp1 = input[29 * 32] * cospi_3_64 - input[3 * 32] * cospi_29_64;
- ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64;
- ;step1b[23][i] = dct_const_round_shift(temp1);
- ;step1b[24][i] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 19, 29, 3
- DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13
- ; --------------------------------------------------------------------------
- ; part of stage 2
- ;step2[22] = -step1b[22][i] + step1b[23][i];
- ;step2[23] = step1b[22][i] + step1b[23][i];
- ;step2[24] = step1b[24][i] + step1b[25][i];
- ;step2[25] = step1b[24][i] - step1b[25][i];
- vsub.s16 q14, q4, q5
- vadd.s16 q5, q4, q5
- vsub.s16 q13, q6, q7
- vadd.s16 q6, q6, q7
- ; --------------------------------------------------------------------------
- ; part of stage 3
- ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64);
- ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64);
- ;step3[25] = dct_const_round_shift(temp1);
- ;step3[22] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15
- ; --------------------------------------------------------------------------
- ; combine 20-23,24-27
- ; --------------------------------------------------------------------------
- ; part of stage 4
- ;step1[22] = step1b[22][i] + step1b[21][i];
- ;step1[23] = step1b[23][i] + step1b[20][i];
- vadd.s16 q10, q7, q1
- vadd.s16 q11, q5, q0
- ;step1[24] = step1b[24][i] + step1b[27][i];
- ;step1[25] = step1b[25][i] + step1b[26][i];
- vadd.s16 q12, q6, q2
- vadd.s16 q15, q4, q3
- ; --------------------------------------------------------------------------
- ; part of stage 6
- ;step3[16] = step1b[16][i] + step1b[23][i];
- ;step3[17] = step1b[17][i] + step1b[22][i];
- ;step3[22] = step1b[17][i] - step1b[22][i];
- ;step3[23] = step1b[16][i] - step1b[23][i];
- LOAD_FROM_OUTPUT 28, 16, 17, q14, q13
- vadd.s16 q8, q14, q11
- vadd.s16 q9, q13, q10
- vsub.s16 q13, q13, q10
- vsub.s16 q11, q14, q11
- STORE_IN_OUTPUT 17, 17, 16, q9, q8
- ; --------------------------------------------------------------------------
- ; part of stage 6
- ;step3[24] = step1b[31][i] - step1b[24][i];
- ;step3[25] = step1b[30][i] - step1b[25][i];
- ;step3[30] = step1b[30][i] + step1b[25][i];
- ;step3[31] = step1b[31][i] + step1b[24][i];
- LOAD_FROM_OUTPUT 16, 30, 31, q14, q9
- vsub.s16 q8, q9, q12
- vadd.s16 q10, q14, q15
- vsub.s16 q14, q14, q15
- vadd.s16 q12, q9, q12
- STORE_IN_OUTPUT 31, 30, 31, q10, q12
- ; --------------------------------------------------------------------------
- ; TODO(cd) do some register allocation change to remove these push/pop
- vpush {q8} ; [24]
- vpush {q11} ; [23]
- ; --------------------------------------------------------------------------
- ; part of stage 7
- ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64;
- ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64;
- ;step1[22] = dct_const_round_shift(temp1);
- ;step1[25] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
- STORE_IN_OUTPUT 31, 25, 22, q14, q13
- ; --------------------------------------------------------------------------
- ; part of stage 7
- ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64;
- ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64;
- ;step1[23] = dct_const_round_shift(temp1);
- ;step1[24] = dct_const_round_shift(temp2);
- ; TODO(cd) do some register allocation change to remove these push/pop
- vpop {q13} ; [23]
- vpop {q14} ; [24]
- DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
- STORE_IN_OUTPUT 22, 24, 23, q14, q13
- ; --------------------------------------------------------------------------
- ; part of stage 4
- ;step1[20] = step1b[23][i] - step1b[20][i];
- ;step1[27] = step1b[24][i] - step1b[27][i];
- vsub.s16 q14, q5, q0
- vsub.s16 q13, q6, q2
- ; --------------------------------------------------------------------------
- ; part of stage 5
- ;temp1 = step1b[20][i] * (-cospi_8_64) - step1b[27][i] * (-cospi_24_64);
- ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64);
- ;step2[27] = dct_const_round_shift(temp1);
- ;step2[20] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13
- ; --------------------------------------------------------------------------
- ; part of stage 4
- ;step1[21] = step1b[22][i] - step1b[21][i];
- ;step1[26] = step1b[25][i] - step1b[26][i];
- vsub.s16 q14, q7, q1
- vsub.s16 q13, q4, q3
- ; --------------------------------------------------------------------------
- ; part of stage 5
- ;temp1 = step1b[21][i] * (-cospi_8_64) - step1b[26][i] * (-cospi_24_64);
- ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64);
- ;step2[26] = dct_const_round_shift(temp1);
- ;step2[21] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3
- ; --------------------------------------------------------------------------
- ; part of stage 6
- ;step3[18] = step1b[18][i] + step1b[21][i];
- ;step3[19] = step1b[19][i] + step1b[20][i];
- ;step3[20] = step1b[19][i] - step1b[20][i];
- ;step3[21] = step1b[18][i] - step1b[21][i];
- LOAD_FROM_OUTPUT 23, 18, 19, q14, q13
- vadd.s16 q8, q14, q1
- vadd.s16 q9, q13, q6
- vsub.s16 q13, q13, q6
- vsub.s16 q1, q14, q1
- STORE_IN_OUTPUT 19, 18, 19, q8, q9
- ; --------------------------------------------------------------------------
- ; part of stage 6
- ;step3[27] = step1b[28][i] - step1b[27][i];
- ;step3[28] = step1b[28][i] + step1b[27][i];
- ;step3[29] = step1b[29][i] + step1b[26][i];
- ;step3[26] = step1b[29][i] - step1b[26][i];
- LOAD_FROM_OUTPUT 19, 28, 29, q8, q9
- vsub.s16 q14, q8, q5
- vadd.s16 q10, q8, q5
- vadd.s16 q11, q9, q0
- vsub.s16 q0, q9, q0
- STORE_IN_OUTPUT 29, 28, 29, q10, q11
- ; --------------------------------------------------------------------------
- ; part of stage 7
- ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64;
- ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64;
- ;step1[20] = dct_const_round_shift(temp1);
- ;step1[27] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
- STORE_IN_OUTPUT 29, 20, 27, q13, q14
- ; --------------------------------------------------------------------------
- ; part of stage 7
- ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64;
- ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64;
- ;step1[21] = dct_const_round_shift(temp1);
- ;step1[26] = dct_const_round_shift(temp2);
- DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1
- STORE_IN_OUTPUT 27, 21, 26, q1, q0
- ; --------------------------------------------------------------------------
-
-
- ; --------------------------------------------------------------------------
- ; BLOCK C: 8-10,11-15
- ; --------------------------------------------------------------------------
- ; generate 8,9,14,15
- ; --------------------------------------------------------------------------
- ; part of stage 2
- ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64;
- ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64;
- ;step2[8] = dct_const_round_shift(temp1);
- ;step2[15] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 3, 2, 30
- DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5
- ; --------------------------------------------------------------------------
- ; part of stage 2
- ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64;
- ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64;
- ;step2[9] = dct_const_round_shift(temp1);
- ;step2[14] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 30, 18, 14
- DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7
- ; --------------------------------------------------------------------------
- ; part of stage 3
- ;step3[8] = step1b[8][i] + step1b[9][i];
- ;step3[9] = step1b[8][i] - step1b[9][i];
- ;step3[14] = step1b[15][i] - step1b[14][i];
- ;step3[15] = step1b[15][i] + step1b[14][i];
- vsub.s16 q13, q0, q1
- vadd.s16 q0, q0, q1
- vsub.s16 q14, q2, q3
- vadd.s16 q2, q2, q3
- ; --------------------------------------------------------------------------
- ; part of stage 4
- ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64;
- ;temp2 = step1b[14][i] * cospi_8_64 + step1b[9][i] * cospi_24_64;
- ;step1[9] = dct_const_round_shift(temp1);
- ;step1[14] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7
- ; --------------------------------------------------------------------------
- ; generate 10,11,12,13
- ; --------------------------------------------------------------------------
- ; part of stage 2
- ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64;
- ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64;
- ;step2[10] = dct_const_round_shift(temp1);
- ;step2[13] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 14, 10, 22
- DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15
- ; --------------------------------------------------------------------------
- ; part of stage 2
- ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64;
- ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64;
- ;step2[11] = dct_const_round_shift(temp1);
- ;step2[12] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 22, 26, 6
- DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13
- ; --------------------------------------------------------------------------
- ; part of stage 3
- ;step3[10] = step1b[11][i] - step1b[10][i];
- ;step3[11] = step1b[11][i] + step1b[10][i];
- ;step3[12] = step1b[12][i] + step1b[13][i];
- ;step3[13] = step1b[12][i] - step1b[13][i];
- vsub.s16 q14, q4, q5
- vadd.s16 q5, q4, q5
- vsub.s16 q13, q6, q7
- vadd.s16 q6, q6, q7
- ; --------------------------------------------------------------------------
- ; part of stage 4
- ;temp1 = step1b[10][i] * (-cospi_8_64) - step1b[13][i] * (-cospi_24_64);
- ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64);
- ;step1[13] = dct_const_round_shift(temp1);
- ;step1[10] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15
- ; --------------------------------------------------------------------------
- ; combine 8-10,11-15
- ; --------------------------------------------------------------------------
- ; part of stage 5
- ;step2[8] = step1b[8][i] + step1b[11][i];
- ;step2[9] = step1b[9][i] + step1b[10][i];
- ;step2[10] = step1b[9][i] - step1b[10][i];
- vadd.s16 q8, q0, q5
- vadd.s16 q9, q1, q7
- vsub.s16 q13, q1, q7
- ;step2[13] = step1b[14][i] - step1b[13][i];
- ;step2[14] = step1b[14][i] + step1b[13][i];
- ;step2[15] = step1b[15][i] + step1b[12][i];
- vsub.s16 q14, q3, q4
- vadd.s16 q10, q3, q4
- vadd.s16 q15, q2, q6
- STORE_IN_OUTPUT 26, 8, 15, q8, q15
- STORE_IN_OUTPUT 15, 9, 14, q9, q10
- ; --------------------------------------------------------------------------
- ; part of stage 6
- ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64;
- ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64;
- ;step3[10] = dct_const_round_shift(temp1);
- ;step3[13] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
- STORE_IN_OUTPUT 14, 13, 10, q3, q1
- ; --------------------------------------------------------------------------
- ; part of stage 5
- ;step2[11] = step1b[8][i] - step1b[11][i];
- ;step2[12] = step1b[15][i] - step1b[12][i];
- vsub.s16 q13, q0, q5
- vsub.s16 q14, q2, q6
- ; --------------------------------------------------------------------------
- ; part of stage 6
- ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64;
- ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64;
- ;step3[11] = dct_const_round_shift(temp1);
- ;step3[12] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
- STORE_IN_OUTPUT 10, 11, 12, q1, q3
- ; --------------------------------------------------------------------------
-
-
- ; --------------------------------------------------------------------------
- ; BLOCK D: 0-3,4-7
- ; --------------------------------------------------------------------------
- ; generate 4,5,6,7
- ; --------------------------------------------------------------------------
- ; part of stage 3
- ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64;
- ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64;
- ;step3[4] = dct_const_round_shift(temp1);
- ;step3[7] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 6, 4, 28
- DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5
- ; --------------------------------------------------------------------------
- ; part of stage 3
- ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64;
- ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64;
- ;step3[5] = dct_const_round_shift(temp1);
- ;step3[6] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 28, 20, 12
- DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
- ; --------------------------------------------------------------------------
- ; part of stage 4
- ;step1[4] = step1b[4][i] + step1b[5][i];
- ;step1[5] = step1b[4][i] - step1b[5][i];
- ;step1[6] = step1b[7][i] - step1b[6][i];
- ;step1[7] = step1b[7][i] + step1b[6][i];
- vsub.s16 q13, q0, q1
- vadd.s16 q0, q0, q1
- vsub.s16 q14, q2, q3
- vadd.s16 q2, q2, q3
- ; --------------------------------------------------------------------------
- ; part of stage 5
- ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64;
- ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64;
- ;step2[5] = dct_const_round_shift(temp1);
- ;step2[6] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
- ; --------------------------------------------------------------------------
- ; generate 0,1,2,3
- ; --------------------------------------------------------------------------
- ; part of stage 4
- ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64;
- ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64;
- ;step1[1] = dct_const_round_shift(temp1);
- ;step1[0] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 12, 0, 16
- DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15
- ; --------------------------------------------------------------------------
- ; part of stage 4
- ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64;
- ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64;
- ;step1[2] = dct_const_round_shift(temp1);
- ;step1[3] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 16, 8, 24
- DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13
- ; --------------------------------------------------------------------------
- ; part of stage 5
- ;step2[0] = step1b[0][i] + step1b[3][i];
- ;step2[1] = step1b[1][i] + step1b[2][i];
- ;step2[2] = step1b[1][i] - step1b[2][i];
- ;step2[3] = step1b[0][i] - step1b[3][i];
- vadd.s16 q4, q7, q6
- vsub.s16 q7, q7, q6
- vsub.s16 q6, q5, q14
- vadd.s16 q5, q5, q14
- ; --------------------------------------------------------------------------
- ; combine 0-3,4-7
- ; --------------------------------------------------------------------------
- ; part of stage 6
- ;step3[0] = step1b[0][i] + step1b[7][i];
- ;step3[1] = step1b[1][i] + step1b[6][i];
- ;step3[2] = step1b[2][i] + step1b[5][i];
- ;step3[3] = step1b[3][i] + step1b[4][i];
- vadd.s16 q8, q4, q2
- vadd.s16 q9, q5, q3
- vadd.s16 q10, q6, q1
- vadd.s16 q11, q7, q0
- ;step3[4] = step1b[3][i] - step1b[4][i];
- ;step3[5] = step1b[2][i] - step1b[5][i];
- ;step3[6] = step1b[1][i] - step1b[6][i];
- ;step3[7] = step1b[0][i] - step1b[7][i];
- vsub.s16 q12, q7, q0
- vsub.s16 q13, q6, q1
- vsub.s16 q14, q5, q3
- vsub.s16 q15, q4, q2
- ; --------------------------------------------------------------------------
- ; part of stage 7
- ;step1[0] = step1b[0][i] + step1b[15][i];
- ;step1[1] = step1b[1][i] + step1b[14][i];
- ;step1[14] = step1b[1][i] - step1b[14][i];
- ;step1[15] = step1b[0][i] - step1b[15][i];
- LOAD_FROM_OUTPUT 12, 14, 15, q0, q1
- vadd.s16 q2, q8, q1
- vadd.s16 q3, q9, q0
- vsub.s16 q4, q9, q0
- vsub.s16 q5, q8, q1
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[14 * 32] = step1b[14][i] + step1b[17][i];
- ;output[15 * 32] = step1b[15][i] + step1b[16][i];
- ;output[16 * 32] = step1b[15][i] - step1b[16][i];
- ;output[17 * 32] = step1b[14][i] - step1b[17][i];
- LOAD_FROM_OUTPUT 15, 16, 17, q0, q1
- vadd.s16 q8, q4, q1
- vadd.s16 q9, q5, q0
- vsub.s16 q6, q5, q0
- vsub.s16 q7, q4, q1
-
- cmp r5, #0
- bgt idct32_bands_end_2nd_pass
-
-idct32_bands_end_1st_pass
- STORE_IN_OUTPUT 17, 16, 17, q6, q7
- STORE_IN_OUTPUT 17, 14, 15, q8, q9
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
- ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
- ;output[30 * 32] = step1b[1][i] - step1b[30][i];
- ;output[31 * 32] = step1b[0][i] - step1b[31][i];
- LOAD_FROM_OUTPUT 15, 30, 31, q0, q1
- vadd.s16 q4, q2, q1
- vadd.s16 q5, q3, q0
- vsub.s16 q6, q3, q0
- vsub.s16 q7, q2, q1
- STORE_IN_OUTPUT 31, 30, 31, q6, q7
- STORE_IN_OUTPUT 31, 0, 1, q4, q5
- ; --------------------------------------------------------------------------
- ; part of stage 7
- ;step1[2] = step1b[2][i] + step1b[13][i];
- ;step1[3] = step1b[3][i] + step1b[12][i];
- ;step1[12] = step1b[3][i] - step1b[12][i];
- ;step1[13] = step1b[2][i] - step1b[13][i];
- LOAD_FROM_OUTPUT 1, 12, 13, q0, q1
- vadd.s16 q2, q10, q1
- vadd.s16 q3, q11, q0
- vsub.s16 q4, q11, q0
- vsub.s16 q5, q10, q1
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[12 * 32] = step1b[12][i] + step1b[19][i];
- ;output[13 * 32] = step1b[13][i] + step1b[18][i];
- ;output[18 * 32] = step1b[13][i] - step1b[18][i];
- ;output[19 * 32] = step1b[12][i] - step1b[19][i];
- LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
- vadd.s16 q8, q4, q1
- vadd.s16 q9, q5, q0
- vsub.s16 q6, q5, q0
- vsub.s16 q7, q4, q1
- STORE_IN_OUTPUT 19, 18, 19, q6, q7
- STORE_IN_OUTPUT 19, 12, 13, q8, q9
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
- ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
- ;output[28 * 32] = step1b[3][i] - step1b[28][i];
- ;output[29 * 32] = step1b[2][i] - step1b[29][i];
- LOAD_FROM_OUTPUT 13, 28, 29, q0, q1
- vadd.s16 q4, q2, q1
- vadd.s16 q5, q3, q0
- vsub.s16 q6, q3, q0
- vsub.s16 q7, q2, q1
- STORE_IN_OUTPUT 29, 28, 29, q6, q7
- STORE_IN_OUTPUT 29, 2, 3, q4, q5
- ; --------------------------------------------------------------------------
- ; part of stage 7
- ;step1[4] = step1b[4][i] + step1b[11][i];
- ;step1[5] = step1b[5][i] + step1b[10][i];
- ;step1[10] = step1b[5][i] - step1b[10][i];
- ;step1[11] = step1b[4][i] - step1b[11][i];
- LOAD_FROM_OUTPUT 3, 10, 11, q0, q1
- vadd.s16 q2, q12, q1
- vadd.s16 q3, q13, q0
- vsub.s16 q4, q13, q0
- vsub.s16 q5, q12, q1
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[10 * 32] = step1b[10][i] + step1b[21][i];
- ;output[11 * 32] = step1b[11][i] + step1b[20][i];
- ;output[20 * 32] = step1b[11][i] - step1b[20][i];
- ;output[21 * 32] = step1b[10][i] - step1b[21][i];
- LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
- vadd.s16 q8, q4, q1
- vadd.s16 q9, q5, q0
- vsub.s16 q6, q5, q0
- vsub.s16 q7, q4, q1
- STORE_IN_OUTPUT 21, 20, 21, q6, q7
- STORE_IN_OUTPUT 21, 10, 11, q8, q9
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
- ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
- ;output[26 * 32] = step1b[5][i] - step1b[26][i];
- ;output[27 * 32] = step1b[4][i] - step1b[27][i];
- LOAD_FROM_OUTPUT 11, 26, 27, q0, q1
- vadd.s16 q4, q2, q1
- vadd.s16 q5, q3, q0
- vsub.s16 q6, q3, q0
- vsub.s16 q7, q2, q1
- STORE_IN_OUTPUT 27, 26, 27, q6, q7
- STORE_IN_OUTPUT 27, 4, 5, q4, q5
- ; --------------------------------------------------------------------------
- ; part of stage 7
- ;step1[6] = step1b[6][i] + step1b[9][i];
- ;step1[7] = step1b[7][i] + step1b[8][i];
- ;step1[8] = step1b[7][i] - step1b[8][i];
- ;step1[9] = step1b[6][i] - step1b[9][i];
- LOAD_FROM_OUTPUT 5, 8, 9, q0, q1
- vadd.s16 q2, q14, q1
- vadd.s16 q3, q15, q0
- vsub.s16 q4, q15, q0
- vsub.s16 q5, q14, q1
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
- ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
- ;output[22 * 32] = step1b[9][i] - step1b[22][i];
- ;output[23 * 32] = step1b[8][i] - step1b[23][i];
- LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
- vadd.s16 q8, q4, q1
- vadd.s16 q9, q5, q0
- vsub.s16 q6, q5, q0
- vsub.s16 q7, q4, q1
- STORE_IN_OUTPUT 23, 22, 23, q6, q7
- STORE_IN_OUTPUT 23, 8, 9, q8, q9
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
- ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
- ;output[24 * 32] = step1b[7][i] - step1b[24][i];
- ;output[25 * 32] = step1b[6][i] - step1b[25][i];
- LOAD_FROM_OUTPUT 9, 24, 25, q0, q1
- vadd.s16 q4, q2, q1
- vadd.s16 q5, q3, q0
- vsub.s16 q6, q3, q0
- vsub.s16 q7, q2, q1
- STORE_IN_OUTPUT 25, 24, 25, q6, q7
- STORE_IN_OUTPUT 25, 6, 7, q4, q5
-
- ; restore r0 by removing the last offset from the last
- ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
- sub r0, r0, #24*8*2
- ; restore r1 by removing the last offset from the last
- ; operation (STORE_IN_OUTPUT 25, 6, 7) => 7*32*2
- ; advance by 8 columns => 8*2
- sub r1, r1, #7*32*2 - 8*2
- ; advance by 8 lines (8*32*2)
- ; go back by the two pairs from the loop (32*2)
- add r3, r3, #8*32*2 - 32*2
-
- ; bands loop processing
- subs r4, r4, #1
- bne idct32_bands_loop
-
- ; parameters for second pass
- ; the input of pass2 is the result of pass1. we have to remove the offset
- ; of 32 columns induced by the above idct32_bands_loop
- sub r3, r1, #32*2
- ; r1 = pass2[32 * 32]
- add r1, sp, #2048
-
- ; pass loop processing
- add r5, r5, #1
- b idct32_pass_loop
-
-idct32_bands_end_2nd_pass
- STORE_COMBINE_CENTER_RESULTS
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
- ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
- ;output[30 * 32] = step1b[1][i] - step1b[30][i];
- ;output[31 * 32] = step1b[0][i] - step1b[31][i];
- LOAD_FROM_OUTPUT 17, 30, 31, q0, q1
- vadd.s16 q4, q2, q1
- vadd.s16 q5, q3, q0
- vsub.s16 q6, q3, q0
- vsub.s16 q7, q2, q1
- STORE_COMBINE_EXTREME_RESULTS
- ; --------------------------------------------------------------------------
- ; part of stage 7
- ;step1[2] = step1b[2][i] + step1b[13][i];
- ;step1[3] = step1b[3][i] + step1b[12][i];
- ;step1[12] = step1b[3][i] - step1b[12][i];
- ;step1[13] = step1b[2][i] - step1b[13][i];
- LOAD_FROM_OUTPUT 31, 12, 13, q0, q1
- vadd.s16 q2, q10, q1
- vadd.s16 q3, q11, q0
- vsub.s16 q4, q11, q0
- vsub.s16 q5, q10, q1
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[12 * 32] = step1b[12][i] + step1b[19][i];
- ;output[13 * 32] = step1b[13][i] + step1b[18][i];
- ;output[18 * 32] = step1b[13][i] - step1b[18][i];
- ;output[19 * 32] = step1b[12][i] - step1b[19][i];
- LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
- vadd.s16 q8, q4, q1
- vadd.s16 q9, q5, q0
- vsub.s16 q6, q5, q0
- vsub.s16 q7, q4, q1
- STORE_COMBINE_CENTER_RESULTS
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
- ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
- ;output[28 * 32] = step1b[3][i] - step1b[28][i];
- ;output[29 * 32] = step1b[2][i] - step1b[29][i];
- LOAD_FROM_OUTPUT 19, 28, 29, q0, q1
- vadd.s16 q4, q2, q1
- vadd.s16 q5, q3, q0
- vsub.s16 q6, q3, q0
- vsub.s16 q7, q2, q1
- STORE_COMBINE_EXTREME_RESULTS
- ; --------------------------------------------------------------------------
- ; part of stage 7
- ;step1[4] = step1b[4][i] + step1b[11][i];
- ;step1[5] = step1b[5][i] + step1b[10][i];
- ;step1[10] = step1b[5][i] - step1b[10][i];
- ;step1[11] = step1b[4][i] - step1b[11][i];
- LOAD_FROM_OUTPUT 29, 10, 11, q0, q1
- vadd.s16 q2, q12, q1
- vadd.s16 q3, q13, q0
- vsub.s16 q4, q13, q0
- vsub.s16 q5, q12, q1
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[10 * 32] = step1b[10][i] + step1b[21][i];
- ;output[11 * 32] = step1b[11][i] + step1b[20][i];
- ;output[20 * 32] = step1b[11][i] - step1b[20][i];
- ;output[21 * 32] = step1b[10][i] - step1b[21][i];
- LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
- vadd.s16 q8, q4, q1
- vadd.s16 q9, q5, q0
- vsub.s16 q6, q5, q0
- vsub.s16 q7, q4, q1
- STORE_COMBINE_CENTER_RESULTS
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
- ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
- ;output[26 * 32] = step1b[5][i] - step1b[26][i];
- ;output[27 * 32] = step1b[4][i] - step1b[27][i];
- LOAD_FROM_OUTPUT 21, 26, 27, q0, q1
- vadd.s16 q4, q2, q1
- vadd.s16 q5, q3, q0
- vsub.s16 q6, q3, q0
- vsub.s16 q7, q2, q1
- STORE_COMBINE_EXTREME_RESULTS
- ; --------------------------------------------------------------------------
- ; part of stage 7
- ;step1[6] = step1b[6][i] + step1b[9][i];
- ;step1[7] = step1b[7][i] + step1b[8][i];
- ;step1[8] = step1b[7][i] - step1b[8][i];
- ;step1[9] = step1b[6][i] - step1b[9][i];
- LOAD_FROM_OUTPUT 27, 8, 9, q0, q1
- vadd.s16 q2, q14, q1
- vadd.s16 q3, q15, q0
- vsub.s16 q4, q15, q0
- vsub.s16 q5, q14, q1
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
- ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
- ;output[22 * 32] = step1b[9][i] - step1b[22][i];
- ;output[23 * 32] = step1b[8][i] - step1b[23][i];
- LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
- vadd.s16 q8, q4, q1
- vadd.s16 q9, q5, q0
- vsub.s16 q6, q5, q0
- vsub.s16 q7, q4, q1
- STORE_COMBINE_CENTER_RESULTS_LAST
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
- ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
- ;output[24 * 32] = step1b[7][i] - step1b[24][i];
- ;output[25 * 32] = step1b[6][i] - step1b[25][i];
- LOAD_FROM_OUTPUT 23, 24, 25, q0, q1
- vadd.s16 q4, q2, q1
- vadd.s16 q5, q3, q0
- vsub.s16 q6, q3, q0
- vsub.s16 q7, q2, q1
- STORE_COMBINE_EXTREME_RESULTS_LAST
- ; --------------------------------------------------------------------------
- ; restore pointers to their initial indices for the next band pass by
- ; removing/adding dest_stride * 8. The actual increment by eight rows
- ; is taken care of within the _LAST macros.
- add r6, r6, r2, lsl #3
- add r9, r9, r2, lsl #3
- sub r7, r7, r2, lsl #3
- sub r10, r10, r2, lsl #3
-
- ; restore r0 by removing the last offset from the last
- ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
- sub r0, r0, #24*8*2
- ; restore r1 by removing the last offset from the last
- ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2
- ; advance by 8 columns => 8*2
- sub r1, r1, #25*32*2 - 8*2
- ; advance by 8 lines (8*32*2)
- ; go back by the two pairs from the loop (32*2)
- add r3, r3, #8*32*2 - 32*2
-
- ; bands loop processing
- subs r4, r4, #1
- bne idct32_bands_loop
-
- ; release the stack buffers
- add sp, sp, #512+2048+2048
- vpop {d8-d15}
- pop {r4-r11}
- bx lr
- ENDP ; |vp9_idct32x32_1024_add_neon|
- END
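For reference, the pointer arithmetic above is in bytes over int16_t buffers laid out as lines of 32 coefficients, so one line is 32*2 bytes and an 8-line band is 8*32*2 bytes. A minimal C model of the r3 update (names hypothetical; C pointer arithmetic counts elements where the assembly counts bytes):

    #include <stdint.h>

    enum { LINE = 32 };  /* coefficients per line of the 32x32 buffer */

    /* Hypothetical sketch of  add r3, r3, #8*32*2 - 32*2 :
     * advance by 8 lines, then back up by the one line (32 elements,
     * i.e. 32*2 bytes) already covered by the loop's two row pairs. */
    static int16_t *next_band(int16_t *row) {
      return row + (8 * LINE - LINE);
    }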
--- a/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm
+++ /dev/null
@@ -1,68 +1,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
-
- EXPORT |vp9_idct4x4_1_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp9_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
-; int dest_stride)
-;
-; r0 int16_t input
-; r1 uint8_t *dest
-; r2 int dest_stride)
-
-|vp9_idct4x4_1_add_neon| PROC
- ldrsh r0, [r0]
-
- ; generate cospi_16_64 = 11585
- mov r12, #0x2d00
- add r12, #0x41
-
- ; out = dct_const_round_shift(input[0] * cospi_16_64)
- mul r0, r0, r12 ; input[0] * cospi_16_64
- add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
- asr r0, r0, #14 ; >> DCT_CONST_BITS
-
- ; out = dct_const_round_shift(out * cospi_16_64)
- mul r0, r0, r12 ; out * cospi_16_64
- mov r12, r1 ; save dest
- add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
- asr r0, r0, #14 ; >> DCT_CONST_BITS
-
- ; a1 = ROUND_POWER_OF_TWO(out, 4)
- add r0, r0, #8 ; + (1 <<((4) - 1))
- asr r0, r0, #4 ; >> 4
-
- vdup.s16 q0, r0 ; duplicate a1
-
- vld1.32 {d2[0]}, [r1], r2
- vld1.32 {d2[1]}, [r1], r2
- vld1.32 {d4[0]}, [r1], r2
- vld1.32 {d4[1]}, [r1]
-
- vaddw.u8 q8, q0, d2 ; dest[x] + a1
- vaddw.u8 q9, q0, d4
-
- vqmovun.s16 d6, q8 ; clip_pixel
- vqmovun.s16 d7, q9
-
- vst1.32 {d6[0]}, [r12], r2
- vst1.32 {d6[1]}, [r12], r2
- vst1.32 {d7[0]}, [r12], r2
- vst1.32 {d7[1]}, [r12]
-
- bx lr
- ENDP ; |vp9_idct4x4_1_add_neon|
-
- END
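In C terms, the DC-only path above computes the following (a sketch of the arithmetic the comments spell out; clip_pixel is a hypothetical helper matching the vqmovun saturation):

    #include <stdint.h>

    #define DCT_CONST_BITS 14
    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

    static uint8_t clip_pixel(int v) {  /* hypothetical saturation helper */
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    void idct4x4_1_add_sketch(const int16_t *input, uint8_t *dest,
                              int dest_stride) {
      const int cospi_16_64 = 11585;
      int out = ROUND_POWER_OF_TWO(input[0] * cospi_16_64, DCT_CONST_BITS);
      int a1;
      out = ROUND_POWER_OF_TWO(out * cospi_16_64, DCT_CONST_BITS);
      a1 = ROUND_POWER_OF_TWO(out, 4);
      for (int r = 0; r < 4; ++r, dest += dest_stride)
        for (int c = 0; c < 4; ++c)
          dest[c] = clip_pixel(dest[c] + a1);  /* dest[x] + a1, clipped */
    }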
--- a/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm
+++ /dev/null
@@ -1,190 +1,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- EXPORT |vp9_idct4x4_16_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
- AREA Block, CODE, READONLY ; name this block of code
-;void vp9_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
-;
-; r0 int16_t input
-; r1 uint8_t *dest
-; r2 int dest_stride)
-
-|vp9_idct4x4_16_add_neon| PROC
-
- ; The 2D transform is done with two passes, which are actually pretty
- ; similar. We first transform the rows by transposing the inputs, doing
- ; a SIMD column transform (the columns are the transposed rows) and then
- ; transposing the results back into row positions. Then we transform the
- ; columns by doing another SIMD column transform.
- ; So: two passes, each a transpose followed by a column transform.
- ; (a scalar sketch of the column transform follows this file)
-
- ; load the inputs into q8-q9, d16-d19
- vld1.s16 {q8,q9}, [r0]!
-
- ; generate scalar constants
- ; cospi_8_64 = 15137 = 0x3b21
- mov r0, #0x3b00
- add r0, #0x21
- ; cospi_16_64 = 11585 = 0x2d41
- mov r3, #0x2d00
- add r3, #0x41
- ; cospi_24_64 = 6270 = 0x187e
- mov r12, #0x1800
- add r12, #0x7e
-
- ; transpose the input data
- ; 00 01 02 03 d16
- ; 10 11 12 13 d17
- ; 20 21 22 23 d18
- ; 30 31 32 33 d19
- vtrn.16 d16, d17
- vtrn.16 d18, d19
-
- ; generate constant vectors
- vdup.16 d20, r0 ; replicate cospi_8_64
- vdup.16 d21, r3 ; replicate cospi_16_64
-
- ; 00 10 02 12 d16
- ; 01 11 03 13 d17
- ; 20 30 22 32 d18
- ; 21 31 23 33 d19
- vtrn.32 q8, q9
- ; 00 10 20 30 d16
- ; 01 11 21 31 d17
- ; 02 12 22 32 d18
- ; 03 13 23 33 d19
-
- vdup.16 d22, r12 ; replicate cospi_24_64
-
- ; do the transform on transposed rows
-
- ; stage 1
- vadd.s16 d23, d16, d18 ; (input[0] + input[2])
- vsub.s16 d24, d16, d18 ; (input[0] - input[2])
-
- vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64
- vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64
-
- ; (input[0] + input[2]) * cospi_16_64;
- ; (input[0] - input[2]) * cospi_16_64;
- vmull.s16 q13, d23, d21
- vmull.s16 q14, d24, d21
-
- ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
- ; input[1] * cospi_8_64 + input[3] * cospi_24_64;
- vmlsl.s16 q15, d19, d20
- vmlal.s16 q1, d19, d22
-
- ; dct_const_round_shift
- vqrshrn.s32 d26, q13, #14
- vqrshrn.s32 d27, q14, #14
- vqrshrn.s32 d29, q15, #14
- vqrshrn.s32 d28, q1, #14
-
- ; stage 2
- ; output[0] = step[0] + step[3];
- ; output[1] = step[1] + step[2];
- ; output[3] = step[0] - step[3];
- ; output[2] = step[1] - step[2];
- vadd.s16 q8, q13, q14
- vsub.s16 q9, q13, q14
- vswp d18, d19
-
- ; transpose the results
- ; 00 01 02 03 d16
- ; 10 11 12 13 d17
- ; 20 21 22 23 d18
- ; 30 31 32 33 d19
- vtrn.16 d16, d17
- vtrn.16 d18, d19
- ; 00 10 02 12 d16
- ; 01 11 03 13 d17
- ; 20 30 22 32 d18
- ; 21 31 23 33 d19
- vtrn.32 q8, q9
- ; 00 10 20 30 d16
- ; 01 11 21 31 d17
- ; 02 12 22 32 d18
- ; 03 13 23 33 d19
-
- ; do the transform on columns
-
- ; stage 1
- vadd.s16 d23, d16, d18 ; (input[0] + input[2])
- vsub.s16 d24, d16, d18 ; (input[0] - input[2])
-
- vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64
- vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64
-
- ; (input[0] + input[2]) * cospi_16_64;
- ; (input[0] - input[2]) * cospi_16_64;
- vmull.s16 q13, d23, d21
- vmull.s16 q14, d24, d21
-
- ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
- ; input[1] * cospi_8_64 + input[3] * cospi_24_64;
- vmlsl.s16 q15, d19, d20
- vmlal.s16 q1, d19, d22
-
- ; dct_const_round_shift
- vqrshrn.s32 d26, q13, #14
- vqrshrn.s32 d27, q14, #14
- vqrshrn.s32 d29, q15, #14
- vqrshrn.s32 d28, q1, #14
-
- ; stage 2
- ; output[0] = step[0] + step[3];
- ; output[1] = step[1] + step[2];
- ; output[3] = step[0] - step[3];
- ; output[2] = step[1] - step[2];
- vadd.s16 q8, q13, q14
- vsub.s16 q9, q13, q14
-
- ; The results are in two registers, with the halves of the second one
- ; swapped. This is handled by loading the 'dest' values in the same
- ; swapped order and storing them back in that order.
- ; temp_out[0, 1] = d16, d17 = q8
- ; temp_out[2, 3] = d19, d18 = q9 swapped
-
- ; ROUND_POWER_OF_TWO(temp_out[j], 4)
- vrshr.s16 q8, q8, #4
- vrshr.s16 q9, q9, #4
-
- vld1.32 {d26[0]}, [r1], r2
- vld1.32 {d26[1]}, [r1], r2
- vld1.32 {d27[1]}, [r1], r2
- vld1.32 {d27[0]}, [r1] ; no post-increment
-
- ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
- vaddw.u8 q8, q8, d26
- vaddw.u8 q9, q9, d27
-
- ; clip_pixel
- vqmovun.s16 d26, q8
- vqmovun.s16 d27, q9
-
- ; do the stores in reverse order with negative post-increment, by changing
- ; the sign of the stride
- rsb r2, r2, #0
- vst1.32 {d27[0]}, [r1], r2
- vst1.32 {d27[1]}, [r1], r2
- vst1.32 {d26[1]}, [r1], r2
- vst1.32 {d26[0]}, [r1] ; no post-increment
- bx lr
- ENDP ; |vp9_idct4x4_16_add_neon|
-
- END
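Per column, the stage comments above amount to the following 4-point transform, applied once to the transposed rows and once to the columns (a scalar sketch with the same constants and rounding; the vswp after the stage 2 adds/subs merely reorders out[2]/out[3] inside q9):

    #include <stdint.h>

    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
    #define dct_const_round_shift(x) ROUND_POWER_OF_TWO((x), 14)

    /* One 4-point IDCT, following the stage 1 / stage 2 comments above. */
    static void idct4_sketch(const int16_t *in, int16_t *out) {
      const int cospi_8_64 = 15137, cospi_16_64 = 11585, cospi_24_64 = 6270;
      int16_t step[4];
      step[0] = (int16_t)dct_const_round_shift((in[0] + in[2]) * cospi_16_64);
      step[1] = (int16_t)dct_const_round_shift((in[0] - in[2]) * cospi_16_64);
      step[2] = (int16_t)dct_const_round_shift(in[1] * cospi_24_64 -
                                               in[3] * cospi_8_64);
      step[3] = (int16_t)dct_const_round_shift(in[1] * cospi_8_64 +
                                               in[3] * cospi_24_64);
      out[0] = step[0] + step[3];
      out[1] = step[1] + step[2];
      out[2] = step[1] - step[2];
      out[3] = step[0] - step[3];
    }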
--- a/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm
+++ /dev/null
@@ -1,88 +1,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
-
- EXPORT |vp9_idct8x8_1_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp9_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
-; int dest_stride)
-;
-; r0 int16_t input
-; r1 uint8_t *dest
-; r2 int dest_stride)
-
-|vp9_idct8x8_1_add_neon| PROC
- ldrsh r0, [r0]
-
- ; generate cospi_16_64 = 11585
- mov r12, #0x2d00
- add r12, #0x41
-
- ; out = dct_const_round_shift(input[0] * cospi_16_64)
- mul r0, r0, r12 ; input[0] * cospi_16_64
- add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
- asr r0, r0, #14 ; >> DCT_CONST_BITS
-
- ; out = dct_const_round_shift(out * cospi_16_64)
- mul r0, r0, r12 ; out * cospi_16_64
- mov r12, r1 ; save dest
- add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
- asr r0, r0, #14 ; >> DCT_CONST_BITS
-
- ; a1 = ROUND_POWER_OF_TWO(out, 5)
- add r0, r0, #16 ; + (1 <<((5) - 1))
- asr r0, r0, #5 ; >> 5
-
- vdup.s16 q0, r0 ; duplicate a1
-
- ; load destination data
- vld1.64 {d2}, [r1], r2
- vld1.64 {d3}, [r1], r2
- vld1.64 {d4}, [r1], r2
- vld1.64 {d5}, [r1], r2
- vld1.64 {d6}, [r1], r2
- vld1.64 {d7}, [r1], r2
- vld1.64 {d16}, [r1], r2
- vld1.64 {d17}, [r1]
-
- vaddw.u8 q9, q0, d2 ; dest[x] + a1
- vaddw.u8 q10, q0, d3 ; dest[x] + a1
- vaddw.u8 q11, q0, d4 ; dest[x] + a1
- vaddw.u8 q12, q0, d5 ; dest[x] + a1
- vqmovun.s16 d2, q9 ; clip_pixel
- vqmovun.s16 d3, q10 ; clip_pixel
- vqmovun.s16 d30, q11 ; clip_pixel
- vqmovun.s16 d31, q12 ; clip_pixel
- vst1.64 {d2}, [r12], r2
- vst1.64 {d3}, [r12], r2
- vst1.64 {d30}, [r12], r2
- vst1.64 {d31}, [r12], r2
-
- vaddw.u8 q9, q0, d6 ; dest[x] + a1
- vaddw.u8 q10, q0, d7 ; dest[x] + a1
- vaddw.u8 q11, q0, d16 ; dest[x] + a1
- vaddw.u8 q12, q0, d17 ; dest[x] + a1
- vqmovun.s16 d2, q9 ; clip_pixel
- vqmovun.s16 d3, q10 ; clip_pixel
- vqmovun.s16 d30, q11 ; clip_pixel
- vqmovun.s16 d31, q12 ; clip_pixel
- vst1.64 {d2}, [r12], r2
- vst1.64 {d3}, [r12], r2
- vst1.64 {d30}, [r12], r2
- vst1.64 {d31}, [r12], r2
-
- bx lr
- ENDP ; |vp9_idct8x8_1_add_neon|
-
- END
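The DC-only 8x8 path above is the 4x4 one with a wider add loop and a rounding shift of 5 instead of 4 (the shift appears to grow by one with each doubling of the transform size). The macro the comments reference:

    /* ROUND_POWER_OF_TWO as used in the comments above;
     * here a1 = ROUND_POWER_OF_TWO(out, 5). */
    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))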
--- a/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
+++ /dev/null
@@ -1,519 +1,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- EXPORT |vp9_idct8x8_64_add_neon|
- EXPORT |vp9_idct8x8_10_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
- ; Parallel 1D IDCT on all the columns of an 8x8 16-bit data matrix which is
- ; loaded in q8-q15. The output will be stored back into q8-q15 registers.
- ; This macro will touch q0-q7 registers and use them as a buffer during
- ; calculation.
- MACRO
- IDCT8x8_1D
- ; stage 1
- vdup.16 d0, r3 ; duplicate cospi_28_64
- vdup.16 d1, r4 ; duplicate cospi_4_64
- vdup.16 d2, r5 ; duplicate cospi_12_64
- vdup.16 d3, r6 ; duplicate cospi_20_64
-
- ; input[1] * cospi_28_64
- vmull.s16 q2, d18, d0
- vmull.s16 q3, d19, d0
-
- ; input[5] * cospi_12_64
- vmull.s16 q5, d26, d2
- vmull.s16 q6, d27, d2
-
- ; input[1]*cospi_28_64-input[7]*cospi_4_64
- vmlsl.s16 q2, d30, d1
- vmlsl.s16 q3, d31, d1
-
- ; input[5] * cospi_12_64 - input[3] * cospi_20_64
- vmlsl.s16 q5, d22, d3
- vmlsl.s16 q6, d23, d3
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d8, q2, #14 ; >> 14
- vqrshrn.s32 d9, q3, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d10, q5, #14 ; >> 14
- vqrshrn.s32 d11, q6, #14 ; >> 14
-
- ; input[1] * cospi_4_64
- vmull.s16 q2, d18, d1
- vmull.s16 q3, d19, d1
-
- ; input[5] * cospi_20_64
- vmull.s16 q9, d26, d3
- vmull.s16 q13, d27, d3
-
- ; input[1]*cospi_4_64+input[7]*cospi_28_64
- vmlal.s16 q2, d30, d0
- vmlal.s16 q3, d31, d0
-
- ; input[5] * cospi_20_64 + input[3] * cospi_12_64
- vmlal.s16 q9, d22, d2
- vmlal.s16 q13, d23, d2
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d14, q2, #14 ; >> 14
- vqrshrn.s32 d15, q3, #14 ; >> 14
-
- ; stage 2 & stage 3 - even half
- vdup.16 d0, r7 ; duplicate cospi_16_64
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d12, q9, #14 ; >> 14
- vqrshrn.s32 d13, q13, #14 ; >> 14
-
- ; input[0] * cospi_16_64
- vmull.s16 q2, d16, d0
- vmull.s16 q3, d17, d0
-
- ; input[0] * cospi_16_64
- vmull.s16 q13, d16, d0
- vmull.s16 q15, d17, d0
-
- ; (input[0] + input[2]) * cospi_16_64
- vmlal.s16 q2, d24, d0
- vmlal.s16 q3, d25, d0
-
- ; (input[0] - input[2]) * cospi_16_64
- vmlsl.s16 q13, d24, d0
- vmlsl.s16 q15, d25, d0
-
- vdup.16 d0, r8 ; duplicate cospi_24_64
- vdup.16 d1, r9 ; duplicate cospi_8_64
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d18, q2, #14 ; >> 14
- vqrshrn.s32 d19, q3, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d22, q13, #14 ; >> 14
- vqrshrn.s32 d23, q15, #14 ; >> 14
-
- ; input[1] * cospi_24_64 - input[3] * cospi_8_64
- ; input[1] * cospi_24_64
- vmull.s16 q2, d20, d0
- vmull.s16 q3, d21, d0
-
- ; input[1] * cospi_8_64
- vmull.s16 q8, d20, d1
- vmull.s16 q12, d21, d1
-
- ; input[1] * cospi_24_64 - input[3] * cospi_8_64
- vmlsl.s16 q2, d28, d1
- vmlsl.s16 q3, d29, d1
-
- ; input[1] * cospi_8_64 + input[3] * cospi_24_64
- vmlal.s16 q8, d28, d0
- vmlal.s16 q12, d29, d0
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d26, q2, #14 ; >> 14
- vqrshrn.s32 d27, q3, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d30, q8, #14 ; >> 14
- vqrshrn.s32 d31, q12, #14 ; >> 14
-
- vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3]
- vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2]
- vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2]
- vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3]
-
- ; stage 3 - odd half
- vdup.16 d16, r7 ; duplicate cospi_16_64
-
- ; stage 2 - odd half
- vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]
- vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]
- vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]
- vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7]
-
- ; step2[6] * cospi_16_64
- vmull.s16 q9, d28, d16
- vmull.s16 q10, d29, d16
-
- ; step2[6] * cospi_16_64
- vmull.s16 q11, d28, d16
- vmull.s16 q12, d29, d16
-
- ; (step2[6] - step2[5]) * cospi_16_64
- vmlsl.s16 q9, d26, d16
- vmlsl.s16 q10, d27, d16
-
- ; (step2[5] + step2[6]) * cospi_16_64
- vmlal.s16 q11, d26, d16
- vmlal.s16 q12, d27, d16
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d10, q9, #14 ; >> 14
- vqrshrn.s32 d11, q10, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d12, q11, #14 ; >> 14
- vqrshrn.s32 d13, q12, #14 ; >> 14
-
- ; stage 4
- vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];
- vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6];
- vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5];
- vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4];
- vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4];
- vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5];
- vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6];
- vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7];
- MEND
-
- ; Transpose an 8x8 16-bit data matrix. Data is loaded in q8-q15.
- MACRO
- TRANSPOSE8X8
- vswp d17, d24
- vswp d23, d30
- vswp d21, d28
- vswp d19, d26
- vtrn.32 q8, q10
- vtrn.32 q9, q11
- vtrn.32 q12, q14
- vtrn.32 q13, q15
- vtrn.16 q8, q9
- vtrn.16 q10, q11
- vtrn.16 q12, q13
- vtrn.16 q14, q15
- MEND
-
- AREA Block, CODE, READONLY ; name this block of code
-;void vp9_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
-;
-; r0 int16_t input
-; r1 uint8_t *dest
-; r2 int dest_stride)
-
-|vp9_idct8x8_64_add_neon| PROC
- push {r4-r9}
- vpush {d8-d15}
- vld1.s16 {q8,q9}, [r0]!
- vld1.s16 {q10,q11}, [r0]!
- vld1.s16 {q12,q13}, [r0]!
- vld1.s16 {q14,q15}, [r0]!
-
- ; transpose the input data
- TRANSPOSE8X8
-
- ; generate cospi_28_64 = 3196
- mov r3, #0x0c00
- add r3, #0x7c
-
- ; generate cospi_4_64 = 16069
- mov r4, #0x3e00
- add r4, #0xc5
-
- ; generate cospi_12_64 = 13623
- mov r5, #0x3500
- add r5, #0x37
-
- ; generate cospi_20_64 = 9102
- mov r6, #0x2300
- add r6, #0x8e
-
- ; generate cospi_16_64 = 11585
- mov r7, #0x2d00
- add r7, #0x41
-
- ; generate cospi_24_64 = 6270
- mov r8, #0x1800
- add r8, #0x7e
-
- ; generate cospi_8_64 = 15137
- mov r9, #0x3b00
- add r9, #0x21
-
- ; First transform rows
- IDCT8x8_1D
-
- ; Transpose the matrix
- TRANSPOSE8X8
-
- ; Then transform columns
- IDCT8x8_1D
-
- ; ROUND_POWER_OF_TWO(temp_out[j], 5)
- vrshr.s16 q8, q8, #5
- vrshr.s16 q9, q9, #5
- vrshr.s16 q10, q10, #5
- vrshr.s16 q11, q11, #5
- vrshr.s16 q12, q12, #5
- vrshr.s16 q13, q13, #5
- vrshr.s16 q14, q14, #5
- vrshr.s16 q15, q15, #5
-
- ; save dest pointer
- mov r0, r1
-
- ; load destination data
- vld1.64 {d0}, [r1], r2
- vld1.64 {d1}, [r1], r2
- vld1.64 {d2}, [r1], r2
- vld1.64 {d3}, [r1], r2
- vld1.64 {d4}, [r1], r2
- vld1.64 {d5}, [r1], r2
- vld1.64 {d6}, [r1], r2
- vld1.64 {d7}, [r1]
-
- ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
- vaddw.u8 q8, q8, d0
- vaddw.u8 q9, q9, d1
- vaddw.u8 q10, q10, d2
- vaddw.u8 q11, q11, d3
- vaddw.u8 q12, q12, d4
- vaddw.u8 q13, q13, d5
- vaddw.u8 q14, q14, d6
- vaddw.u8 q15, q15, d7
-
- ; clip_pixel
- vqmovun.s16 d0, q8
- vqmovun.s16 d1, q9
- vqmovun.s16 d2, q10
- vqmovun.s16 d3, q11
- vqmovun.s16 d4, q12
- vqmovun.s16 d5, q13
- vqmovun.s16 d6, q14
- vqmovun.s16 d7, q15
-
- ; store the data
- vst1.64 {d0}, [r0], r2
- vst1.64 {d1}, [r0], r2
- vst1.64 {d2}, [r0], r2
- vst1.64 {d3}, [r0], r2
- vst1.64 {d4}, [r0], r2
- vst1.64 {d5}, [r0], r2
- vst1.64 {d6}, [r0], r2
- vst1.64 {d7}, [r0], r2
-
- vpop {d8-d15}
- pop {r4-r9}
- bx lr
- ENDP ; |vp9_idct8x8_64_add_neon|
-
-;void vp9_idct8x8_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
-;
-; r0 int16_t input
-; r1 uint8_t *dest
-; r2 int dest_stride)
-
-|vp9_idct8x8_10_add_neon| PROC
- push {r4-r9}
- vpush {d8-d15}
- vld1.s16 {q8,q9}, [r0]!
- vld1.s16 {q10,q11}, [r0]!
- vld1.s16 {q12,q13}, [r0]!
- vld1.s16 {q14,q15}, [r0]!
-
- ; transpose the input data
- TRANSPOSE8X8
-
- ; generate cospi_28_64 = 3196
- mov r3, #0x0c00
- add r3, #0x7c
-
- ; generate cospi_4_64 = 16069
- mov r4, #0x3e00
- add r4, #0xc5
-
- ; generate cospi_12_64 = 13623
- mov r5, #0x3500
- add r5, #0x37
-
- ; generate cospi_20_64 = 9102
- mov r6, #0x2300
- add r6, #0x8e
-
- ; generate cospi_16_64 = 11585
- mov r7, #0x2d00
- add r7, #0x41
-
- ; generate cospi_24_64 = 6270
- mov r8, #0x1800
- add r8, #0x7e
-
- ; generate cospi_8_64 = 15137
- mov r9, #0x3b00
- add r9, #0x21
-
- ; First transform rows
- ; stage 1
- ; The following instructions use vqrdmulh to do the
- ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh does a doubling
- ; multiply and shifts the result right by 16 bits instead of 14. So we
- ; need to double the constants before multiplying to compensate for this
- ; (a scalar model of this trick follows this file).
- mov r12, r3, lsl #1
- vdup.16 q0, r12 ; duplicate cospi_28_64*2
- mov r12, r4, lsl #1
- vdup.16 q1, r12 ; duplicate cospi_4_64*2
-
- ; dct_const_round_shift(input[1] * cospi_28_64)
- vqrdmulh.s16 q4, q9, q0
-
- mov r12, r6, lsl #1
- rsb r12, #0
- vdup.16 q0, r12 ; duplicate -cospi_20_64*2
-
- ; dct_const_round_shift(input[1] * cospi_4_64)
- vqrdmulh.s16 q7, q9, q1
-
- mov r12, r5, lsl #1
- vdup.16 q1, r12 ; duplicate cospi_12_64*2
-
- ; dct_const_round_shift(- input[3] * cospi_20_64)
- vqrdmulh.s16 q5, q11, q0
-
- mov r12, r7, lsl #1
- vdup.16 q0, r12 ; duplicate cospi_16_64*2
-
- ; dct_const_round_shift(input[3] * cospi_12_64)
- vqrdmulh.s16 q6, q11, q1
-
- ; stage 2 & stage 3 - even half
- mov r12, r8, lsl #1
- vdup.16 q1, r12 ; duplicate cospi_24_64*2
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrdmulh.s16 q9, q8, q0
-
- mov r12, r9, lsl #1
- vdup.16 q0, r12 ; duplicate cospi_8_64*2
-
- ; dct_const_round_shift(input[1] * cospi_24_64)
- vqrdmulh.s16 q13, q10, q1
-
- ; dct_const_round_shift(input[1] * cospi_8_64)
- vqrdmulh.s16 q15, q10, q0
-
- ; stage 3 - odd half
- vdup.16 d16, r7 ; duplicate cospi_16_64
-
- vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3]
- vadd.s16 q1, q9, q13 ; output[1] = step[1] + step[2]
- vsub.s16 q2, q9, q13 ; output[2] = step[1] - step[2]
- vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3]
-
- ; stage 2 - odd half
- vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]
- vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]
- vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]
- vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7]
-
- ; step2[6] * cospi_16_64
- vmull.s16 q9, d28, d16
- vmull.s16 q10, d29, d16
-
- ; step2[6] * cospi_16_64
- vmull.s16 q11, d28, d16
- vmull.s16 q12, d29, d16
-
- ; (step2[6] - step2[5]) * cospi_16_64
- vmlsl.s16 q9, d26, d16
- vmlsl.s16 q10, d27, d16
-
- ; (step2[5] + step2[6]) * cospi_16_64
- vmlal.s16 q11, d26, d16
- vmlal.s16 q12, d27, d16
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d10, q9, #14 ; >> 14
- vqrshrn.s32 d11, q10, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d12, q11, #14 ; >> 14
- vqrshrn.s32 d13, q12, #14 ; >> 14
-
- ; stage 4
- vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];
- vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6];
- vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5];
- vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4];
- vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4];
- vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5];
- vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6];
- vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7];
-
- ; Transpose the matrix
- TRANSPOSE8X8
-
- ; Then transform columns
- IDCT8x8_1D
-
- ; ROUND_POWER_OF_TWO(temp_out[j], 5)
- vrshr.s16 q8, q8, #5
- vrshr.s16 q9, q9, #5
- vrshr.s16 q10, q10, #5
- vrshr.s16 q11, q11, #5
- vrshr.s16 q12, q12, #5
- vrshr.s16 q13, q13, #5
- vrshr.s16 q14, q14, #5
- vrshr.s16 q15, q15, #5
-
- ; save dest pointer
- mov r0, r1
-
- ; load destination data
- vld1.64 {d0}, [r1], r2
- vld1.64 {d1}, [r1], r2
- vld1.64 {d2}, [r1], r2
- vld1.64 {d3}, [r1], r2
- vld1.64 {d4}, [r1], r2
- vld1.64 {d5}, [r1], r2
- vld1.64 {d6}, [r1], r2
- vld1.64 {d7}, [r1]
-
- ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
- vaddw.u8 q8, q8, d0
- vaddw.u8 q9, q9, d1
- vaddw.u8 q10, q10, d2
- vaddw.u8 q11, q11, d3
- vaddw.u8 q12, q12, d4
- vaddw.u8 q13, q13, d5
- vaddw.u8 q14, q14, d6
- vaddw.u8 q15, q15, d7
-
- ; clip_pixel
- vqmovun.s16 d0, q8
- vqmovun.s16 d1, q9
- vqmovun.s16 d2, q10
- vqmovun.s16 d3, q11
- vqmovun.s16 d4, q12
- vqmovun.s16 d5, q13
- vqmovun.s16 d6, q14
- vqmovun.s16 d7, q15
-
- ; store the data
- vst1.64 {d0}, [r0], r2
- vst1.64 {d1}, [r0], r2
- vst1.64 {d2}, [r0], r2
- vst1.64 {d3}, [r0], r2
- vst1.64 {d4}, [r0], r2
- vst1.64 {d5}, [r0], r2
- vst1.64 {d6}, [r0], r2
- vst1.64 {d7}, [r0], r2
-
- vpop {d8-d15}
- pop {r4-r9}
- bx lr
- ENDP ; |vp9_idct8x8_10_add_neon|
-
- END
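The doubled-constant trick in vp9_idct8x8_10_add_neon above can be modeled in scalar C: vqrdmulh computes a saturating (2*a*b + 2^15) >> 16, so feeding it a doubled constant yields exactly the 14-bit dct_const_round_shift. A sketch (all cospi constants are below 2^14, so their doubled values, including the negated -cospi_20_64*2, still fit in int16_t):

    #include <stdint.h>

    /* Scalar model of VQRDMULH.S16: saturating (2*a*b + (1 << 15)) >> 16. */
    static int16_t vqrdmulh_s16(int16_t a, int16_t b) {
      int64_t p = ((int64_t)2 * a * b + (1 << 15)) >> 16;
      if (p > INT16_MAX) p = INT16_MAX;
      if (p < INT16_MIN) p = INT16_MIN;
      return (int16_t)p;
    }

    /* With a doubled constant the 16-bit shift becomes the 14-bit one:
     *   (2*x*(2*c) + 2^15) >> 16  ==  (x*c + 2^13) >> 14 */
    static int16_t round_shift_mul(int16_t x, int16_t c) {
      return vqrdmulh_s16(x, (int16_t)(2 * c));
    }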
--- a/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm
+++ /dev/null
@@ -1,237 +1,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- EXPORT |vp9_iht4x4_16_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
- ; Parallel 1D IDCT on all the columns of a 4x4 16-bit data matrix which is
- ; loaded in d16-d19. d0 must contain cospi_8_64. d1 must contain
- ; cospi_16_64. d2 must contain cospi_24_64. The output will be stored back
- ; into d16-d19 registers. This macro will touch q10-q15 registers and use
- ; them as a buffer during calculation.
- MACRO
- IDCT4x4_1D
- ; stage 1
- vadd.s16 d23, d16, d18 ; (input[0] + input[2])
- vsub.s16 d24, d16, d18 ; (input[0] - input[2])
-
- vmull.s16 q15, d17, d2 ; input[1] * cospi_24_64
- vmull.s16 q10, d17, d0 ; input[1] * cospi_8_64
- vmull.s16 q13, d23, d1 ; (input[0] + input[2]) * cospi_16_64
- vmull.s16 q14, d24, d1 ; (input[0] - input[2]) * cospi_16_64
- vmlsl.s16 q15, d19, d0 ; input[1] * cospi_24_64 - input[3] * cospi_8_64
- vmlal.s16 q10, d19, d2 ; input[1] * cospi_8_64 + input[3] * cospi_24_64
-
- ; dct_const_round_shift
- vqrshrn.s32 d26, q13, #14
- vqrshrn.s32 d27, q14, #14
- vqrshrn.s32 d29, q15, #14
- vqrshrn.s32 d28, q10, #14
-
- ; stage 2
- ; output[0] = step[0] + step[3];
- ; output[1] = step[1] + step[2];
- ; output[3] = step[0] - step[3];
- ; output[2] = step[1] - step[2];
- vadd.s16 q8, q13, q14
- vsub.s16 q9, q13, q14
- vswp d18, d19
- MEND
-
- ; Parallel 1D IADST on all the columns of a 4x4 16-bit data matrix which is
- ; loaded in d16-d19. d3 must contain sinpi_1_9. d4 must contain sinpi_2_9.
- ; d5 must contain sinpi_4_9. d6 must contain sinpi_3_9. The output will be
- ; stored back into d16-d19 registers. This macro will touch q11-q15
- ; registers and use them as a buffer during calculation.
- MACRO
- IADST4x4_1D
- vmull.s16 q10, d3, d16 ; s0 = sinpi_1_9 * x0
- vmull.s16 q11, d4, d16 ; s1 = sinpi_2_9 * x0
- vmull.s16 q12, d6, d17 ; s2 = sinpi_3_9 * x1
- vmull.s16 q13, d5, d18 ; s3 = sinpi_4_9 * x2
- vmull.s16 q14, d3, d18 ; s4 = sinpi_1_9 * x2
- vmovl.s16 q15, d16 ; expand x0 from 16 bit to 32 bit
- vaddw.s16 q15, q15, d19 ; x0 + x3
- vmull.s16 q8, d4, d19 ; s5 = sinpi_2_9 * x3
- vsubw.s16 q15, q15, d18 ; s7 = x0 + x3 - x2
- vmull.s16 q9, d5, d19 ; s6 = sinpi_4_9 * x3
-
- vadd.s32 q10, q10, q13 ; x0 = s0 + s3 + s5
- vadd.s32 q10, q10, q8
- vsub.s32 q11, q11, q14 ; x1 = s1 - s4 - s6
- vdup.32 q8, r0 ; duplicate sinpi_3_9
- vsub.s32 q11, q11, q9
- vmul.s32 q15, q15, q8 ; x2 = sinpi_3_9 * s7
-
- vadd.s32 q13, q10, q12 ; s0 = x0 + x3
- vadd.s32 q10, q10, q11 ; x0 + x1
- vadd.s32 q14, q11, q12 ; s1 = x1 + x3
- vsub.s32 q10, q10, q12 ; s3 = x0 + x1 - x3
-
- ; dct_const_round_shift
- vqrshrn.s32 d16, q13, #14
- vqrshrn.s32 d17, q14, #14
- vqrshrn.s32 d18, q15, #14
- vqrshrn.s32 d19, q10, #14
- MEND
-
- ; Generate cosine constants in d0 - d2 for the IDCT
- MACRO
- GENERATE_COSINE_CONSTANTS
- ; cospi_8_64 = 15137 = 0x3b21
- mov r0, #0x3b00
- add r0, #0x21
- ; cospi_16_64 = 11585 = 0x2d41
- mov r3, #0x2d00
- add r3, #0x41
- ; cospi_24_64 = 6270 = 0x187e
- mov r12, #0x1800
- add r12, #0x7e
-
- ; generate constant vectors
- vdup.16 d0, r0 ; duplicate cospi_8_64
- vdup.16 d1, r3 ; duplicate cospi_16_64
- vdup.16 d2, r12 ; duplicate cospi_24_64
- MEND
-
- ; Generate sine constants in d3 - d7 for the IADST.
- MACRO
- GENERATE_SINE_CONSTANTS
- ; sinpi_1_9 = 5283 = 0x14A3
- mov r0, #0x1400
- add r0, #0xa3
- ; sinpi_2_9 = 9929 = 0x26C9
- mov r3, #0x2600
- add r3, #0xc9
- ; sinpi_4_9 = 15212 = 0x3B6C
- mov r12, #0x3b00
- add r12, #0x6c
-
- ; generate constant vectors
- vdup.16 d3, r0 ; duplicate sinpi_1_9
-
- ; sinpi_3_9 = 13377 = 0x3441
- mov r0, #0x3400
- add r0, #0x41
-
- vdup.16 d4, r3 ; duplicate sinpi_2_9
- vdup.16 d5, r12 ; duplicate sinpi_4_9
- vdup.16 q3, r0 ; duplicate sinpi_3_9
- MEND
-
- ; Transpose a 4x4 16-bit data matrix. Data is loaded in d16-d19.
- MACRO
- TRANSPOSE4X4
- vtrn.16 d16, d17
- vtrn.16 d18, d19
- vtrn.32 q8, q9
- MEND
-
- AREA Block, CODE, READONLY ; name this block of code
-;void vp9_iht4x4_16_add_neon(int16_t *input, uint8_t *dest,
-; int dest_stride, int tx_type)
-;
-; r0 int16_t input
-; r1 uint8_t *dest
-; r2 int dest_stride
-; r3 int tx_type)
-; This function only handles tx_type values 1, 2 and 3.
-|vp9_iht4x4_16_add_neon| PROC
-
- ; load the inputs into d16-d19
- vld1.s16 {q8,q9}, [r0]!
-
- ; transpose the input data
- TRANSPOSE4X4
-
- ; decide the type of transform
- cmp r3, #2
- beq idct_iadst
- cmp r3, #3
- beq iadst_iadst
-
-iadst_idct
- ; generate constants
- GENERATE_COSINE_CONSTANTS
- GENERATE_SINE_CONSTANTS
-
- ; first transform rows
- IDCT4x4_1D
-
- ; transpose the matrix
- TRANSPOSE4X4
-
- ; then transform columns
- IADST4x4_1D
-
- b end_vp9_iht4x4_16_add_neon
-
-idct_iadst
- ; generate constants
- GENERATE_COSINE_CONSTANTS
- GENERATE_SINE_CONSTANTS
-
- ; first transform rows
- IADST4x4_1D
-
- ; transpose the matrix
- TRANSPOSE4X4
-
- ; then transform columns
- IDCT4x4_1D
-
- b end_vp9_iht4x4_16_add_neon
-
-iadst_iadst
- ; generate constants
- GENERATE_SINE_CONSTANTS
-
- ; first transform rows
- IADST4x4_1D
-
- ; transpose the matrix
- TRANSPOSE4X4
-
- ; then transform columns
- IADST4x4_1D
-
-end_vp9_iht4x4_16_add_neon
- ; ROUND_POWER_OF_TWO(temp_out[j], 4)
- vrshr.s16 q8, q8, #4
- vrshr.s16 q9, q9, #4
-
- vld1.32 {d26[0]}, [r1], r2
- vld1.32 {d26[1]}, [r1], r2
- vld1.32 {d27[0]}, [r1], r2
- vld1.32 {d27[1]}, [r1]
-
- ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
- vaddw.u8 q8, q8, d26
- vaddw.u8 q9, q9, d27
-
- ; clip_pixel
- vqmovun.s16 d26, q8
- vqmovun.s16 d27, q9
-
- ; do the stores in reverse order with negative post-increment, by changing
- ; the sign of the stride
- rsb r2, r2, #0
- vst1.32 {d27[1]}, [r1], r2
- vst1.32 {d27[0]}, [r1], r2
- vst1.32 {d26[1]}, [r1], r2
- vst1.32 {d26[0]}, [r1] ; no post-increment
- bx lr
- ENDP ; |vp9_iht4x4_16_add_neon|
-
- END
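For reference, the IADST4x4_1D macro above computes, per column, the following recurrence (a scalar transcription of its s0..s7 comments; rounding as before):

    #include <stdint.h>

    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
    #define dct_const_round_shift(x) ROUND_POWER_OF_TWO((x), 14)

    static void iadst4_sketch(const int16_t *x, int16_t *out) {
      const int sinpi_1_9 = 5283, sinpi_2_9 = 9929,
                sinpi_3_9 = 13377, sinpi_4_9 = 15212;
      int s0 = sinpi_1_9 * x[0], s1 = sinpi_2_9 * x[0];
      int s2 = sinpi_3_9 * x[1];                  /* plays the comments' x3 */
      int s3 = sinpi_4_9 * x[2], s4 = sinpi_1_9 * x[2];
      int s5 = sinpi_2_9 * x[3], s6 = sinpi_4_9 * x[3];
      int s7 = x[0] + x[3] - x[2];
      int t0 = s0 + s3 + s5;                      /* x0 = s0 + s3 + s5 */
      int t1 = s1 - s4 - s6;                      /* x1 = s1 - s4 - s6 */
      int t2 = sinpi_3_9 * s7;                    /* x2 = sinpi_3_9 * s7 */
      out[0] = (int16_t)dct_const_round_shift(t0 + s2);       /* x0 + x3 */
      out[1] = (int16_t)dct_const_round_shift(t1 + s2);       /* x1 + x3 */
      out[2] = (int16_t)dct_const_round_shift(t2);
      out[3] = (int16_t)dct_const_round_shift(t0 + t1 - s2);  /* x0+x1-x3 */
    }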
--- a/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm
+++ /dev/null
@@ -1,696 +1,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- EXPORT |vp9_iht8x8_64_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
- ; Generate IADST constants in r0 - r12 for the IADST.
- MACRO
- GENERATE_IADST_CONSTANTS
- ; generate cospi_2_64 = 16305
- mov r0, #0x3f00
- add r0, #0xb1
-
- ; generate cospi_30_64 = 1606
- mov r1, #0x600
- add r1, #0x46
-
- ; generate cospi_10_64 = 14449
- mov r2, #0x3800
- add r2, #0x71
-
- ; generate cospi_22_64 = 7723
- mov r3, #0x1e00
- add r3, #0x2b
-
- ; generate cospi_18_64 = 10394
- mov r4, #0x2800
- add r4, #0x9a
-
- ; generate cospi_14_64 = 12665
- mov r5, #0x3100
- add r5, #0x79
-
- ; generate cospi_26_64 = 4756
- mov r6, #0x1200
- add r6, #0x94
-
- ; generate cospi_6_64 = 15679
- mov r7, #0x3d00
- add r7, #0x3f
-
- ; generate cospi_8_64 = 15137
- mov r8, #0x3b00
- add r8, #0x21
-
- ; generate cospi_24_64 = 6270
- mov r9, #0x1800
- add r9, #0x7e
-
- ; generate 0
- mov r10, #0
-
- ; generate cospi_16_64 = 11585
- mov r12, #0x2d00
- add r12, #0x41
- MEND
-
- ; Generate IDCT constants in r3 - r9 for the IDCT.
- MACRO
- GENERATE_IDCT_CONSTANTS
- ; generate cospi_28_64 = 3196
- mov r3, #0x0c00
- add r3, #0x7c
-
- ; generate cospi_4_64 = 16069
- mov r4, #0x3e00
- add r4, #0xc5
-
- ; generate cospi_12_64 = 13623
- mov r5, #0x3500
- add r5, #0x37
-
- ; generate cospi_20_64 = 9102
- mov r6, #0x2300
- add r6, #0x8e
-
- ; generate cospi_16_64 = 11585
- mov r7, #0x2d00
- add r7, #0x41
-
- ; generate cospi_24_64 = 6270
- mov r8, #0x1800
- add r8, #0x7e
-
- ; generate cospi_8_64 = 15137
- mov r9, #0x3b00
- add r9, #0x21
- MEND
-
- ; Transpose an 8x8 16-bit data matrix. Data is loaded in q8-q15.
- MACRO
- TRANSPOSE8X8
- vswp d17, d24
- vswp d23, d30
- vswp d21, d28
- vswp d19, d26
- vtrn.32 q8, q10
- vtrn.32 q9, q11
- vtrn.32 q12, q14
- vtrn.32 q13, q15
- vtrn.16 q8, q9
- vtrn.16 q10, q11
- vtrn.16 q12, q13
- vtrn.16 q14, q15
- MEND
-
- ; Parallel 1D IDCT on all the columns of an 8x8 16-bit data matrix which is
- ; loaded in q8-q15. The IDCT constants are loaded in r3 - r9. The output
- ; will be stored back into q8-q15 registers. This macro will touch q0-q7
- ; registers and use them as a buffer during calculation.
- MACRO
- IDCT8x8_1D
- ; stage 1
- vdup.16 d0, r3 ; duplicate cospi_28_64
- vdup.16 d1, r4 ; duplicate cospi_4_64
- vdup.16 d2, r5 ; duplicate cospi_12_64
- vdup.16 d3, r6 ; duplicate cospi_20_64
-
- ; input[1] * cospi_28_64
- vmull.s16 q2, d18, d0
- vmull.s16 q3, d19, d0
-
- ; input[5] * cospi_12_64
- vmull.s16 q5, d26, d2
- vmull.s16 q6, d27, d2
-
- ; input[1]*cospi_28_64-input[7]*cospi_4_64
- vmlsl.s16 q2, d30, d1
- vmlsl.s16 q3, d31, d1
-
- ; input[5] * cospi_12_64 - input[3] * cospi_20_64
- vmlsl.s16 q5, d22, d3
- vmlsl.s16 q6, d23, d3
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d8, q2, #14 ; >> 14
- vqrshrn.s32 d9, q3, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d10, q5, #14 ; >> 14
- vqrshrn.s32 d11, q6, #14 ; >> 14
-
- ; input[1] * cospi_4_64
- vmull.s16 q2, d18, d1
- vmull.s16 q3, d19, d1
-
- ; input[5] * cospi_20_64
- vmull.s16 q9, d26, d3
- vmull.s16 q13, d27, d3
-
- ; input[1]*cospi_4_64+input[7]*cospi_28_64
- vmlal.s16 q2, d30, d0
- vmlal.s16 q3, d31, d0
-
- ; input[5] * cospi_20_64 + input[3] * cospi_12_64
- vmlal.s16 q9, d22, d2
- vmlal.s16 q13, d23, d2
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d14, q2, #14 ; >> 14
- vqrshrn.s32 d15, q3, #14 ; >> 14
-
- ; stage 2 & stage 3 - even half
- vdup.16 d0, r7 ; duplicate cospi_16_64
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d12, q9, #14 ; >> 14
- vqrshrn.s32 d13, q13, #14 ; >> 14
-
- ; input[0] * cospi_16_64
- vmull.s16 q2, d16, d0
- vmull.s16 q3, d17, d0
-
- ; input[0] * cospi_16_64
- vmull.s16 q13, d16, d0
- vmull.s16 q15, d17, d0
-
- ; (input[0] + input[2]) * cospi_16_64
- vmlal.s16 q2, d24, d0
- vmlal.s16 q3, d25, d0
-
- ; (input[0] - input[2]) * cospi_16_64
- vmlsl.s16 q13, d24, d0
- vmlsl.s16 q15, d25, d0
-
- vdup.16 d0, r8 ; duplicate cospi_24_64
- vdup.16 d1, r9 ; duplicate cospi_8_64
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d18, q2, #14 ; >> 14
- vqrshrn.s32 d19, q3, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d22, q13, #14 ; >> 14
- vqrshrn.s32 d23, q15, #14 ; >> 14
-
- ; input[1] * cospi_24_64
- vmull.s16 q2, d20, d0
- vmull.s16 q3, d21, d0
-
- ; input[1] * cospi_8_64
- vmull.s16 q8, d20, d1
- vmull.s16 q12, d21, d1
-
- ; input[1] * cospi_24_64 - input[3] * cospi_8_64
- vmlsl.s16 q2, d28, d1
- vmlsl.s16 q3, d29, d1
-
- ; input[1] * cospi_8_64 + input[3] * cospi_24_64
- vmlal.s16 q8, d28, d0
- vmlal.s16 q12, d29, d0
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d26, q2, #14 ; >> 14
- vqrshrn.s32 d27, q3, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d30, q8, #14 ; >> 14
- vqrshrn.s32 d31, q12, #14 ; >> 14
-
- vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3]
- vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2]
- vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2]
- vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3]
-
- ; stage 3 - odd half
- vdup.16 d16, r7 ; duplicate cospi_16_64
-
- ; stage 2 - odd half
- vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]
- vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]
- vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]
- vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7]
-
- ; step2[6] * cospi_16_64
- vmull.s16 q9, d28, d16
- vmull.s16 q10, d29, d16
-
- ; step2[6] * cospi_16_64
- vmull.s16 q11, d28, d16
- vmull.s16 q12, d29, d16
-
- ; (step2[6] - step2[5]) * cospi_16_64
- vmlsl.s16 q9, d26, d16
- vmlsl.s16 q10, d27, d16
-
- ; (step2[5] + step2[6]) * cospi_16_64
- vmlal.s16 q11, d26, d16
- vmlal.s16 q12, d27, d16
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d10, q9, #14 ; >> 14
- vqrshrn.s32 d11, q10, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d12, q11, #14 ; >> 14
- vqrshrn.s32 d13, q12, #14 ; >> 14
-
- ; stage 4
- vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];
- vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6];
- vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5];
- vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4];
- vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4];
- vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5];
- vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6];
- vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7];
- MEND
-
- ; Parallel 1D IADST on all the columns of an 8x8 16-bit data matrix which is
- ; loaded in q8-q15. IADST constants are loaded in r0 - r12 registers. The
- ; output will be stored back into q8-q15 registers. This macro will touch
- ; q0 - q7 registers and use them as a buffer during calculation.
- MACRO
- IADST8X8_1D
- vdup.16 d14, r0 ; duplicate cospi_2_64
- vdup.16 d15, r1 ; duplicate cospi_30_64
-
- ; cospi_2_64 * x0
- vmull.s16 q1, d30, d14
- vmull.s16 q2, d31, d14
-
- ; cospi_30_64 * x0
- vmull.s16 q3, d30, d15
- vmull.s16 q4, d31, d15
-
- vdup.16 d30, r4 ; duplicate cospi_18_64
- vdup.16 d31, r5 ; duplicate cospi_14_64
-
- ; s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
- vmlal.s16 q1, d16, d15
- vmlal.s16 q2, d17, d15
-
- ; s1 = cospi_30_64 * x0 - cospi_2_64 * x1
- vmlsl.s16 q3, d16, d14
- vmlsl.s16 q4, d17, d14
-
- ; cospi_18_64 * x4
- vmull.s16 q5, d22, d30
- vmull.s16 q6, d23, d30
-
- ; cospi_14_64 * x4
- vmull.s16 q7, d22, d31
- vmull.s16 q8, d23, d31
-
- ; s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
- vmlal.s16 q5, d24, d31
- vmlal.s16 q6, d25, d31
-
- ; s5 = cospi_14_64 * x4 - cospi_18_64 * x5
- vmlsl.s16 q7, d24, d30
- vmlsl.s16 q8, d25, d30
-
- ; (s0 + s4)
- vadd.s32 q11, q1, q5
- vadd.s32 q12, q2, q6
-
- vdup.16 d0, r2 ; duplicate cospi_10_64
- vdup.16 d1, r3 ; duplicate cospi_22_64
-
- ; (s0 - s4)
- vsub.s32 q1, q1, q5
- vsub.s32 q2, q2, q6
-
- ; x0 = dct_const_round_shift(s0 + s4);
- vqrshrn.s32 d22, q11, #14 ; >> 14
- vqrshrn.s32 d23, q12, #14 ; >> 14
-
- ; (s1 + s5)
- vadd.s32 q12, q3, q7
- vadd.s32 q15, q4, q8
-
- ; (s1 - s5)
- vsub.s32 q3, q3, q7
- vsub.s32 q4, q4, q8
-
- ; x4 = dct_const_round_shift(s0 - s4);
- vqrshrn.s32 d2, q1, #14 ; >> 14
- vqrshrn.s32 d3, q2, #14 ; >> 14
-
- ; x1 = dct_const_round_shift(s1 + s5);
- vqrshrn.s32 d24, q12, #14 ; >> 14
- vqrshrn.s32 d25, q15, #14 ; >> 14
-
- ; x5 = dct_const_round_shift(s1 - s5);
- vqrshrn.s32 d6, q3, #14 ; >> 14
- vqrshrn.s32 d7, q4, #14 ; >> 14
-
- ; cospi_10_64 * x2
- vmull.s16 q4, d26, d0
- vmull.s16 q5, d27, d0
-
- ; cospi_22_64 * x2
- vmull.s16 q2, d26, d1
- vmull.s16 q6, d27, d1
-
- vdup.16 d30, r6 ; duplicate cospi_26_64
- vdup.16 d31, r7 ; duplicate cospi_6_64
-
- ; s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
- vmlal.s16 q4, d20, d1
- vmlal.s16 q5, d21, d1
-
- ; s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
- vmlsl.s16 q2, d20, d0
- vmlsl.s16 q6, d21, d0
-
- ; cospi_26_64 * x6
- vmull.s16 q0, d18, d30
- vmull.s16 q13, d19, d30
-
- ; s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
- vmlal.s16 q0, d28, d31
- vmlal.s16 q13, d29, d31
-
- ; cospi_6_64 * x6
- vmull.s16 q10, d18, d31
- vmull.s16 q9, d19, d31
-
- ; s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
- vmlsl.s16 q10, d28, d30
- vmlsl.s16 q9, d29, d30
-
- ; (s3 + s7)
- vadd.s32 q14, q2, q10
- vadd.s32 q15, q6, q9
-
- ; (s3 - s7)
- vsub.s32 q2, q2, q10
- vsub.s32 q6, q6, q9
-
- ; x3 = dct_const_round_shift(s3 + s7);
- vqrshrn.s32 d28, q14, #14 ; >> 14
- vqrshrn.s32 d29, q15, #14 ; >> 14
-
- ; x7 = dct_const_round_shift(s3 - s7);
- vqrshrn.s32 d4, q2, #14 ; >> 14
- vqrshrn.s32 d5, q6, #14 ; >> 14
-
- ; (s2 + s6)
- vadd.s32 q9, q4, q0
- vadd.s32 q10, q5, q13
-
- ; (s2 - s6)
- vsub.s32 q4, q4, q0
- vsub.s32 q5, q5, q13
-
- vdup.16 d30, r8 ; duplicate cospi_8_64
- vdup.16 d31, r9 ; duplicate cospi_24_64
-
- ; x2 = dct_const_round_shift(s2 + s6);
- vqrshrn.s32 d18, q9, #14 ; >> 14
- vqrshrn.s32 d19, q10, #14 ; >> 14
-
- ; x6 = dct_const_round_shift(s2 - s6);
- vqrshrn.s32 d8, q4, #14 ; >> 14
- vqrshrn.s32 d9, q5, #14 ; >> 14
-
- ; cospi_8_64 * x4
- vmull.s16 q5, d2, d30
- vmull.s16 q6, d3, d30
-
- ; cospi_24_64 * x4
- vmull.s16 q7, d2, d31
- vmull.s16 q0, d3, d31
-
- ; s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
- vmlal.s16 q5, d6, d31
- vmlal.s16 q6, d7, d31
-
- ; s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
- vmlsl.s16 q7, d6, d30
- vmlsl.s16 q0, d7, d30
-
- ; cospi_8_64 * x7
- vmull.s16 q1, d4, d30
- vmull.s16 q3, d5, d30
-
- ; cospi_24_64 * x7
- vmull.s16 q10, d4, d31
- vmull.s16 q2, d5, d31
-
- ; s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
- vmlsl.s16 q1, d8, d31
- vmlsl.s16 q3, d9, d31
-
- ; s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
- vmlal.s16 q10, d8, d30
- vmlal.s16 q2, d9, d30
-
- vadd.s16 q8, q11, q9 ; x0 = s0 + s2;
-
- vsub.s16 q11, q11, q9 ; x2 = s0 - s2;
-
- vadd.s16 q4, q12, q14 ; x1 = s1 + s3;
-
- vsub.s16 q12, q12, q14 ; x3 = s1 - s3;
-
- ; (s4 + s6)
- vadd.s32 q14, q5, q1
- vadd.s32 q15, q6, q3
-
- ; (s4 - s6)
- vsub.s32 q5, q5, q1
- vsub.s32 q6, q6, q3
-
- ; x4 = dct_const_round_shift(s4 + s6);
- vqrshrn.s32 d18, q14, #14 ; >> 14
- vqrshrn.s32 d19, q15, #14 ; >> 14
-
- ; x6 = dct_const_round_shift(s4 - s6);
- vqrshrn.s32 d10, q5, #14 ; >> 14
- vqrshrn.s32 d11, q6, #14 ; >> 14
-
- ; (s5 + s7)
- vadd.s32 q1, q7, q10
- vadd.s32 q3, q0, q2
-
- ; (s5 - s7)
- vsub.s32 q7, q7, q10
- vsub.s32 q0, q0, q2
-
- ; x5 = dct_const_round_shift(s5 + s7);
- vqrshrn.s32 d28, q1, #14 ; >> 14
- vqrshrn.s32 d29, q3, #14 ; >> 14
-
- ; x7 = dct_const_round_shift(s5 - s7);
- vqrshrn.s32 d14, q7, #14 ; >> 14
- vqrshrn.s32 d15, q0, #14 ; >> 14
-
- vdup.16 d30, r12 ; duplicate cospi_16_64
-
- ; cospi_16_64 * x2
- vmull.s16 q2, d22, d30
- vmull.s16 q3, d23, d30
-
- ; cospi_16_64 * x2
- vmull.s16 q13, d22, d30
- vmull.s16 q1, d23, d30
-
- ; cospi_16_64 * x2 + cospi_16_64 * x3;
- vmlal.s16 q2, d24, d30
- vmlal.s16 q3, d25, d30
-
- ; cospi_16_64 * x2 - cospi_16_64 * x3;
- vmlsl.s16 q13, d24, d30
- vmlsl.s16 q1, d25, d30
-
- ; x2 = dct_const_round_shift(s2);
- vqrshrn.s32 d4, q2, #14 ; >> 14
- vqrshrn.s32 d5, q3, #14 ; >> 14
-
- ;x3 = dct_const_round_shift(s3);
- vqrshrn.s32 d24, q13, #14 ; >> 14
- vqrshrn.s32 d25, q1, #14 ; >> 14
-
- ; cospi_16_64 * x6
- vmull.s16 q13, d10, d30
- vmull.s16 q1, d11, d30
-
- ; cospi_16_64 * x6
- vmull.s16 q11, d10, d30
- vmull.s16 q0, d11, d30
-
- ; cospi_16_64 * x6 + cospi_16_64 * x7;
- vmlal.s16 q13, d14, d30
- vmlal.s16 q1, d15, d30
-
- ; cospi_16_64 * x6 - cospi_16_64 * x7;
- vmlsl.s16 q11, d14, d30
- vmlsl.s16 q0, d15, d30
-
- ; x6 = dct_const_round_shift(s6);
- vqrshrn.s32 d20, q13, #14 ; >> 14
- vqrshrn.s32 d21, q1, #14 ; >> 14
-
- ;x7 = dct_const_round_shift(s7);
- vqrshrn.s32 d12, q11, #14 ; >> 14
- vqrshrn.s32 d13, q0, #14 ; >> 14
-
- vdup.16 q5, r10 ; duplicate 0
-
- vsub.s16 q9, q5, q9 ; output[1] = -x4;
- vsub.s16 q11, q5, q2 ; output[3] = -x2;
- vsub.s16 q13, q5, q6 ; output[5] = -x7;
- vsub.s16 q15, q5, q4 ; output[7] = -x1;
- MEND
-
-
- AREA Block, CODE, READONLY ; name this block of code
-;void vp9_iht8x8_64_add_neon(int16_t *input, uint8_t *dest,
-; int dest_stride, int tx_type)
-;
-; r0 int16_t input
-; r1 uint8_t *dest
-; r2 int dest_stride
-; r3 int tx_type)
-; This function only handles tx_type values 1, 2 and 3.
-|vp9_iht8x8_64_add_neon| PROC
-
- ; load the inputs into d16-d19
- vld1.s16 {q8,q9}, [r0]!
- vld1.s16 {q10,q11}, [r0]!
- vld1.s16 {q12,q13}, [r0]!
- vld1.s16 {q14,q15}, [r0]!
-
- push {r0-r10}
-
- ; transpose the input data
- TRANSPOSE8X8
-
- ; decide the type of transform
- cmp r3, #2
- beq idct_iadst
- cmp r3, #3
- beq iadst_iadst
-
-iadst_idct
- ; generate IDCT constants
- GENERATE_IDCT_CONSTANTS
-
- ; first transform rows
- IDCT8x8_1D
-
- ; transpose the matrix
- TRANSPOSE8X8
-
- ; generate IADST constants
- GENERATE_IADST_CONSTANTS
-
- ; then transform columns
- IADST8X8_1D
-
- b end_vp9_iht8x8_64_add_neon
-
-idct_iadst
- ; generate IADST constants
- GENERATE_IADST_CONSTANTS
-
- ; first transform rows
- IADST8X8_1D
-
- ; transpose the matrix
- TRANSPOSE8X8
-
- ; generate IDCT constants
- GENERATE_IDCT_CONSTANTS
-
- ; then transform columns
- IDCT8x8_1D
-
- b end_vp9_iht8x8_64_add_neon
-
-iadst_iadst
- ; generate IADST constants
- GENERATE_IADST_CONSTANTS
-
- ; first transform rows
- IADST8X8_1D
-
- ; transpose the matrix
- TRANSPOSE8X8
-
- ; then transform columns
- IADST8X8_1D
-
-end_vp9_iht8x8_64_add_neon
- pop {r0-r10}
-
- ; ROUND_POWER_OF_TWO(temp_out[j], 5)
- vrshr.s16 q8, q8, #5
- vrshr.s16 q9, q9, #5
- vrshr.s16 q10, q10, #5
- vrshr.s16 q11, q11, #5
- vrshr.s16 q12, q12, #5
- vrshr.s16 q13, q13, #5
- vrshr.s16 q14, q14, #5
- vrshr.s16 q15, q15, #5
-
- ; save dest pointer
- mov r0, r1
-
- ; load destination data
- vld1.64 {d0}, [r1], r2
- vld1.64 {d1}, [r1], r2
- vld1.64 {d2}, [r1], r2
- vld1.64 {d3}, [r1], r2
- vld1.64 {d4}, [r1], r2
- vld1.64 {d5}, [r1], r2
- vld1.64 {d6}, [r1], r2
- vld1.64 {d7}, [r1]
-
- ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
- vaddw.u8 q8, q8, d0
- vaddw.u8 q9, q9, d1
- vaddw.u8 q10, q10, d2
- vaddw.u8 q11, q11, d3
- vaddw.u8 q12, q12, d4
- vaddw.u8 q13, q13, d5
- vaddw.u8 q14, q14, d6
- vaddw.u8 q15, q15, d7
-
- ; clip_pixel
- vqmovun.s16 d0, q8
- vqmovun.s16 d1, q9
- vqmovun.s16 d2, q10
- vqmovun.s16 d3, q11
- vqmovun.s16 d4, q12
- vqmovun.s16 d5, q13
- vqmovun.s16 d6, q14
- vqmovun.s16 d7, q15
-
- ; store the data
- vst1.64 {d0}, [r0], r2
- vst1.64 {d1}, [r0], r2
- vst1.64 {d2}, [r0], r2
- vst1.64 {d3}, [r0], r2
- vst1.64 {d4}, [r0], r2
- vst1.64 {d5}, [r0], r2
- vst1.64 {d6}, [r0], r2
- vst1.64 {d7}, [r0], r2
- bx lr
- ENDP ; |vp9_iht8x8_64_add_neon|
-
- END
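The three labels above implement the tx_type dispatch: rows then columns, each leg either the IDCT or the IADST. In C shape (the enum names follow VP9's tx_type convention and, like the callback wiring, are an assumption here; tx_type 0 is served by the plain idct8x8 functions):

    #include <stdint.h>

    typedef enum { DCT_DCT = 0, ADST_DCT = 1, DCT_ADST = 2,
                   ADST_ADST = 3 } TX_TYPE;

    typedef void (*transform_1d)(const int16_t *in, int16_t *out);

    /* Mirror of the branch structure above: pick the row and column
     * 1D transforms for one 8x8 hybrid inverse transform. */
    static void pick_iht8x8(TX_TYPE tx_type, transform_1d idct8,
                            transform_1d iadst8, transform_1d *rows,
                            transform_1d *cols) {
      switch (tx_type) {
        case ADST_DCT:  *rows = idct8;  *cols = iadst8; break; /* iadst_idct  */
        case DCT_ADST:  *rows = iadst8; *cols = idct8;  break; /* idct_iadst  */
        case ADST_ADST: *rows = iadst8; *cols = iadst8; break; /* iadst_iadst */
        default:        *rows = idct8;  *cols = idct8;  break;
      }
    }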
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -125,16 +125,16 @@
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_16_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct4x4_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct4x4_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct32x32_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct32x32_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_iht4x4_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_iht8x8_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct4x4_1_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct4x4_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct8x8_1_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct8x8_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct16x16_1_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct16x16_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct32x32_1_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct32x32_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_copy_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_avg_neon$(ASM)
--