shithub: libvpx

ref: c4e5c54d69920c07f5d421ba805da1a4c9c3e82d
parent: 3cf5c213f1dcca17c8d3e8f62fd3dbf0cba1a808
author: Linfeng Zhang <linfengz@google.com>
date: Tue Mar 7 10:29:15 EST 2017

cosmetics,dsp/arm/: vpx_idct32x32_{34,135}_add_neon()

No speed changes, and the disassembly is almost identical.

Change-Id: Id07996237d2607ca6004da5906b7d288b8307e1f

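The change is purely mechanical: the long lists of per-stage scalars (s1_16, s1_31, ...) become one small array per stage (s1[32], s2[32], ...), and every butterfly now indexes the stage array with a constant. A minimal, self-contained sketch of the pattern follows (not part of the patch; butterfly_example is a hypothetical name, and only arm_neon.h intrinsics are used):

    #include <arm_neon.h>

    /* Old style declared one scalar per intermediate value:
     *   int16x8_t s2_18, s2_19;
     *   s2_18 = vsubq_s16(b, a);
     *   s2_19 = vaddq_s16(a, b);
     * New style keeps one array per stage and indexes it with constants. */
    static void butterfly_example(const int16x8_t a, const int16x8_t b,
                                  int16x8_t *const lo, int16x8_t *const hi) {
      int16x8_t s2[32];
      s2[18] = vsubq_s16(b, a);  /* difference term of the butterfly */
      s2[19] = vaddq_s16(a, b);  /* sum term of the butterfly */
      *lo = s2[18];
      *hi = s2[19];
    }

Because the array indices are compile-time constants, the compiler can keep the elements in registers exactly as before, which is consistent with the note above that the disassembly is almost identical.
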
--- a/vpx_dsp/arm/idct32x32_135_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_135_add_neon.c
@@ -87,614 +87,578 @@
 // 13  84  93 103 110 125
 // 14  98 106 115 127
 // 15 117 128
-static void idct32_12_neon(const tran_low_t *input, int16_t *output) {
-  int16x8_t in0, in1, in2, in3, in4, in5, in6, in7;
-  int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int16x8_t in8, in9, in10, in11;
-  int16x8_t s1_16, s1_18, s1_19, s1_20, s1_21, s1_23, s1_24, s1_26, s1_27,
-      s1_28, s1_29, s1_31;
-  int16x8_t s2_8, s2_10, s2_11, s2_12, s2_13, s2_15, s2_18, s2_19, s2_20, s2_21,
-      s2_26, s2_27, s2_28, s2_29;
-  int16x8_t s3_4, s3_7, s3_10, s3_11, s3_12, s3_13, s3_17, s3_18, s3_21, s3_22,
-      s3_25, s3_26, s3_29, s3_30;
-  int16x8_t s4_0, s4_2, s4_3, s4_9, s4_10, s4_13, s4_14, s4_16, s4_17, s4_18,
-      s4_19, s4_20, s4_21, s4_22, s4_23, s4_24, s4_25, s4_26, s4_27, s4_28,
-      s4_29, s4_30, s4_31;
-  int16x8_t s5_0, s5_1, s5_2, s5_3, s5_5, s5_6, s5_8, s5_9, s5_10, s5_11, s5_12,
-      s5_13, s5_14, s5_15, s5_18, s5_19, s5_20, s5_21, s5_26, s5_27, s5_28,
-      s5_29;
-  int16x8_t s6_0, s6_1, s6_2, s6_3, s6_4, s6_5, s6_6, s6_7, s6_10, s6_11, s6_12,
-      s6_13, s6_16, s6_17, s6_18, s6_19, s6_20, s6_21, s6_22, s6_23, s6_24,
-      s6_25, s6_26, s6_27, s6_28, s6_29, s6_30, s6_31;
-  int16x8_t s7_0, s7_1, s7_2, s7_3, s7_4, s7_5, s7_6, s7_7, s7_8, s7_9, s7_10,
-      s7_11, s7_12, s7_13, s7_14, s7_15, s7_20, s7_21, s7_22, s7_23, s7_24,
-      s7_25, s7_26, s7_27;
+static void idct32_12_neon(const tran_low_t *const input, int16_t *output) {
+  int16x4_t tmp[8];
+  int16x8_t in[12], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32];
 
-  load_8x8_s16(input, &in0, &in1, &in2, &in3, &in4, &in5, &in6, &in7);
-  transpose_s16_8x8(&in0, &in1, &in2, &in3, &in4, &in5, &in6, &in7);
+  load_8x8_s16(input, &in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+               &in[7]);
+  transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+                    &in[7]);
 
-  load_4x8_s16(input + 8, &tmp0, &tmp1, &tmp2, &tmp3, &tmp4, &tmp5, &tmp6,
-               &tmp7);
-  transpose_s16_4x8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, &in8, &in9,
-                    &in10, &in11);
+  load_4x8_s16(input + 8, &tmp[0], &tmp[1], &tmp[2], &tmp[3], &tmp[4], &tmp[5],
+               &tmp[6], &tmp[7]);
+  transpose_s16_4x8(tmp[0], tmp[1], tmp[2], tmp[3], tmp[4], tmp[5], tmp[6],
+                    tmp[7], &in[8], &in[9], &in[10], &in[11]);
 
   // stage 1
-  s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64);
-  s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64);
+  s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64);
+  s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64);
 
-  s1_18 = multiply_shift_and_narrow_s16(in9, cospi_23_64);
-  s1_29 = multiply_shift_and_narrow_s16(in9, cospi_9_64);
+  s1[18] = multiply_shift_and_narrow_s16(in[9], cospi_23_64);
+  s1[29] = multiply_shift_and_narrow_s16(in[9], cospi_9_64);
 
-  s1_19 = multiply_shift_and_narrow_s16(in7, -cospi_25_64);
-  s1_28 = multiply_shift_and_narrow_s16(in7, cospi_7_64);
+  s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64);
+  s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64);
 
-  s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64);
-  s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64);
+  s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64);
+  s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64);
 
-  s1_21 = multiply_shift_and_narrow_s16(in11, -cospi_21_64);
-  s1_26 = multiply_shift_and_narrow_s16(in11, cospi_11_64);
+  s1[21] = multiply_shift_and_narrow_s16(in[11], -cospi_21_64);
+  s1[26] = multiply_shift_and_narrow_s16(in[11], cospi_11_64);
 
-  s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64);
-  s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64);
+  s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64);
+  s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64);
 
   // stage 2
-  s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64);
-  s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64);
+  s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64);
+  s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64);
 
-  s2_10 = multiply_shift_and_narrow_s16(in10, cospi_22_64);
-  s2_13 = multiply_shift_and_narrow_s16(in10, cospi_10_64);
+  s2[10] = multiply_shift_and_narrow_s16(in[10], cospi_22_64);
+  s2[13] = multiply_shift_and_narrow_s16(in[10], cospi_10_64);
 
-  s2_11 = multiply_shift_and_narrow_s16(in6, -cospi_26_64);
-  s2_12 = multiply_shift_and_narrow_s16(in6, cospi_6_64);
+  s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64);
+  s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64);
 
-  s2_18 = vsubq_s16(s1_19, s1_18);
-  s2_19 = vaddq_s16(s1_18, s1_19);
-  s2_20 = vaddq_s16(s1_20, s1_21);
-  s2_21 = vsubq_s16(s1_20, s1_21);
-  s2_26 = vsubq_s16(s1_27, s1_26);
-  s2_27 = vaddq_s16(s1_26, s1_27);
-  s2_28 = vaddq_s16(s1_28, s1_29);
-  s2_29 = vsubq_s16(s1_28, s1_29);
+  s2[18] = vsubq_s16(s1[19], s1[18]);
+  s2[19] = vaddq_s16(s1[18], s1[19]);
+  s2[20] = vaddq_s16(s1[20], s1[21]);
+  s2[21] = vsubq_s16(s1[20], s1[21]);
+  s2[26] = vsubq_s16(s1[27], s1[26]);
+  s2[27] = vaddq_s16(s1[26], s1[27]);
+  s2[28] = vaddq_s16(s1[28], s1[29]);
+  s2[29] = vsubq_s16(s1[28], s1[29]);
 
   // stage 3
-  s3_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64);
-  s3_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64);
+  s3[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64);
+  s3[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64);
 
-  s3_10 = vsubq_s16(s2_11, s2_10);
-  s3_11 = vaddq_s16(s2_10, s2_11);
-  s3_12 = vaddq_s16(s2_12, s2_13);
-  s3_13 = vsubq_s16(s2_12, s2_13);
+  s3[10] = vsubq_s16(s2[11], s2[10]);
+  s3[11] = vaddq_s16(s2[10], s2[11]);
+  s3[12] = vaddq_s16(s2[12], s2[13]);
+  s3[13] = vsubq_s16(s2[12], s2[13]);
 
-  s3_17 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_4_64, s1_31,
-                                                   cospi_28_64);
-  s3_30 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_28_64, s1_31,
-                                                   cospi_4_64);
+  s3[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31],
+                                                    cospi_28_64);
+  s3[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31],
+                                                    cospi_4_64);
 
-  s3_18 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_28_64, s2_29,
-                                                   -cospi_4_64);
-  s3_29 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_4_64, s2_29,
-                                                   cospi_28_64);
+  s3[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_28_64,
+                                                    s2[29], -cospi_4_64);
+  s3[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_4_64, s2[29],
+                                                    cospi_28_64);
 
-  s3_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_20_64, s2_26,
-                                                   cospi_12_64);
-  s3_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, cospi_12_64, s2_26,
-                                                   cospi_20_64);
+  s3[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_20_64,
+                                                    s2[26], cospi_12_64);
+  s3[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], cospi_12_64, s2[26],
+                                                    cospi_20_64);
 
-  s3_22 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_12_64, s1_24,
-                                                   -cospi_20_64);
-  s3_25 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_20_64, s1_24,
-                                                   cospi_12_64);
+  s3[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64,
+                                                    s1[24], -cospi_20_64);
+  s3[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64,
+                                                    s1[24], cospi_12_64);
 
   // stage 4
-  s4_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64);
-  s4_2 = multiply_shift_and_narrow_s16(in8, cospi_24_64);
-  s4_3 = multiply_shift_and_narrow_s16(in8, cospi_8_64);
+  s4[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64);
+  s4[2] = multiply_shift_and_narrow_s16(in[8], cospi_24_64);
+  s4[3] = multiply_shift_and_narrow_s16(in[8], cospi_8_64);
 
-  s4_9 = multiply_accumulate_shift_and_narrow_s16(s2_8, -cospi_8_64, s2_15,
-                                                  cospi_24_64);
-  s4_14 = multiply_accumulate_shift_and_narrow_s16(s2_8, cospi_24_64, s2_15,
-                                                   cospi_8_64);
-
-  s4_10 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_24_64, s3_13,
-                                                   -cospi_8_64);
-  s4_13 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_8_64, s3_13,
+  s4[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15],
                                                    cospi_24_64);
+  s4[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15],
+                                                    cospi_8_64);
 
-  s4_16 = vaddq_s16(s1_16, s2_19);
-  s4_17 = vaddq_s16(s3_17, s3_18);
-  s4_18 = vsubq_s16(s3_17, s3_18);
-  s4_19 = vsubq_s16(s1_16, s2_19);
-  s4_20 = vsubq_s16(s1_23, s2_20);
-  s4_21 = vsubq_s16(s3_22, s3_21);
-  s4_22 = vaddq_s16(s3_21, s3_22);
-  s4_23 = vaddq_s16(s2_20, s1_23);
-  s4_24 = vaddq_s16(s1_24, s2_27);
-  s4_25 = vaddq_s16(s3_25, s3_26);
-  s4_26 = vsubq_s16(s3_25, s3_26);
-  s4_27 = vsubq_s16(s1_24, s2_27);
-  s4_28 = vsubq_s16(s1_31, s2_28);
-  s4_29 = vsubq_s16(s3_30, s3_29);
-  s4_30 = vaddq_s16(s3_29, s3_30);
-  s4_31 = vaddq_s16(s2_28, s1_31);
+  s4[10] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_24_64,
+                                                    s3[13], -cospi_8_64);
+  s4[13] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_8_64, s3[13],
+                                                    cospi_24_64);
 
+  s4[16] = vaddq_s16(s1[16], s2[19]);
+  s4[17] = vaddq_s16(s3[17], s3[18]);
+  s4[18] = vsubq_s16(s3[17], s3[18]);
+  s4[19] = vsubq_s16(s1[16], s2[19]);
+  s4[20] = vsubq_s16(s1[23], s2[20]);
+  s4[21] = vsubq_s16(s3[22], s3[21]);
+  s4[22] = vaddq_s16(s3[21], s3[22]);
+  s4[23] = vaddq_s16(s2[20], s1[23]);
+  s4[24] = vaddq_s16(s1[24], s2[27]);
+  s4[25] = vaddq_s16(s3[25], s3[26]);
+  s4[26] = vsubq_s16(s3[25], s3[26]);
+  s4[27] = vsubq_s16(s1[24], s2[27]);
+  s4[28] = vsubq_s16(s1[31], s2[28]);
+  s4[29] = vsubq_s16(s3[30], s3[29]);
+  s4[30] = vaddq_s16(s3[29], s3[30]);
+  s4[31] = vaddq_s16(s2[28], s1[31]);
+
   // stage 5
-  s5_0 = vaddq_s16(s4_0, s4_3);
-  s5_1 = vaddq_s16(s4_0, s4_2);
-  s5_2 = vsubq_s16(s4_0, s4_2);
-  s5_3 = vsubq_s16(s4_0, s4_3);
+  s5[0] = vaddq_s16(s4[0], s4[3]);
+  s5[1] = vaddq_s16(s4[0], s4[2]);
+  s5[2] = vsubq_s16(s4[0], s4[2]);
+  s5[3] = vsubq_s16(s4[0], s4[3]);
 
-  s5_5 = sub_multiply_shift_and_narrow_s16(s3_7, s3_4, cospi_16_64);
-  s5_6 = add_multiply_shift_and_narrow_s16(s3_4, s3_7, cospi_16_64);
+  s5[5] = sub_multiply_shift_and_narrow_s16(s3[7], s3[4], cospi_16_64);
+  s5[6] = add_multiply_shift_and_narrow_s16(s3[4], s3[7], cospi_16_64);
 
-  s5_8 = vaddq_s16(s2_8, s3_11);
-  s5_9 = vaddq_s16(s4_9, s4_10);
-  s5_10 = vsubq_s16(s4_9, s4_10);
-  s5_11 = vsubq_s16(s2_8, s3_11);
-  s5_12 = vsubq_s16(s2_15, s3_12);
-  s5_13 = vsubq_s16(s4_14, s4_13);
-  s5_14 = vaddq_s16(s4_13, s4_14);
-  s5_15 = vaddq_s16(s2_15, s3_12);
+  s5[8] = vaddq_s16(s2[8], s3[11]);
+  s5[9] = vaddq_s16(s4[9], s4[10]);
+  s5[10] = vsubq_s16(s4[9], s4[10]);
+  s5[11] = vsubq_s16(s2[8], s3[11]);
+  s5[12] = vsubq_s16(s2[15], s3[12]);
+  s5[13] = vsubq_s16(s4[14], s4[13]);
+  s5[14] = vaddq_s16(s4[13], s4[14]);
+  s5[15] = vaddq_s16(s2[15], s3[12]);
 
-  s5_18 = multiply_accumulate_shift_and_narrow_s16(s4_18, -cospi_8_64, s4_29,
-                                                   cospi_24_64);
-  s5_29 = multiply_accumulate_shift_and_narrow_s16(s4_18, cospi_24_64, s4_29,
-                                                   cospi_8_64);
+  s5[18] = multiply_accumulate_shift_and_narrow_s16(s4[18], -cospi_8_64, s4[29],
+                                                    cospi_24_64);
+  s5[29] = multiply_accumulate_shift_and_narrow_s16(s4[18], cospi_24_64, s4[29],
+                                                    cospi_8_64);
 
-  s5_19 = multiply_accumulate_shift_and_narrow_s16(s4_19, -cospi_8_64, s4_28,
-                                                   cospi_24_64);
-  s5_28 = multiply_accumulate_shift_and_narrow_s16(s4_19, cospi_24_64, s4_28,
-                                                   cospi_8_64);
+  s5[19] = multiply_accumulate_shift_and_narrow_s16(s4[19], -cospi_8_64, s4[28],
+                                                    cospi_24_64);
+  s5[28] = multiply_accumulate_shift_and_narrow_s16(s4[19], cospi_24_64, s4[28],
+                                                    cospi_8_64);
 
-  s5_20 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_24_64, s4_27,
-                                                   -cospi_8_64);
-  s5_27 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_8_64, s4_27,
-                                                   cospi_24_64);
+  s5[20] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_24_64,
+                                                    s4[27], -cospi_8_64);
+  s5[27] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_8_64, s4[27],
+                                                    cospi_24_64);
 
-  s5_21 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_24_64, s4_26,
-                                                   -cospi_8_64);
-  s5_26 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_8_64, s4_26,
-                                                   cospi_24_64);
+  s5[21] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_24_64,
+                                                    s4[26], -cospi_8_64);
+  s5[26] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_8_64, s4[26],
+                                                    cospi_24_64);
 
   // stage 6
-  s6_0 = vaddq_s16(s5_0, s3_7);
-  s6_1 = vaddq_s16(s5_1, s5_6);
-  s6_2 = vaddq_s16(s5_2, s5_5);
-  s6_3 = vaddq_s16(s5_3, s3_4);
-  s6_4 = vsubq_s16(s5_3, s3_4);
-  s6_5 = vsubq_s16(s5_2, s5_5);
-  s6_6 = vsubq_s16(s5_1, s5_6);
-  s6_7 = vsubq_s16(s5_0, s3_7);
+  s6[0] = vaddq_s16(s5[0], s3[7]);
+  s6[1] = vaddq_s16(s5[1], s5[6]);
+  s6[2] = vaddq_s16(s5[2], s5[5]);
+  s6[3] = vaddq_s16(s5[3], s3[4]);
+  s6[4] = vsubq_s16(s5[3], s3[4]);
+  s6[5] = vsubq_s16(s5[2], s5[5]);
+  s6[6] = vsubq_s16(s5[1], s5[6]);
+  s6[7] = vsubq_s16(s5[0], s3[7]);
 
-  s6_10 = sub_multiply_shift_and_narrow_s16(s5_13, s5_10, cospi_16_64);
-  s6_13 = add_multiply_shift_and_narrow_s16(s5_10, s5_13, cospi_16_64);
+  s6[10] = sub_multiply_shift_and_narrow_s16(s5[13], s5[10], cospi_16_64);
+  s6[13] = add_multiply_shift_and_narrow_s16(s5[10], s5[13], cospi_16_64);
 
-  s6_11 = sub_multiply_shift_and_narrow_s16(s5_12, s5_11, cospi_16_64);
-  s6_12 = add_multiply_shift_and_narrow_s16(s5_11, s5_12, cospi_16_64);
+  s6[11] = sub_multiply_shift_and_narrow_s16(s5[12], s5[11], cospi_16_64);
+  s6[12] = add_multiply_shift_and_narrow_s16(s5[11], s5[12], cospi_16_64);
 
-  s6_16 = vaddq_s16(s4_16, s4_23);
-  s6_17 = vaddq_s16(s4_17, s4_22);
-  s6_18 = vaddq_s16(s5_18, s5_21);
-  s6_19 = vaddq_s16(s5_19, s5_20);
-  s6_20 = vsubq_s16(s5_19, s5_20);
-  s6_21 = vsubq_s16(s5_18, s5_21);
-  s6_22 = vsubq_s16(s4_17, s4_22);
-  s6_23 = vsubq_s16(s4_16, s4_23);
+  s6[16] = vaddq_s16(s4[16], s4[23]);
+  s6[17] = vaddq_s16(s4[17], s4[22]);
+  s6[18] = vaddq_s16(s5[18], s5[21]);
+  s6[19] = vaddq_s16(s5[19], s5[20]);
+  s6[20] = vsubq_s16(s5[19], s5[20]);
+  s6[21] = vsubq_s16(s5[18], s5[21]);
+  s6[22] = vsubq_s16(s4[17], s4[22]);
+  s6[23] = vsubq_s16(s4[16], s4[23]);
 
-  s6_24 = vsubq_s16(s4_31, s4_24);
-  s6_25 = vsubq_s16(s4_30, s4_25);
-  s6_26 = vsubq_s16(s5_29, s5_26);
-  s6_27 = vsubq_s16(s5_28, s5_27);
-  s6_28 = vaddq_s16(s5_27, s5_28);
-  s6_29 = vaddq_s16(s5_26, s5_29);
-  s6_30 = vaddq_s16(s4_25, s4_30);
-  s6_31 = vaddq_s16(s4_24, s4_31);
+  s6[24] = vsubq_s16(s4[31], s4[24]);
+  s6[25] = vsubq_s16(s4[30], s4[25]);
+  s6[26] = vsubq_s16(s5[29], s5[26]);
+  s6[27] = vsubq_s16(s5[28], s5[27]);
+  s6[28] = vaddq_s16(s5[27], s5[28]);
+  s6[29] = vaddq_s16(s5[26], s5[29]);
+  s6[30] = vaddq_s16(s4[25], s4[30]);
+  s6[31] = vaddq_s16(s4[24], s4[31]);
 
   // stage 7
-  s7_0 = vaddq_s16(s6_0, s5_15);
-  s7_1 = vaddq_s16(s6_1, s5_14);
-  s7_2 = vaddq_s16(s6_2, s6_13);
-  s7_3 = vaddq_s16(s6_3, s6_12);
-  s7_4 = vaddq_s16(s6_4, s6_11);
-  s7_5 = vaddq_s16(s6_5, s6_10);
-  s7_6 = vaddq_s16(s6_6, s5_9);
-  s7_7 = vaddq_s16(s6_7, s5_8);
-  s7_8 = vsubq_s16(s6_7, s5_8);
-  s7_9 = vsubq_s16(s6_6, s5_9);
-  s7_10 = vsubq_s16(s6_5, s6_10);
-  s7_11 = vsubq_s16(s6_4, s6_11);
-  s7_12 = vsubq_s16(s6_3, s6_12);
-  s7_13 = vsubq_s16(s6_2, s6_13);
-  s7_14 = vsubq_s16(s6_1, s5_14);
-  s7_15 = vsubq_s16(s6_0, s5_15);
+  s7[0] = vaddq_s16(s6[0], s5[15]);
+  s7[1] = vaddq_s16(s6[1], s5[14]);
+  s7[2] = vaddq_s16(s6[2], s6[13]);
+  s7[3] = vaddq_s16(s6[3], s6[12]);
+  s7[4] = vaddq_s16(s6[4], s6[11]);
+  s7[5] = vaddq_s16(s6[5], s6[10]);
+  s7[6] = vaddq_s16(s6[6], s5[9]);
+  s7[7] = vaddq_s16(s6[7], s5[8]);
+  s7[8] = vsubq_s16(s6[7], s5[8]);
+  s7[9] = vsubq_s16(s6[6], s5[9]);
+  s7[10] = vsubq_s16(s6[5], s6[10]);
+  s7[11] = vsubq_s16(s6[4], s6[11]);
+  s7[12] = vsubq_s16(s6[3], s6[12]);
+  s7[13] = vsubq_s16(s6[2], s6[13]);
+  s7[14] = vsubq_s16(s6[1], s5[14]);
+  s7[15] = vsubq_s16(s6[0], s5[15]);
 
-  s7_20 = sub_multiply_shift_and_narrow_s16(s6_27, s6_20, cospi_16_64);
-  s7_27 = add_multiply_shift_and_narrow_s16(s6_20, s6_27, cospi_16_64);
+  s7[20] = sub_multiply_shift_and_narrow_s16(s6[27], s6[20], cospi_16_64);
+  s7[27] = add_multiply_shift_and_narrow_s16(s6[20], s6[27], cospi_16_64);
 
-  s7_21 = sub_multiply_shift_and_narrow_s16(s6_26, s6_21, cospi_16_64);
-  s7_26 = add_multiply_shift_and_narrow_s16(s6_21, s6_26, cospi_16_64);
+  s7[21] = sub_multiply_shift_and_narrow_s16(s6[26], s6[21], cospi_16_64);
+  s7[26] = add_multiply_shift_and_narrow_s16(s6[21], s6[26], cospi_16_64);
 
-  s7_22 = sub_multiply_shift_and_narrow_s16(s6_25, s6_22, cospi_16_64);
-  s7_25 = add_multiply_shift_and_narrow_s16(s6_22, s6_25, cospi_16_64);
+  s7[22] = sub_multiply_shift_and_narrow_s16(s6[25], s6[22], cospi_16_64);
+  s7[25] = add_multiply_shift_and_narrow_s16(s6[22], s6[25], cospi_16_64);
 
-  s7_23 = sub_multiply_shift_and_narrow_s16(s6_24, s6_23, cospi_16_64);
-  s7_24 = add_multiply_shift_and_narrow_s16(s6_23, s6_24, cospi_16_64);
+  s7[23] = sub_multiply_shift_and_narrow_s16(s6[24], s6[23], cospi_16_64);
+  s7[24] = add_multiply_shift_and_narrow_s16(s6[23], s6[24], cospi_16_64);
 
   // final stage
-  vst1q_s16(output, vaddq_s16(s7_0, s6_31));
+  vst1q_s16(output, vaddq_s16(s7[0], s6[31]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_1, s6_30));
+  vst1q_s16(output, vaddq_s16(s7[1], s6[30]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_2, s6_29));
+  vst1q_s16(output, vaddq_s16(s7[2], s6[29]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_3, s6_28));
+  vst1q_s16(output, vaddq_s16(s7[3], s6[28]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_4, s7_27));
+  vst1q_s16(output, vaddq_s16(s7[4], s7[27]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_5, s7_26));
+  vst1q_s16(output, vaddq_s16(s7[5], s7[26]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_6, s7_25));
+  vst1q_s16(output, vaddq_s16(s7[6], s7[25]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_7, s7_24));
+  vst1q_s16(output, vaddq_s16(s7[7], s7[24]));
   output += 16;
 
-  vst1q_s16(output, vaddq_s16(s7_8, s7_23));
+  vst1q_s16(output, vaddq_s16(s7[8], s7[23]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_9, s7_22));
+  vst1q_s16(output, vaddq_s16(s7[9], s7[22]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_10, s7_21));
+  vst1q_s16(output, vaddq_s16(s7[10], s7[21]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_11, s7_20));
+  vst1q_s16(output, vaddq_s16(s7[11], s7[20]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_12, s6_19));
+  vst1q_s16(output, vaddq_s16(s7[12], s6[19]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_13, s6_18));
+  vst1q_s16(output, vaddq_s16(s7[13], s6[18]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_14, s6_17));
+  vst1q_s16(output, vaddq_s16(s7[14], s6[17]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_15, s6_16));
+  vst1q_s16(output, vaddq_s16(s7[15], s6[16]));
   output += 16;
 
-  vst1q_s16(output, vsubq_s16(s7_15, s6_16));
+  vst1q_s16(output, vsubq_s16(s7[15], s6[16]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_14, s6_17));
+  vst1q_s16(output, vsubq_s16(s7[14], s6[17]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_13, s6_18));
+  vst1q_s16(output, vsubq_s16(s7[13], s6[18]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_12, s6_19));
+  vst1q_s16(output, vsubq_s16(s7[12], s6[19]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_11, s7_20));
+  vst1q_s16(output, vsubq_s16(s7[11], s7[20]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_10, s7_21));
+  vst1q_s16(output, vsubq_s16(s7[10], s7[21]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_9, s7_22));
+  vst1q_s16(output, vsubq_s16(s7[9], s7[22]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_8, s7_23));
+  vst1q_s16(output, vsubq_s16(s7[8], s7[23]));
   output += 16;
 
-  vst1q_s16(output, vsubq_s16(s7_7, s7_24));
+  vst1q_s16(output, vsubq_s16(s7[7], s7[24]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_6, s7_25));
+  vst1q_s16(output, vsubq_s16(s7[6], s7[25]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_5, s7_26));
+  vst1q_s16(output, vsubq_s16(s7[5], s7[26]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_4, s7_27));
+  vst1q_s16(output, vsubq_s16(s7[4], s7[27]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_3, s6_28));
+  vst1q_s16(output, vsubq_s16(s7[3], s6[28]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_2, s6_29));
+  vst1q_s16(output, vsubq_s16(s7[2], s6[29]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_1, s6_30));
+  vst1q_s16(output, vsubq_s16(s7[1], s6[30]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_0, s6_31));
+  vst1q_s16(output, vsubq_s16(s7[0], s6[31]));
 }
 
-static void idct32_16_neon(const int16_t *input, uint8_t *output, int stride) {
-  int16x8_t in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
-      in13, in14, in15;
-  int16x8_t s1_16, s1_17, s1_18, s1_19, s1_20, s1_21, s1_22, s1_23, s1_24,
-      s1_25, s1_26, s1_27, s1_28, s1_29, s1_30, s1_31;
-  int16x8_t s2_8, s2_9, s2_10, s2_11, s2_12, s2_13, s2_14, s2_15, s2_16, s2_17,
-      s2_18, s2_19, s2_20, s2_21, s2_22, s2_23, s2_24, s2_25, s2_26, s2_27,
-      s2_28, s2_29, s2_30, s2_31;
-  int16x8_t s3_4, s3_5, s3_6, s3_7, s3_8, s3_9, s3_10, s3_11, s3_12, s3_13,
-      s3_14, s3_15, s3_17, s3_18, s3_21, s3_22, s3_25, s3_26, s3_29, s3_30;
-  int16x8_t s4_0, s4_2, s4_3, s4_4, s4_5, s4_6, s4_7, s4_9, s4_10, s4_13, s4_14,
-      s4_16, s4_17, s4_18, s4_19, s4_20, s4_21, s4_22, s4_23, s4_24, s4_25,
-      s4_26, s4_27, s4_28, s4_29, s4_30, s4_31;
-  int16x8_t s5_0, s5_1, s5_2, s5_3, s5_5, s5_6, s5_8, s5_9, s5_10, s5_11, s5_12,
-      s5_13, s5_14, s5_15, s5_18, s5_19, s5_20, s5_21, s5_26, s5_27, s5_28,
-      s5_29;
-  int16x8_t s6_0, s6_1, s6_2, s6_3, s6_4, s6_5, s6_6, s6_7, s6_10, s6_11, s6_12,
-      s6_13, s6_16, s6_17, s6_18, s6_19, s6_20, s6_21, s6_22, s6_23, s6_24,
-      s6_25, s6_26, s6_27, s6_28, s6_29, s6_30, s6_31;
-  int16x8_t s7_0, s7_1, s7_2, s7_3, s7_4, s7_5, s7_6, s7_7, s7_8, s7_9, s7_10,
-      s7_11, s7_12, s7_13, s7_14, s7_15, s7_20, s7_21, s7_22, s7_23, s7_24,
-      s7_25, s7_26, s7_27;
-  int16x8_t out0, out1, out2, out3, out4, out5, out6, out7;
+static void idct32_16_neon(const int16_t *const input, uint8_t *const output,
+                           const int stride) {
+  int16x8_t in[16], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32],
+      out[8];
 
-  load_and_transpose_s16_8x8(input, 16, &in0, &in1, &in2, &in3, &in4, &in5,
-                             &in6, &in7);
+  load_and_transpose_s16_8x8(input, 16, &in[0], &in[1], &in[2], &in[3], &in[4],
+                             &in[5], &in[6], &in[7]);
 
-  load_and_transpose_s16_8x8(input + 8, 16, &in8, &in9, &in10, &in11, &in12,
-                             &in13, &in14, &in15);
+  load_and_transpose_s16_8x8(input + 8, 16, &in[8], &in[9], &in[10], &in[11],
+                             &in[12], &in[13], &in[14], &in[15]);
 
   // stage 1
-  s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64);
-  s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64);
+  s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64);
+  s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64);
 
-  s1_17 = multiply_shift_and_narrow_s16(in15, -cospi_17_64);
-  s1_30 = multiply_shift_and_narrow_s16(in15, cospi_15_64);
+  s1[17] = multiply_shift_and_narrow_s16(in[15], -cospi_17_64);
+  s1[30] = multiply_shift_and_narrow_s16(in[15], cospi_15_64);
 
-  s1_18 = multiply_shift_and_narrow_s16(in9, cospi_23_64);
-  s1_29 = multiply_shift_and_narrow_s16(in9, cospi_9_64);
+  s1[18] = multiply_shift_and_narrow_s16(in[9], cospi_23_64);
+  s1[29] = multiply_shift_and_narrow_s16(in[9], cospi_9_64);
 
-  s1_19 = multiply_shift_and_narrow_s16(in7, -cospi_25_64);
-  s1_28 = multiply_shift_and_narrow_s16(in7, cospi_7_64);
+  s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64);
+  s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64);
 
-  s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64);
-  s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64);
+  s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64);
+  s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64);
 
-  s1_21 = multiply_shift_and_narrow_s16(in11, -cospi_21_64);
-  s1_26 = multiply_shift_and_narrow_s16(in11, cospi_11_64);
+  s1[21] = multiply_shift_and_narrow_s16(in[11], -cospi_21_64);
+  s1[26] = multiply_shift_and_narrow_s16(in[11], cospi_11_64);
 
-  s1_22 = multiply_shift_and_narrow_s16(in13, cospi_19_64);
-  s1_25 = multiply_shift_and_narrow_s16(in13, cospi_13_64);
+  s1[22] = multiply_shift_and_narrow_s16(in[13], cospi_19_64);
+  s1[25] = multiply_shift_and_narrow_s16(in[13], cospi_13_64);
 
-  s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64);
-  s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64);
+  s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64);
+  s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64);
 
   // stage 2
-  s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64);
-  s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64);
+  s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64);
+  s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64);
 
-  s2_9 = multiply_shift_and_narrow_s16(in14, -cospi_18_64);
-  s2_14 = multiply_shift_and_narrow_s16(in14, cospi_14_64);
+  s2[9] = multiply_shift_and_narrow_s16(in[14], -cospi_18_64);
+  s2[14] = multiply_shift_and_narrow_s16(in[14], cospi_14_64);
 
-  s2_10 = multiply_shift_and_narrow_s16(in10, cospi_22_64);
-  s2_13 = multiply_shift_and_narrow_s16(in10, cospi_10_64);
+  s2[10] = multiply_shift_and_narrow_s16(in[10], cospi_22_64);
+  s2[13] = multiply_shift_and_narrow_s16(in[10], cospi_10_64);
 
-  s2_11 = multiply_shift_and_narrow_s16(in6, -cospi_26_64);
-  s2_12 = multiply_shift_and_narrow_s16(in6, cospi_6_64);
+  s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64);
+  s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64);
 
-  s2_16 = vaddq_s16(s1_16, s1_17);
-  s2_17 = vsubq_s16(s1_16, s1_17);
-  s2_18 = vsubq_s16(s1_19, s1_18);
-  s2_19 = vaddq_s16(s1_18, s1_19);
-  s2_20 = vaddq_s16(s1_20, s1_21);
-  s2_21 = vsubq_s16(s1_20, s1_21);
-  s2_22 = vsubq_s16(s1_23, s1_22);
-  s2_23 = vaddq_s16(s1_22, s1_23);
-  s2_24 = vaddq_s16(s1_24, s1_25);
-  s2_25 = vsubq_s16(s1_24, s1_25);
-  s2_26 = vsubq_s16(s1_27, s1_26);
-  s2_27 = vaddq_s16(s1_26, s1_27);
-  s2_28 = vaddq_s16(s1_28, s1_29);
-  s2_29 = vsubq_s16(s1_28, s1_29);
-  s2_30 = vsubq_s16(s1_31, s1_30);
-  s2_31 = vaddq_s16(s1_30, s1_31);
+  s2[16] = vaddq_s16(s1[16], s1[17]);
+  s2[17] = vsubq_s16(s1[16], s1[17]);
+  s2[18] = vsubq_s16(s1[19], s1[18]);
+  s2[19] = vaddq_s16(s1[18], s1[19]);
+  s2[20] = vaddq_s16(s1[20], s1[21]);
+  s2[21] = vsubq_s16(s1[20], s1[21]);
+  s2[22] = vsubq_s16(s1[23], s1[22]);
+  s2[23] = vaddq_s16(s1[22], s1[23]);
+  s2[24] = vaddq_s16(s1[24], s1[25]);
+  s2[25] = vsubq_s16(s1[24], s1[25]);
+  s2[26] = vsubq_s16(s1[27], s1[26]);
+  s2[27] = vaddq_s16(s1[26], s1[27]);
+  s2[28] = vaddq_s16(s1[28], s1[29]);
+  s2[29] = vsubq_s16(s1[28], s1[29]);
+  s2[30] = vsubq_s16(s1[31], s1[30]);
+  s2[31] = vaddq_s16(s1[30], s1[31]);
 
   // stage 3
-  s3_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64);
-  s3_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64);
+  s3[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64);
+  s3[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64);
 
-  s3_5 = multiply_shift_and_narrow_s16(in12, -cospi_20_64);
-  s3_6 = multiply_shift_and_narrow_s16(in12, cospi_12_64);
+  s3[5] = multiply_shift_and_narrow_s16(in[12], -cospi_20_64);
+  s3[6] = multiply_shift_and_narrow_s16(in[12], cospi_12_64);
 
-  s3_8 = vaddq_s16(s2_8, s2_9);
-  s3_9 = vsubq_s16(s2_8, s2_9);
-  s3_10 = vsubq_s16(s2_11, s2_10);
-  s3_11 = vaddq_s16(s2_10, s2_11);
-  s3_12 = vaddq_s16(s2_12, s2_13);
-  s3_13 = vsubq_s16(s2_12, s2_13);
-  s3_14 = vsubq_s16(s2_15, s2_14);
-  s3_15 = vaddq_s16(s2_14, s2_15);
+  s3[8] = vaddq_s16(s2[8], s2[9]);
+  s3[9] = vsubq_s16(s2[8], s2[9]);
+  s3[10] = vsubq_s16(s2[11], s2[10]);
+  s3[11] = vaddq_s16(s2[10], s2[11]);
+  s3[12] = vaddq_s16(s2[12], s2[13]);
+  s3[13] = vsubq_s16(s2[12], s2[13]);
+  s3[14] = vsubq_s16(s2[15], s2[14]);
+  s3[15] = vaddq_s16(s2[14], s2[15]);
 
-  s3_17 = multiply_accumulate_shift_and_narrow_s16(s2_17, -cospi_4_64, s2_30,
-                                                   cospi_28_64);
-  s3_30 = multiply_accumulate_shift_and_narrow_s16(s2_17, cospi_28_64, s2_30,
-                                                   cospi_4_64);
+  s3[17] = multiply_accumulate_shift_and_narrow_s16(s2[17], -cospi_4_64, s2[30],
+                                                    cospi_28_64);
+  s3[30] = multiply_accumulate_shift_and_narrow_s16(s2[17], cospi_28_64, s2[30],
+                                                    cospi_4_64);
 
-  s3_18 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_28_64, s2_29,
-                                                   -cospi_4_64);
-  s3_29 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_4_64, s2_29,
-                                                   cospi_28_64);
+  s3[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_28_64,
+                                                    s2[29], -cospi_4_64);
+  s3[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_4_64, s2[29],
+                                                    cospi_28_64);
 
-  s3_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_20_64, s2_26,
-                                                   cospi_12_64);
-  s3_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, cospi_12_64, s2_26,
-                                                   cospi_20_64);
+  s3[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_20_64,
+                                                    s2[26], cospi_12_64);
+  s3[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], cospi_12_64, s2[26],
+                                                    cospi_20_64);
 
-  s3_22 = multiply_accumulate_shift_and_narrow_s16(s2_22, -cospi_12_64, s2_25,
-                                                   -cospi_20_64);
-  s3_25 = multiply_accumulate_shift_and_narrow_s16(s2_22, -cospi_20_64, s2_25,
-                                                   cospi_12_64);
+  s3[22] = multiply_accumulate_shift_and_narrow_s16(s2[22], -cospi_12_64,
+                                                    s2[25], -cospi_20_64);
+  s3[25] = multiply_accumulate_shift_and_narrow_s16(s2[22], -cospi_20_64,
+                                                    s2[25], cospi_12_64);
 
   // stage 4
-  s4_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64);
-  s4_2 = multiply_shift_and_narrow_s16(in8, cospi_24_64);
-  s4_3 = multiply_shift_and_narrow_s16(in8, cospi_8_64);
+  s4[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64);
+  s4[2] = multiply_shift_and_narrow_s16(in[8], cospi_24_64);
+  s4[3] = multiply_shift_and_narrow_s16(in[8], cospi_8_64);
 
-  s4_4 = vaddq_s16(s3_4, s3_5);
-  s4_5 = vsubq_s16(s3_4, s3_5);
-  s4_6 = vsubq_s16(s3_7, s3_6);
-  s4_7 = vaddq_s16(s3_6, s3_7);
+  s4[4] = vaddq_s16(s3[4], s3[5]);
+  s4[5] = vsubq_s16(s3[4], s3[5]);
+  s4[6] = vsubq_s16(s3[7], s3[6]);
+  s4[7] = vaddq_s16(s3[6], s3[7]);
 
-  s4_9 = multiply_accumulate_shift_and_narrow_s16(s3_9, -cospi_8_64, s3_14,
-                                                  cospi_24_64);
-  s4_14 = multiply_accumulate_shift_and_narrow_s16(s3_9, cospi_24_64, s3_14,
-                                                   cospi_8_64);
-
-  s4_10 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_24_64, s3_13,
-                                                   -cospi_8_64);
-  s4_13 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_8_64, s3_13,
+  s4[9] = multiply_accumulate_shift_and_narrow_s16(s3[9], -cospi_8_64, s3[14],
                                                    cospi_24_64);
+  s4[14] = multiply_accumulate_shift_and_narrow_s16(s3[9], cospi_24_64, s3[14],
+                                                    cospi_8_64);
 
-  s4_16 = vaddq_s16(s2_16, s2_19);
-  s4_17 = vaddq_s16(s3_17, s3_18);
-  s4_18 = vsubq_s16(s3_17, s3_18);
-  s4_19 = vsubq_s16(s2_16, s2_19);
-  s4_20 = vsubq_s16(s2_23, s2_20);
-  s4_21 = vsubq_s16(s3_22, s3_21);
-  s4_22 = vaddq_s16(s3_21, s3_22);
-  s4_23 = vaddq_s16(s2_20, s2_23);
-  s4_24 = vaddq_s16(s2_24, s2_27);
-  s4_25 = vaddq_s16(s3_25, s3_26);
-  s4_26 = vsubq_s16(s3_25, s3_26);
-  s4_27 = vsubq_s16(s2_24, s2_27);
-  s4_28 = vsubq_s16(s2_31, s2_28);
-  s4_29 = vsubq_s16(s3_30, s3_29);
-  s4_30 = vaddq_s16(s3_29, s3_30);
-  s4_31 = vaddq_s16(s2_28, s2_31);
+  s4[10] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_24_64,
+                                                    s3[13], -cospi_8_64);
+  s4[13] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_8_64, s3[13],
+                                                    cospi_24_64);
 
+  s4[16] = vaddq_s16(s2[16], s2[19]);
+  s4[17] = vaddq_s16(s3[17], s3[18]);
+  s4[18] = vsubq_s16(s3[17], s3[18]);
+  s4[19] = vsubq_s16(s2[16], s2[19]);
+  s4[20] = vsubq_s16(s2[23], s2[20]);
+  s4[21] = vsubq_s16(s3[22], s3[21]);
+  s4[22] = vaddq_s16(s3[21], s3[22]);
+  s4[23] = vaddq_s16(s2[20], s2[23]);
+  s4[24] = vaddq_s16(s2[24], s2[27]);
+  s4[25] = vaddq_s16(s3[25], s3[26]);
+  s4[26] = vsubq_s16(s3[25], s3[26]);
+  s4[27] = vsubq_s16(s2[24], s2[27]);
+  s4[28] = vsubq_s16(s2[31], s2[28]);
+  s4[29] = vsubq_s16(s3[30], s3[29]);
+  s4[30] = vaddq_s16(s3[29], s3[30]);
+  s4[31] = vaddq_s16(s2[28], s2[31]);
+
   // stage 5
-  s5_0 = vaddq_s16(s4_0, s4_3);
-  s5_1 = vaddq_s16(s4_0, s4_2);
-  s5_2 = vsubq_s16(s4_0, s4_2);
-  s5_3 = vsubq_s16(s4_0, s4_3);
+  s5[0] = vaddq_s16(s4[0], s4[3]);
+  s5[1] = vaddq_s16(s4[0], s4[2]);
+  s5[2] = vsubq_s16(s4[0], s4[2]);
+  s5[3] = vsubq_s16(s4[0], s4[3]);
 
-  s5_5 = sub_multiply_shift_and_narrow_s16(s4_6, s4_5, cospi_16_64);
-  s5_6 = add_multiply_shift_and_narrow_s16(s4_5, s4_6, cospi_16_64);
+  s5[5] = sub_multiply_shift_and_narrow_s16(s4[6], s4[5], cospi_16_64);
+  s5[6] = add_multiply_shift_and_narrow_s16(s4[5], s4[6], cospi_16_64);
 
-  s5_8 = vaddq_s16(s3_8, s3_11);
-  s5_9 = vaddq_s16(s4_9, s4_10);
-  s5_10 = vsubq_s16(s4_9, s4_10);
-  s5_11 = vsubq_s16(s3_8, s3_11);
-  s5_12 = vsubq_s16(s3_15, s3_12);
-  s5_13 = vsubq_s16(s4_14, s4_13);
-  s5_14 = vaddq_s16(s4_13, s4_14);
-  s5_15 = vaddq_s16(s3_15, s3_12);
+  s5[8] = vaddq_s16(s3[8], s3[11]);
+  s5[9] = vaddq_s16(s4[9], s4[10]);
+  s5[10] = vsubq_s16(s4[9], s4[10]);
+  s5[11] = vsubq_s16(s3[8], s3[11]);
+  s5[12] = vsubq_s16(s3[15], s3[12]);
+  s5[13] = vsubq_s16(s4[14], s4[13]);
+  s5[14] = vaddq_s16(s4[13], s4[14]);
+  s5[15] = vaddq_s16(s3[15], s3[12]);
 
-  s5_18 = multiply_accumulate_shift_and_narrow_s16(s4_18, -cospi_8_64, s4_29,
-                                                   cospi_24_64);
-  s5_29 = multiply_accumulate_shift_and_narrow_s16(s4_18, cospi_24_64, s4_29,
-                                                   cospi_8_64);
+  s5[18] = multiply_accumulate_shift_and_narrow_s16(s4[18], -cospi_8_64, s4[29],
+                                                    cospi_24_64);
+  s5[29] = multiply_accumulate_shift_and_narrow_s16(s4[18], cospi_24_64, s4[29],
+                                                    cospi_8_64);
 
-  s5_19 = multiply_accumulate_shift_and_narrow_s16(s4_19, -cospi_8_64, s4_28,
-                                                   cospi_24_64);
-  s5_28 = multiply_accumulate_shift_and_narrow_s16(s4_19, cospi_24_64, s4_28,
-                                                   cospi_8_64);
+  s5[19] = multiply_accumulate_shift_and_narrow_s16(s4[19], -cospi_8_64, s4[28],
+                                                    cospi_24_64);
+  s5[28] = multiply_accumulate_shift_and_narrow_s16(s4[19], cospi_24_64, s4[28],
+                                                    cospi_8_64);
 
-  s5_20 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_24_64, s4_27,
-                                                   -cospi_8_64);
-  s5_27 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_8_64, s4_27,
-                                                   cospi_24_64);
+  s5[20] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_24_64,
+                                                    s4[27], -cospi_8_64);
+  s5[27] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_8_64, s4[27],
+                                                    cospi_24_64);
 
-  s5_21 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_24_64, s4_26,
-                                                   -cospi_8_64);
-  s5_26 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_8_64, s4_26,
-                                                   cospi_24_64);
+  s5[21] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_24_64,
+                                                    s4[26], -cospi_8_64);
+  s5[26] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_8_64, s4[26],
+                                                    cospi_24_64);
 
   // stage 6
-  s6_0 = vaddq_s16(s5_0, s4_7);
-  s6_1 = vaddq_s16(s5_1, s5_6);
-  s6_2 = vaddq_s16(s5_2, s5_5);
-  s6_3 = vaddq_s16(s5_3, s4_4);
-  s6_4 = vsubq_s16(s5_3, s4_4);
-  s6_5 = vsubq_s16(s5_2, s5_5);
-  s6_6 = vsubq_s16(s5_1, s5_6);
-  s6_7 = vsubq_s16(s5_0, s4_7);
+  s6[0] = vaddq_s16(s5[0], s4[7]);
+  s6[1] = vaddq_s16(s5[1], s5[6]);
+  s6[2] = vaddq_s16(s5[2], s5[5]);
+  s6[3] = vaddq_s16(s5[3], s4[4]);
+  s6[4] = vsubq_s16(s5[3], s4[4]);
+  s6[5] = vsubq_s16(s5[2], s5[5]);
+  s6[6] = vsubq_s16(s5[1], s5[6]);
+  s6[7] = vsubq_s16(s5[0], s4[7]);
 
-  s6_10 = sub_multiply_shift_and_narrow_s16(s5_13, s5_10, cospi_16_64);
-  s6_13 = add_multiply_shift_and_narrow_s16(s5_10, s5_13, cospi_16_64);
+  s6[10] = sub_multiply_shift_and_narrow_s16(s5[13], s5[10], cospi_16_64);
+  s6[13] = add_multiply_shift_and_narrow_s16(s5[10], s5[13], cospi_16_64);
 
-  s6_11 = sub_multiply_shift_and_narrow_s16(s5_12, s5_11, cospi_16_64);
-  s6_12 = add_multiply_shift_and_narrow_s16(s5_11, s5_12, cospi_16_64);
+  s6[11] = sub_multiply_shift_and_narrow_s16(s5[12], s5[11], cospi_16_64);
+  s6[12] = add_multiply_shift_and_narrow_s16(s5[11], s5[12], cospi_16_64);
 
-  s6_16 = vaddq_s16(s4_16, s4_23);
-  s6_17 = vaddq_s16(s4_17, s4_22);
-  s6_18 = vaddq_s16(s5_18, s5_21);
-  s6_19 = vaddq_s16(s5_19, s5_20);
-  s6_20 = vsubq_s16(s5_19, s5_20);
-  s6_21 = vsubq_s16(s5_18, s5_21);
-  s6_22 = vsubq_s16(s4_17, s4_22);
-  s6_23 = vsubq_s16(s4_16, s4_23);
-  s6_24 = vsubq_s16(s4_31, s4_24);
-  s6_25 = vsubq_s16(s4_30, s4_25);
-  s6_26 = vsubq_s16(s5_29, s5_26);
-  s6_27 = vsubq_s16(s5_28, s5_27);
-  s6_28 = vaddq_s16(s5_27, s5_28);
-  s6_29 = vaddq_s16(s5_26, s5_29);
-  s6_30 = vaddq_s16(s4_25, s4_30);
-  s6_31 = vaddq_s16(s4_24, s4_31);
+  s6[16] = vaddq_s16(s4[16], s4[23]);
+  s6[17] = vaddq_s16(s4[17], s4[22]);
+  s6[18] = vaddq_s16(s5[18], s5[21]);
+  s6[19] = vaddq_s16(s5[19], s5[20]);
+  s6[20] = vsubq_s16(s5[19], s5[20]);
+  s6[21] = vsubq_s16(s5[18], s5[21]);
+  s6[22] = vsubq_s16(s4[17], s4[22]);
+  s6[23] = vsubq_s16(s4[16], s4[23]);
+  s6[24] = vsubq_s16(s4[31], s4[24]);
+  s6[25] = vsubq_s16(s4[30], s4[25]);
+  s6[26] = vsubq_s16(s5[29], s5[26]);
+  s6[27] = vsubq_s16(s5[28], s5[27]);
+  s6[28] = vaddq_s16(s5[27], s5[28]);
+  s6[29] = vaddq_s16(s5[26], s5[29]);
+  s6[30] = vaddq_s16(s4[25], s4[30]);
+  s6[31] = vaddq_s16(s4[24], s4[31]);
 
   // stage 7
-  s7_0 = vaddq_s16(s6_0, s5_15);
-  s7_1 = vaddq_s16(s6_1, s5_14);
-  s7_2 = vaddq_s16(s6_2, s6_13);
-  s7_3 = vaddq_s16(s6_3, s6_12);
-  s7_4 = vaddq_s16(s6_4, s6_11);
-  s7_5 = vaddq_s16(s6_5, s6_10);
-  s7_6 = vaddq_s16(s6_6, s5_9);
-  s7_7 = vaddq_s16(s6_7, s5_8);
-  s7_8 = vsubq_s16(s6_7, s5_8);
-  s7_9 = vsubq_s16(s6_6, s5_9);
-  s7_10 = vsubq_s16(s6_5, s6_10);
-  s7_11 = vsubq_s16(s6_4, s6_11);
-  s7_12 = vsubq_s16(s6_3, s6_12);
-  s7_13 = vsubq_s16(s6_2, s6_13);
-  s7_14 = vsubq_s16(s6_1, s5_14);
-  s7_15 = vsubq_s16(s6_0, s5_15);
+  s7[0] = vaddq_s16(s6[0], s5[15]);
+  s7[1] = vaddq_s16(s6[1], s5[14]);
+  s7[2] = vaddq_s16(s6[2], s6[13]);
+  s7[3] = vaddq_s16(s6[3], s6[12]);
+  s7[4] = vaddq_s16(s6[4], s6[11]);
+  s7[5] = vaddq_s16(s6[5], s6[10]);
+  s7[6] = vaddq_s16(s6[6], s5[9]);
+  s7[7] = vaddq_s16(s6[7], s5[8]);
+  s7[8] = vsubq_s16(s6[7], s5[8]);
+  s7[9] = vsubq_s16(s6[6], s5[9]);
+  s7[10] = vsubq_s16(s6[5], s6[10]);
+  s7[11] = vsubq_s16(s6[4], s6[11]);
+  s7[12] = vsubq_s16(s6[3], s6[12]);
+  s7[13] = vsubq_s16(s6[2], s6[13]);
+  s7[14] = vsubq_s16(s6[1], s5[14]);
+  s7[15] = vsubq_s16(s6[0], s5[15]);
 
-  s7_20 = sub_multiply_shift_and_narrow_s16(s6_27, s6_20, cospi_16_64);
-  s7_27 = add_multiply_shift_and_narrow_s16(s6_20, s6_27, cospi_16_64);
+  s7[20] = sub_multiply_shift_and_narrow_s16(s6[27], s6[20], cospi_16_64);
+  s7[27] = add_multiply_shift_and_narrow_s16(s6[20], s6[27], cospi_16_64);
 
-  s7_21 = sub_multiply_shift_and_narrow_s16(s6_26, s6_21, cospi_16_64);
-  s7_26 = add_multiply_shift_and_narrow_s16(s6_21, s6_26, cospi_16_64);
+  s7[21] = sub_multiply_shift_and_narrow_s16(s6[26], s6[21], cospi_16_64);
+  s7[26] = add_multiply_shift_and_narrow_s16(s6[21], s6[26], cospi_16_64);
 
-  s7_22 = sub_multiply_shift_and_narrow_s16(s6_25, s6_22, cospi_16_64);
-  s7_25 = add_multiply_shift_and_narrow_s16(s6_22, s6_25, cospi_16_64);
+  s7[22] = sub_multiply_shift_and_narrow_s16(s6[25], s6[22], cospi_16_64);
+  s7[25] = add_multiply_shift_and_narrow_s16(s6[22], s6[25], cospi_16_64);
 
-  s7_23 = sub_multiply_shift_and_narrow_s16(s6_24, s6_23, cospi_16_64);
-  s7_24 = add_multiply_shift_and_narrow_s16(s6_23, s6_24, cospi_16_64);
+  s7[23] = sub_multiply_shift_and_narrow_s16(s6[24], s6[23], cospi_16_64);
+  s7[24] = add_multiply_shift_and_narrow_s16(s6[23], s6[24], cospi_16_64);
 
   // final stage
-  out0 = vaddq_s16(s7_0, s6_31);
-  out1 = vaddq_s16(s7_1, s6_30);
-  out2 = vaddq_s16(s7_2, s6_29);
-  out3 = vaddq_s16(s7_3, s6_28);
-  out4 = vaddq_s16(s7_4, s7_27);
-  out5 = vaddq_s16(s7_5, s7_26);
-  out6 = vaddq_s16(s7_6, s7_25);
-  out7 = vaddq_s16(s7_7, s7_24);
+  out[0] = vaddq_s16(s7[0], s6[31]);
+  out[1] = vaddq_s16(s7[1], s6[30]);
+  out[2] = vaddq_s16(s7[2], s6[29]);
+  out[3] = vaddq_s16(s7[3], s6[28]);
+  out[4] = vaddq_s16(s7[4], s7[27]);
+  out[5] = vaddq_s16(s7[5], s7[26]);
+  out[6] = vaddq_s16(s7[6], s7[25]);
+  out[7] = vaddq_s16(s7[7], s7[24]);
 
-  add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, output,
-                       stride);
+  add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6],
+                       out[7], output, stride);
 
-  out0 = vaddq_s16(s7_8, s7_23);
-  out1 = vaddq_s16(s7_9, s7_22);
-  out2 = vaddq_s16(s7_10, s7_21);
-  out3 = vaddq_s16(s7_11, s7_20);
-  out4 = vaddq_s16(s7_12, s6_19);
-  out5 = vaddq_s16(s7_13, s6_18);
-  out6 = vaddq_s16(s7_14, s6_17);
-  out7 = vaddq_s16(s7_15, s6_16);
+  out[0] = vaddq_s16(s7[8], s7[23]);
+  out[1] = vaddq_s16(s7[9], s7[22]);
+  out[2] = vaddq_s16(s7[10], s7[21]);
+  out[3] = vaddq_s16(s7[11], s7[20]);
+  out[4] = vaddq_s16(s7[12], s6[19]);
+  out[5] = vaddq_s16(s7[13], s6[18]);
+  out[6] = vaddq_s16(s7[14], s6[17]);
+  out[7] = vaddq_s16(s7[15], s6[16]);
 
-  add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
-                       output + (8 * stride), stride);
+  add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6],
+                       out[7], output + (8 * stride), stride);
 
-  out0 = vsubq_s16(s7_15, s6_16);
-  out1 = vsubq_s16(s7_14, s6_17);
-  out2 = vsubq_s16(s7_13, s6_18);
-  out3 = vsubq_s16(s7_12, s6_19);
-  out4 = vsubq_s16(s7_11, s7_20);
-  out5 = vsubq_s16(s7_10, s7_21);
-  out6 = vsubq_s16(s7_9, s7_22);
-  out7 = vsubq_s16(s7_8, s7_23);
+  out[0] = vsubq_s16(s7[15], s6[16]);
+  out[1] = vsubq_s16(s7[14], s6[17]);
+  out[2] = vsubq_s16(s7[13], s6[18]);
+  out[3] = vsubq_s16(s7[12], s6[19]);
+  out[4] = vsubq_s16(s7[11], s7[20]);
+  out[5] = vsubq_s16(s7[10], s7[21]);
+  out[6] = vsubq_s16(s7[9], s7[22]);
+  out[7] = vsubq_s16(s7[8], s7[23]);
 
-  add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
-                       output + (16 * stride), stride);
+  add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6],
+                       out[7], output + (16 * stride), stride);
 
-  out0 = vsubq_s16(s7_7, s7_24);
-  out1 = vsubq_s16(s7_6, s7_25);
-  out2 = vsubq_s16(s7_5, s7_26);
-  out3 = vsubq_s16(s7_4, s7_27);
-  out4 = vsubq_s16(s7_3, s6_28);
-  out5 = vsubq_s16(s7_2, s6_29);
-  out6 = vsubq_s16(s7_1, s6_30);
-  out7 = vsubq_s16(s7_0, s6_31);
+  out[0] = vsubq_s16(s7[7], s7[24]);
+  out[1] = vsubq_s16(s7[6], s7[25]);
+  out[2] = vsubq_s16(s7[5], s7[26]);
+  out[3] = vsubq_s16(s7[4], s7[27]);
+  out[4] = vsubq_s16(s7[3], s6[28]);
+  out[5] = vsubq_s16(s7[2], s6[29]);
+  out[6] = vsubq_s16(s7[1], s6[30]);
+  out[7] = vsubq_s16(s7[0], s6[31]);
 
-  add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
-                       output + (24 * stride), stride);
+  add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6],
+                       out[7], output + (24 * stride), stride);
 }
 
 void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest,
--- a/vpx_dsp/arm/idct32x32_34_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_34_add_neon.c
@@ -36,486 +36,468 @@
 // 6 21 27 33
 // 7 24 32
 static void idct32_6_neon(const tran_low_t *input, int16_t *output) {
-  int16x8_t in0, in1, in2, in3, in4, in5, in6, in7;
-  int16x8_t s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s1_9, s1_10,
-      s1_11, s1_12, s1_13, s1_14, s1_15, s1_16, s1_17, s1_18, s1_19, s1_20,
-      s1_21, s1_22, s1_23, s1_24, s1_25, s1_26, s1_27, s1_28, s1_29, s1_30,
-      s1_31;
-  int16x8_t s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s2_9, s2_10,
-      s2_11, s2_12, s2_13, s2_14, s2_15, s2_16, s2_17, s2_18, s2_19, s2_20,
-      s2_21, s2_22, s2_23, s2_24, s2_25, s2_26, s2_27, s2_28, s2_29, s2_30,
-      s2_31;
-  int16x8_t s3_24, s3_25, s3_26, s3_27;
+  int16x8_t in[8], s1[32], s2[32], s3[32];
 
-  in0 = load_tran_low_to_s16q(input);
+  in[0] = load_tran_low_to_s16q(input);
   input += 32;
-  in1 = load_tran_low_to_s16q(input);
+  in[1] = load_tran_low_to_s16q(input);
   input += 32;
-  in2 = load_tran_low_to_s16q(input);
+  in[2] = load_tran_low_to_s16q(input);
   input += 32;
-  in3 = load_tran_low_to_s16q(input);
+  in[3] = load_tran_low_to_s16q(input);
   input += 32;
-  in4 = load_tran_low_to_s16q(input);
+  in[4] = load_tran_low_to_s16q(input);
   input += 32;
-  in5 = load_tran_low_to_s16q(input);
+  in[5] = load_tran_low_to_s16q(input);
   input += 32;
-  in6 = load_tran_low_to_s16q(input);
+  in[6] = load_tran_low_to_s16q(input);
   input += 32;
-  in7 = load_tran_low_to_s16q(input);
-  transpose_s16_8x8(&in0, &in1, &in2, &in3, &in4, &in5, &in6, &in7);
+  in[7] = load_tran_low_to_s16q(input);
+  transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+                    &in[7]);
 
   // stage 1
   // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0)
-  s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64);
+  s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64);
   // input[1] * cospi_1_64 + input[31] * cospi_31_64 (but input[31] == 0)
-  s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64);
+  s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64);
 
-  s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64);
-  s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64);
+  s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64);
+  s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64);
 
-  s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64);
-  s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64);
+  s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64);
+  s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64);
 
   // stage 2
-  s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64);
-  s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64);
+  s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64);
+  s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64);
 
   // stage 3
-  s1_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64);
-  s1_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64);
+  s1[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64);
+  s1[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64);
 
-  s1_17 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_4_64, s1_31,
-                                                   cospi_28_64);
-  s1_30 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_28_64, s1_31,
-                                                   cospi_4_64);
+  s1[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31],
+                                                    cospi_28_64);
+  s1[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31],
+                                                    cospi_4_64);
 
-  s1_21 = multiply_accumulate_shift_and_narrow_s16(s1_20, -cospi_20_64, s1_27,
-                                                   cospi_12_64);
-  s1_26 = multiply_accumulate_shift_and_narrow_s16(s1_20, cospi_12_64, s1_27,
-                                                   cospi_20_64);
+  s1[21] = multiply_accumulate_shift_and_narrow_s16(s1[20], -cospi_20_64,
+                                                    s1[27], cospi_12_64);
+  s1[26] = multiply_accumulate_shift_and_narrow_s16(s1[20], cospi_12_64, s1[27],
+                                                    cospi_20_64);
 
-  s1_22 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_12_64, s1_24,
-                                                   -cospi_20_64);
-  s1_25 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_20_64, s1_24,
-                                                   cospi_12_64);
+  s1[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64,
+                                                    s1[24], -cospi_20_64);
+  s1[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64,
+                                                    s1[24], cospi_12_64);
 
   // stage 4
-  s1_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64);
+  s1[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64);
 
-  s2_9 = multiply_accumulate_shift_and_narrow_s16(s2_8, -cospi_8_64, s2_15,
-                                                  cospi_24_64);
-  s2_14 = multiply_accumulate_shift_and_narrow_s16(s2_8, cospi_24_64, s2_15,
-                                                   cospi_8_64);
+  s2[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15],
+                                                   cospi_24_64);
+  s2[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15],
+                                                    cospi_8_64);
 
-  s2_20 = vsubq_s16(s1_23, s1_20);
-  s2_21 = vsubq_s16(s1_22, s1_21);
-  s2_22 = vaddq_s16(s1_21, s1_22);
-  s2_23 = vaddq_s16(s1_20, s1_23);
-  s2_24 = vaddq_s16(s1_24, s1_27);
-  s2_25 = vaddq_s16(s1_25, s1_26);
-  s2_26 = vsubq_s16(s1_25, s1_26);
-  s2_27 = vsubq_s16(s1_24, s1_27);
+  s2[20] = vsubq_s16(s1[23], s1[20]);
+  s2[21] = vsubq_s16(s1[22], s1[21]);
+  s2[22] = vaddq_s16(s1[21], s1[22]);
+  s2[23] = vaddq_s16(s1[20], s1[23]);
+  s2[24] = vaddq_s16(s1[24], s1[27]);
+  s2[25] = vaddq_s16(s1[25], s1[26]);
+  s2[26] = vsubq_s16(s1[25], s1[26]);
+  s2[27] = vsubq_s16(s1[24], s1[27]);
 
   // stage 5
-  s1_5 = sub_multiply_shift_and_narrow_s16(s1_7, s1_4, cospi_16_64);
-  s1_6 = add_multiply_shift_and_narrow_s16(s1_4, s1_7, cospi_16_64);
+  s1[5] = sub_multiply_shift_and_narrow_s16(s1[7], s1[4], cospi_16_64);
+  s1[6] = add_multiply_shift_and_narrow_s16(s1[4], s1[7], cospi_16_64);
 
-  s1_18 = multiply_accumulate_shift_and_narrow_s16(s1_17, -cospi_8_64, s1_30,
-                                                   cospi_24_64);
-  s1_29 = multiply_accumulate_shift_and_narrow_s16(s1_17, cospi_24_64, s1_30,
-                                                   cospi_8_64);
+  s1[18] = multiply_accumulate_shift_and_narrow_s16(s1[17], -cospi_8_64, s1[30],
+                                                    cospi_24_64);
+  s1[29] = multiply_accumulate_shift_and_narrow_s16(s1[17], cospi_24_64, s1[30],
+                                                    cospi_8_64);
 
-  s1_19 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_8_64, s1_31,
-                                                   cospi_24_64);
-  s1_28 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_24_64, s1_31,
-                                                   cospi_8_64);
+  s1[19] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_8_64, s1[31],
+                                                    cospi_24_64);
+  s1[28] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_24_64, s1[31],
+                                                    cospi_8_64);
 
-  s1_20 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_24_64, s2_27,
-                                                   -cospi_8_64);
-  s1_27 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_8_64, s2_27,
-                                                   cospi_24_64);
+  s1[20] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_24_64,
+                                                    s2[27], -cospi_8_64);
+  s1[27] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_8_64, s2[27],
+                                                    cospi_24_64);
 
-  s1_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_24_64, s2_26,
-                                                   -cospi_8_64);
-  s1_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_8_64, s2_26,
-                                                   cospi_24_64);
+  s1[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_24_64,
+                                                    s2[26], -cospi_8_64);
+  s1[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_8_64, s2[26],
+                                                    cospi_24_64);
 
   // stage 6
-  s2_0 = vaddq_s16(s1_0, s1_7);
-  s2_1 = vaddq_s16(s1_0, s1_6);
-  s2_2 = vaddq_s16(s1_0, s1_5);
-  s2_3 = vaddq_s16(s1_0, s1_4);
-  s2_4 = vsubq_s16(s1_0, s1_4);
-  s2_5 = vsubq_s16(s1_0, s1_5);
-  s2_6 = vsubq_s16(s1_0, s1_6);
-  s2_7 = vsubq_s16(s1_0, s1_7);
+  s2[0] = vaddq_s16(s1[0], s1[7]);
+  s2[1] = vaddq_s16(s1[0], s1[6]);
+  s2[2] = vaddq_s16(s1[0], s1[5]);
+  s2[3] = vaddq_s16(s1[0], s1[4]);
+  s2[4] = vsubq_s16(s1[0], s1[4]);
+  s2[5] = vsubq_s16(s1[0], s1[5]);
+  s2[6] = vsubq_s16(s1[0], s1[6]);
+  s2[7] = vsubq_s16(s1[0], s1[7]);
 
-  s2_10 = sub_multiply_shift_and_narrow_s16(s2_14, s2_9, cospi_16_64);
-  s2_13 = add_multiply_shift_and_narrow_s16(s2_9, s2_14, cospi_16_64);
+  s2[10] = sub_multiply_shift_and_narrow_s16(s2[14], s2[9], cospi_16_64);
+  s2[13] = add_multiply_shift_and_narrow_s16(s2[9], s2[14], cospi_16_64);
 
-  s2_11 = sub_multiply_shift_and_narrow_s16(s2_15, s2_8, cospi_16_64);
-  s2_12 = add_multiply_shift_and_narrow_s16(s2_8, s2_15, cospi_16_64);
+  s2[11] = sub_multiply_shift_and_narrow_s16(s2[15], s2[8], cospi_16_64);
+  s2[12] = add_multiply_shift_and_narrow_s16(s2[8], s2[15], cospi_16_64);
 
-  s2_16 = vaddq_s16(s1_16, s2_23);
-  s2_17 = vaddq_s16(s1_17, s2_22);
-  s2_18 = vaddq_s16(s1_18, s1_21);
-  s2_19 = vaddq_s16(s1_19, s1_20);
-  s2_20 = vsubq_s16(s1_19, s1_20);
-  s2_21 = vsubq_s16(s1_18, s1_21);
-  s2_22 = vsubq_s16(s1_17, s2_22);
-  s2_23 = vsubq_s16(s1_16, s2_23);
+  s2[16] = vaddq_s16(s1[16], s2[23]);
+  s2[17] = vaddq_s16(s1[17], s2[22]);
+  s2[18] = vaddq_s16(s1[18], s1[21]);
+  s2[19] = vaddq_s16(s1[19], s1[20]);
+  s2[20] = vsubq_s16(s1[19], s1[20]);
+  s2[21] = vsubq_s16(s1[18], s1[21]);
+  s2[22] = vsubq_s16(s1[17], s2[22]);
+  s2[23] = vsubq_s16(s1[16], s2[23]);
 
-  s3_24 = vsubq_s16(s1_31, s2_24);
-  s3_25 = vsubq_s16(s1_30, s2_25);
-  s3_26 = vsubq_s16(s1_29, s1_26);
-  s3_27 = vsubq_s16(s1_28, s1_27);
-  s2_28 = vaddq_s16(s1_27, s1_28);
-  s2_29 = vaddq_s16(s1_26, s1_29);
-  s2_30 = vaddq_s16(s2_25, s1_30);
-  s2_31 = vaddq_s16(s2_24, s1_31);
+  s3[24] = vsubq_s16(s1[31], s2[24]);
+  s3[25] = vsubq_s16(s1[30], s2[25]);
+  s3[26] = vsubq_s16(s1[29], s1[26]);
+  s3[27] = vsubq_s16(s1[28], s1[27]);
+  s2[28] = vaddq_s16(s1[27], s1[28]);
+  s2[29] = vaddq_s16(s1[26], s1[29]);
+  s2[30] = vaddq_s16(s2[25], s1[30]);
+  s2[31] = vaddq_s16(s2[24], s1[31]);
 
   // stage 7
-  s1_0 = vaddq_s16(s2_0, s2_15);
-  s1_1 = vaddq_s16(s2_1, s2_14);
-  s1_2 = vaddq_s16(s2_2, s2_13);
-  s1_3 = vaddq_s16(s2_3, s2_12);
-  s1_4 = vaddq_s16(s2_4, s2_11);
-  s1_5 = vaddq_s16(s2_5, s2_10);
-  s1_6 = vaddq_s16(s2_6, s2_9);
-  s1_7 = vaddq_s16(s2_7, s2_8);
-  s1_8 = vsubq_s16(s2_7, s2_8);
-  s1_9 = vsubq_s16(s2_6, s2_9);
-  s1_10 = vsubq_s16(s2_5, s2_10);
-  s1_11 = vsubq_s16(s2_4, s2_11);
-  s1_12 = vsubq_s16(s2_3, s2_12);
-  s1_13 = vsubq_s16(s2_2, s2_13);
-  s1_14 = vsubq_s16(s2_1, s2_14);
-  s1_15 = vsubq_s16(s2_0, s2_15);
+  s1[0] = vaddq_s16(s2[0], s2[15]);
+  s1[1] = vaddq_s16(s2[1], s2[14]);
+  s1[2] = vaddq_s16(s2[2], s2[13]);
+  s1[3] = vaddq_s16(s2[3], s2[12]);
+  s1[4] = vaddq_s16(s2[4], s2[11]);
+  s1[5] = vaddq_s16(s2[5], s2[10]);
+  s1[6] = vaddq_s16(s2[6], s2[9]);
+  s1[7] = vaddq_s16(s2[7], s2[8]);
+  s1[8] = vsubq_s16(s2[7], s2[8]);
+  s1[9] = vsubq_s16(s2[6], s2[9]);
+  s1[10] = vsubq_s16(s2[5], s2[10]);
+  s1[11] = vsubq_s16(s2[4], s2[11]);
+  s1[12] = vsubq_s16(s2[3], s2[12]);
+  s1[13] = vsubq_s16(s2[2], s2[13]);
+  s1[14] = vsubq_s16(s2[1], s2[14]);
+  s1[15] = vsubq_s16(s2[0], s2[15]);
 
-  s1_20 = sub_multiply_shift_and_narrow_s16(s3_27, s2_20, cospi_16_64);
-  s1_27 = add_multiply_shift_and_narrow_s16(s2_20, s3_27, cospi_16_64);
+  s1[20] = sub_multiply_shift_and_narrow_s16(s3[27], s2[20], cospi_16_64);
+  s1[27] = add_multiply_shift_and_narrow_s16(s2[20], s3[27], cospi_16_64);
 
-  s1_21 = sub_multiply_shift_and_narrow_s16(s3_26, s2_21, cospi_16_64);
-  s1_26 = add_multiply_shift_and_narrow_s16(s2_21, s3_26, cospi_16_64);
+  s1[21] = sub_multiply_shift_and_narrow_s16(s3[26], s2[21], cospi_16_64);
+  s1[26] = add_multiply_shift_and_narrow_s16(s2[21], s3[26], cospi_16_64);
 
-  s1_22 = sub_multiply_shift_and_narrow_s16(s3_25, s2_22, cospi_16_64);
-  s1_25 = add_multiply_shift_and_narrow_s16(s2_22, s3_25, cospi_16_64);
+  s1[22] = sub_multiply_shift_and_narrow_s16(s3[25], s2[22], cospi_16_64);
+  s1[25] = add_multiply_shift_and_narrow_s16(s2[22], s3[25], cospi_16_64);
 
-  s1_23 = sub_multiply_shift_and_narrow_s16(s3_24, s2_23, cospi_16_64);
-  s1_24 = add_multiply_shift_and_narrow_s16(s2_23, s3_24, cospi_16_64);
+  s1[23] = sub_multiply_shift_and_narrow_s16(s3[24], s2[23], cospi_16_64);
+  s1[24] = add_multiply_shift_and_narrow_s16(s2[23], s3[24], cospi_16_64);
 
   // final stage
-  vst1q_s16(output, vaddq_s16(s1_0, s2_31));
+  vst1q_s16(output, vaddq_s16(s1[0], s2[31]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_1, s2_30));
+  vst1q_s16(output, vaddq_s16(s1[1], s2[30]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_2, s2_29));
+  vst1q_s16(output, vaddq_s16(s1[2], s2[29]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_3, s2_28));
+  vst1q_s16(output, vaddq_s16(s1[3], s2[28]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_4, s1_27));
+  vst1q_s16(output, vaddq_s16(s1[4], s1[27]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_5, s1_26));
+  vst1q_s16(output, vaddq_s16(s1[5], s1[26]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_6, s1_25));
+  vst1q_s16(output, vaddq_s16(s1[6], s1[25]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_7, s1_24));
+  vst1q_s16(output, vaddq_s16(s1[7], s1[24]));
   output += 8;
 
-  vst1q_s16(output, vaddq_s16(s1_8, s1_23));
+  vst1q_s16(output, vaddq_s16(s1[8], s1[23]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_9, s1_22));
+  vst1q_s16(output, vaddq_s16(s1[9], s1[22]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_10, s1_21));
+  vst1q_s16(output, vaddq_s16(s1[10], s1[21]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_11, s1_20));
+  vst1q_s16(output, vaddq_s16(s1[11], s1[20]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_12, s2_19));
+  vst1q_s16(output, vaddq_s16(s1[12], s2[19]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_13, s2_18));
+  vst1q_s16(output, vaddq_s16(s1[13], s2[18]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_14, s2_17));
+  vst1q_s16(output, vaddq_s16(s1[14], s2[17]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_15, s2_16));
+  vst1q_s16(output, vaddq_s16(s1[15], s2[16]));
   output += 8;
 
-  vst1q_s16(output, vsubq_s16(s1_15, s2_16));
+  vst1q_s16(output, vsubq_s16(s1[15], s2[16]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_14, s2_17));
+  vst1q_s16(output, vsubq_s16(s1[14], s2[17]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_13, s2_18));
+  vst1q_s16(output, vsubq_s16(s1[13], s2[18]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_12, s2_19));
+  vst1q_s16(output, vsubq_s16(s1[12], s2[19]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_11, s1_20));
+  vst1q_s16(output, vsubq_s16(s1[11], s1[20]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_10, s1_21));
+  vst1q_s16(output, vsubq_s16(s1[10], s1[21]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_9, s1_22));
+  vst1q_s16(output, vsubq_s16(s1[9], s1[22]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_8, s1_23));
+  vst1q_s16(output, vsubq_s16(s1[8], s1[23]));
   output += 8;
 
-  vst1q_s16(output, vsubq_s16(s1_7, s1_24));
+  vst1q_s16(output, vsubq_s16(s1[7], s1[24]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_6, s1_25));
+  vst1q_s16(output, vsubq_s16(s1[6], s1[25]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_5, s1_26));
+  vst1q_s16(output, vsubq_s16(s1[5], s1[26]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_4, s1_27));
+  vst1q_s16(output, vsubq_s16(s1[4], s1[27]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_3, s2_28));
+  vst1q_s16(output, vsubq_s16(s1[3], s2[28]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_2, s2_29));
+  vst1q_s16(output, vsubq_s16(s1[2], s2[29]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_1, s2_30));
+  vst1q_s16(output, vsubq_s16(s1[1], s2[30]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_0, s2_31));
+  vst1q_s16(output, vsubq_s16(s1[0], s2[31]));
 }
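
Both row functions in this patch lean on a small set of helpers from vpx_dsp/arm/idct_neon.h (multiply_shift_and_narrow_s16, multiply_accumulate_shift_and_narrow_s16 and friends) whose definitions are not shown here. As a rough sketch of what they do, assuming libvpx's DCT_CONST_BITS value of 14 and that the cospi_*_64 constants are cos(k*pi/64) in Q14 fixed point (exact prototypes may differ from the real header):

#include <arm_neon.h>

// Sketch only: widen a to 32 bits, multiply by a cosine constant, then
// round-shift by DCT_CONST_BITS (14) and narrow back to 16 bits.
static inline int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a,
                                                      const int16_t a_const) {
  const int32x4_t lo = vmull_n_s16(vget_low_s16(a), a_const);
  const int32x4_t hi = vmull_n_s16(vget_high_s16(a), a_const);
  return vcombine_s16(vrshrn_n_s32(lo, 14), vrshrn_n_s32(hi, 14));
}

// Sketch only: a * a_const + b * b_const with the same round-shift-narrow.
static inline int16x8_t multiply_accumulate_shift_and_narrow_s16(
    const int16x8_t a, const int16_t a_const, const int16x8_t b,
    const int16_t b_const) {
  int32x4_t lo = vmull_n_s16(vget_low_s16(a), a_const);
  int32x4_t hi = vmull_n_s16(vget_high_s16(a), a_const);
  lo = vmlal_n_s16(lo, vget_low_s16(b), b_const);
  hi = vmlal_n_s16(hi, vget_high_s16(b), b_const);
  return vcombine_s16(vrshrn_n_s32(lo, 14), vrshrn_n_s32(hi, 14));
}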
 
 static void idct32_8_neon(const int16_t *input, uint8_t *output, int stride) {
-  int16x8_t in0, in1, in2, in3, in4, in5, in6, in7;
-  int16x8_t out0, out1, out2, out3, out4, out5, out6, out7;
-  int16x8_t s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s1_9, s1_10,
-      s1_11, s1_12, s1_13, s1_14, s1_15, s1_16, s1_17, s1_18, s1_19, s1_20,
-      s1_21, s1_22, s1_23, s1_24, s1_25, s1_26, s1_27, s1_28, s1_29, s1_30,
-      s1_31;
-  int16x8_t s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s2_9, s2_10,
-      s2_11, s2_12, s2_13, s2_14, s2_15, s2_16, s2_17, s2_18, s2_19, s2_20,
-      s2_21, s2_22, s2_23, s2_24, s2_25, s2_26, s2_27, s2_28, s2_29, s2_30,
-      s2_31;
-  int16x8_t s3_24, s3_25, s3_26, s3_27;
+  int16x8_t in[8], s1[32], s2[32], s3[32], out[8];
 
-  load_and_transpose_s16_8x8(input, 8, &in0, &in1, &in2, &in3, &in4, &in5, &in6,
-                             &in7);
+  load_and_transpose_s16_8x8(input, 8, &in[0], &in[1], &in[2], &in[3], &in[4],
+                             &in[5], &in[6], &in[7]);
 
   // stage 1
-  s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64);
-  s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64);
+  s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64);
+  s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64);
 
   // Different for _8_
-  s1_19 = multiply_shift_and_narrow_s16(in7, -cospi_25_64);
-  s1_28 = multiply_shift_and_narrow_s16(in7, cospi_7_64);
+  s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64);
+  s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64);
 
-  s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64);
-  s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64);
+  s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64);
+  s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64);
 
-  s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64);
-  s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64);
+  s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64);
+  s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64);
 
   // stage 2
-  s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64);
-  s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64);
+  s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64);
+  s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64);
 
-  s2_11 = multiply_shift_and_narrow_s16(in6, -cospi_26_64);
-  s2_12 = multiply_shift_and_narrow_s16(in6, cospi_6_64);
+  s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64);
+  s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64);
 
   // stage 3
-  s1_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64);
-  s1_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64);
+  s1[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64);
+  s1[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64);
 
-  s1_17 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_4_64, s1_31,
-                                                   cospi_28_64);
-  s1_30 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_28_64, s1_31,
-                                                   cospi_4_64);
+  s1[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31],
+                                                    cospi_28_64);
+  s1[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31],
+                                                    cospi_4_64);
 
   // Different for _8_
-  s1_18 = multiply_accumulate_shift_and_narrow_s16(s1_19, -cospi_28_64, s1_28,
-                                                   -cospi_4_64);
-  s1_29 = multiply_accumulate_shift_and_narrow_s16(s1_19, -cospi_4_64, s1_28,
-                                                   cospi_28_64);
+  s1[18] = multiply_accumulate_shift_and_narrow_s16(s1[19], -cospi_28_64,
+                                                    s1[28], -cospi_4_64);
+  s1[29] = multiply_accumulate_shift_and_narrow_s16(s1[19], -cospi_4_64, s1[28],
+                                                    cospi_28_64);
 
-  s1_21 = multiply_accumulate_shift_and_narrow_s16(s1_20, -cospi_20_64, s1_27,
-                                                   cospi_12_64);
-  s1_26 = multiply_accumulate_shift_and_narrow_s16(s1_20, cospi_12_64, s1_27,
-                                                   cospi_20_64);
+  s1[21] = multiply_accumulate_shift_and_narrow_s16(s1[20], -cospi_20_64,
+                                                    s1[27], cospi_12_64);
+  s1[26] = multiply_accumulate_shift_and_narrow_s16(s1[20], cospi_12_64, s1[27],
+                                                    cospi_20_64);
 
-  s1_22 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_12_64, s1_24,
-                                                   -cospi_20_64);
-  s1_25 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_20_64, s1_24,
-                                                   cospi_12_64);
+  s1[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64,
+                                                    s1[24], -cospi_20_64);
+  s1[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64,
+                                                    s1[24], cospi_12_64);
 
   // stage 4
-  s1_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64);
+  s1[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64);
 
-  s2_9 = multiply_accumulate_shift_and_narrow_s16(s2_8, -cospi_8_64, s2_15,
-                                                  cospi_24_64);
-  s2_14 = multiply_accumulate_shift_and_narrow_s16(s2_8, cospi_24_64, s2_15,
-                                                   cospi_8_64);
-
-  s2_10 = multiply_accumulate_shift_and_narrow_s16(s2_11, -cospi_24_64, s2_12,
-                                                   -cospi_8_64);
-  s2_13 = multiply_accumulate_shift_and_narrow_s16(s2_11, -cospi_8_64, s2_12,
+  s2[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15],
                                                    cospi_24_64);
+  s2[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15],
+                                                    cospi_8_64);
 
-  s2_16 = vaddq_s16(s1_16, s1_19);
+  s2[10] = multiply_accumulate_shift_and_narrow_s16(s2[11], -cospi_24_64,
+                                                    s2[12], -cospi_8_64);
+  s2[13] = multiply_accumulate_shift_and_narrow_s16(s2[11], -cospi_8_64, s2[12],
+                                                    cospi_24_64);
 
-  s2_17 = vaddq_s16(s1_17, s1_18);
-  s2_18 = vsubq_s16(s1_17, s1_18);
+  s2[16] = vaddq_s16(s1[16], s1[19]);
 
-  s2_19 = vsubq_s16(s1_16, s1_19);
+  s2[17] = vaddq_s16(s1[17], s1[18]);
+  s2[18] = vsubq_s16(s1[17], s1[18]);
 
-  s2_20 = vsubq_s16(s1_23, s1_20);
-  s2_21 = vsubq_s16(s1_22, s1_21);
+  s2[19] = vsubq_s16(s1[16], s1[19]);
 
-  s2_22 = vaddq_s16(s1_21, s1_22);
-  s2_23 = vaddq_s16(s1_20, s1_23);
+  s2[20] = vsubq_s16(s1[23], s1[20]);
+  s2[21] = vsubq_s16(s1[22], s1[21]);
 
-  s2_24 = vaddq_s16(s1_24, s1_27);
-  s2_25 = vaddq_s16(s1_25, s1_26);
-  s2_26 = vsubq_s16(s1_25, s1_26);
-  s2_27 = vsubq_s16(s1_24, s1_27);
+  s2[22] = vaddq_s16(s1[21], s1[22]);
+  s2[23] = vaddq_s16(s1[20], s1[23]);
 
-  s2_28 = vsubq_s16(s1_31, s1_28);
-  s2_29 = vsubq_s16(s1_30, s1_29);
-  s2_30 = vaddq_s16(s1_29, s1_30);
-  s2_31 = vaddq_s16(s1_28, s1_31);
+  s2[24] = vaddq_s16(s1[24], s1[27]);
+  s2[25] = vaddq_s16(s1[25], s1[26]);
+  s2[26] = vsubq_s16(s1[25], s1[26]);
+  s2[27] = vsubq_s16(s1[24], s1[27]);
 
+  s2[28] = vsubq_s16(s1[31], s1[28]);
+  s2[29] = vsubq_s16(s1[30], s1[29]);
+  s2[30] = vaddq_s16(s1[29], s1[30]);
+  s2[31] = vaddq_s16(s1[28], s1[31]);
+
   // stage 5
-  s1_5 = sub_multiply_shift_and_narrow_s16(s1_7, s1_4, cospi_16_64);
-  s1_6 = add_multiply_shift_and_narrow_s16(s1_4, s1_7, cospi_16_64);
+  s1[5] = sub_multiply_shift_and_narrow_s16(s1[7], s1[4], cospi_16_64);
+  s1[6] = add_multiply_shift_and_narrow_s16(s1[4], s1[7], cospi_16_64);
 
-  s1_8 = vaddq_s16(s2_8, s2_11);
-  s1_9 = vaddq_s16(s2_9, s2_10);
-  s1_10 = vsubq_s16(s2_9, s2_10);
-  s1_11 = vsubq_s16(s2_8, s2_11);
-  s1_12 = vsubq_s16(s2_15, s2_12);
-  s1_13 = vsubq_s16(s2_14, s2_13);
-  s1_14 = vaddq_s16(s2_13, s2_14);
-  s1_15 = vaddq_s16(s2_12, s2_15);
+  s1[8] = vaddq_s16(s2[8], s2[11]);
+  s1[9] = vaddq_s16(s2[9], s2[10]);
+  s1[10] = vsubq_s16(s2[9], s2[10]);
+  s1[11] = vsubq_s16(s2[8], s2[11]);
+  s1[12] = vsubq_s16(s2[15], s2[12]);
+  s1[13] = vsubq_s16(s2[14], s2[13]);
+  s1[14] = vaddq_s16(s2[13], s2[14]);
+  s1[15] = vaddq_s16(s2[12], s2[15]);
 
-  s1_18 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_8_64, s2_29,
-                                                   cospi_24_64);
-  s1_29 = multiply_accumulate_shift_and_narrow_s16(s2_18, cospi_24_64, s2_29,
-                                                   cospi_8_64);
+  s1[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_8_64, s2[29],
+                                                    cospi_24_64);
+  s1[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], cospi_24_64, s2[29],
+                                                    cospi_8_64);
 
-  s1_19 = multiply_accumulate_shift_and_narrow_s16(s2_19, -cospi_8_64, s2_28,
-                                                   cospi_24_64);
-  s1_28 = multiply_accumulate_shift_and_narrow_s16(s2_19, cospi_24_64, s2_28,
-                                                   cospi_8_64);
+  s1[19] = multiply_accumulate_shift_and_narrow_s16(s2[19], -cospi_8_64, s2[28],
+                                                    cospi_24_64);
+  s1[28] = multiply_accumulate_shift_and_narrow_s16(s2[19], cospi_24_64, s2[28],
+                                                    cospi_8_64);
 
-  s1_20 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_24_64, s2_27,
-                                                   -cospi_8_64);
-  s1_27 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_8_64, s2_27,
-                                                   cospi_24_64);
+  s1[20] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_24_64,
+                                                    s2[27], -cospi_8_64);
+  s1[27] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_8_64, s2[27],
+                                                    cospi_24_64);
 
-  s1_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_24_64, s2_26,
-                                                   -cospi_8_64);
-  s1_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_8_64, s2_26,
-                                                   cospi_24_64);
+  s1[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_24_64,
+                                                    s2[26], -cospi_8_64);
+  s1[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_8_64, s2[26],
+                                                    cospi_24_64);
 
   // stage 6
-  s2_0 = vaddq_s16(s1_0, s1_7);
-  s2_1 = vaddq_s16(s1_0, s1_6);
-  s2_2 = vaddq_s16(s1_0, s1_5);
-  s2_3 = vaddq_s16(s1_0, s1_4);
-  s2_4 = vsubq_s16(s1_0, s1_4);
-  s2_5 = vsubq_s16(s1_0, s1_5);
-  s2_6 = vsubq_s16(s1_0, s1_6);
-  s2_7 = vsubq_s16(s1_0, s1_7);
+  s2[0] = vaddq_s16(s1[0], s1[7]);
+  s2[1] = vaddq_s16(s1[0], s1[6]);
+  s2[2] = vaddq_s16(s1[0], s1[5]);
+  s2[3] = vaddq_s16(s1[0], s1[4]);
+  s2[4] = vsubq_s16(s1[0], s1[4]);
+  s2[5] = vsubq_s16(s1[0], s1[5]);
+  s2[6] = vsubq_s16(s1[0], s1[6]);
+  s2[7] = vsubq_s16(s1[0], s1[7]);
 
-  s2_10 = sub_multiply_shift_and_narrow_s16(s1_13, s1_10, cospi_16_64);
-  s2_13 = add_multiply_shift_and_narrow_s16(s1_10, s1_13, cospi_16_64);
+  s2[10] = sub_multiply_shift_and_narrow_s16(s1[13], s1[10], cospi_16_64);
+  s2[13] = add_multiply_shift_and_narrow_s16(s1[10], s1[13], cospi_16_64);
 
-  s2_11 = sub_multiply_shift_and_narrow_s16(s1_12, s1_11, cospi_16_64);
-  s2_12 = add_multiply_shift_and_narrow_s16(s1_11, s1_12, cospi_16_64);
+  s2[11] = sub_multiply_shift_and_narrow_s16(s1[12], s1[11], cospi_16_64);
+  s2[12] = add_multiply_shift_and_narrow_s16(s1[11], s1[12], cospi_16_64);
 
-  s1_16 = vaddq_s16(s2_16, s2_23);
-  s1_17 = vaddq_s16(s2_17, s2_22);
-  s2_18 = vaddq_s16(s1_18, s1_21);
-  s2_19 = vaddq_s16(s1_19, s1_20);
-  s2_20 = vsubq_s16(s1_19, s1_20);
-  s2_21 = vsubq_s16(s1_18, s1_21);
-  s1_22 = vsubq_s16(s2_17, s2_22);
-  s1_23 = vsubq_s16(s2_16, s2_23);
+  s1[16] = vaddq_s16(s2[16], s2[23]);
+  s1[17] = vaddq_s16(s2[17], s2[22]);
+  s2[18] = vaddq_s16(s1[18], s1[21]);
+  s2[19] = vaddq_s16(s1[19], s1[20]);
+  s2[20] = vsubq_s16(s1[19], s1[20]);
+  s2[21] = vsubq_s16(s1[18], s1[21]);
+  s1[22] = vsubq_s16(s2[17], s2[22]);
+  s1[23] = vsubq_s16(s2[16], s2[23]);
 
-  s3_24 = vsubq_s16(s2_31, s2_24);
-  s3_25 = vsubq_s16(s2_30, s2_25);
-  s3_26 = vsubq_s16(s1_29, s1_26);
-  s3_27 = vsubq_s16(s1_28, s1_27);
-  s2_28 = vaddq_s16(s1_27, s1_28);
-  s2_29 = vaddq_s16(s1_26, s1_29);
-  s2_30 = vaddq_s16(s2_25, s2_30);
-  s2_31 = vaddq_s16(s2_24, s2_31);
+  s3[24] = vsubq_s16(s2[31], s2[24]);
+  s3[25] = vsubq_s16(s2[30], s2[25]);
+  s3[26] = vsubq_s16(s1[29], s1[26]);
+  s3[27] = vsubq_s16(s1[28], s1[27]);
+  s2[28] = vaddq_s16(s1[27], s1[28]);
+  s2[29] = vaddq_s16(s1[26], s1[29]);
+  s2[30] = vaddq_s16(s2[25], s2[30]);
+  s2[31] = vaddq_s16(s2[24], s2[31]);
 
   // stage 7
-  s1_0 = vaddq_s16(s2_0, s1_15);
-  s1_1 = vaddq_s16(s2_1, s1_14);
-  s1_2 = vaddq_s16(s2_2, s2_13);
-  s1_3 = vaddq_s16(s2_3, s2_12);
-  s1_4 = vaddq_s16(s2_4, s2_11);
-  s1_5 = vaddq_s16(s2_5, s2_10);
-  s1_6 = vaddq_s16(s2_6, s1_9);
-  s1_7 = vaddq_s16(s2_7, s1_8);
-  s1_8 = vsubq_s16(s2_7, s1_8);
-  s1_9 = vsubq_s16(s2_6, s1_9);
-  s1_10 = vsubq_s16(s2_5, s2_10);
-  s1_11 = vsubq_s16(s2_4, s2_11);
-  s1_12 = vsubq_s16(s2_3, s2_12);
-  s1_13 = vsubq_s16(s2_2, s2_13);
-  s1_14 = vsubq_s16(s2_1, s1_14);
-  s1_15 = vsubq_s16(s2_0, s1_15);
+  s1[0] = vaddq_s16(s2[0], s1[15]);
+  s1[1] = vaddq_s16(s2[1], s1[14]);
+  s1[2] = vaddq_s16(s2[2], s2[13]);
+  s1[3] = vaddq_s16(s2[3], s2[12]);
+  s1[4] = vaddq_s16(s2[4], s2[11]);
+  s1[5] = vaddq_s16(s2[5], s2[10]);
+  s1[6] = vaddq_s16(s2[6], s1[9]);
+  s1[7] = vaddq_s16(s2[7], s1[8]);
+  s1[8] = vsubq_s16(s2[7], s1[8]);
+  s1[9] = vsubq_s16(s2[6], s1[9]);
+  s1[10] = vsubq_s16(s2[5], s2[10]);
+  s1[11] = vsubq_s16(s2[4], s2[11]);
+  s1[12] = vsubq_s16(s2[3], s2[12]);
+  s1[13] = vsubq_s16(s2[2], s2[13]);
+  s1[14] = vsubq_s16(s2[1], s1[14]);
+  s1[15] = vsubq_s16(s2[0], s1[15]);
 
-  s1_20 = sub_multiply_shift_and_narrow_s16(s3_27, s2_20, cospi_16_64);
-  s1_27 = add_multiply_shift_and_narrow_s16(s2_20, s3_27, cospi_16_64);
+  s1[20] = sub_multiply_shift_and_narrow_s16(s3[27], s2[20], cospi_16_64);
+  s1[27] = add_multiply_shift_and_narrow_s16(s2[20], s3[27], cospi_16_64);
 
-  s1_21 = sub_multiply_shift_and_narrow_s16(s3_26, s2_21, cospi_16_64);
-  s1_26 = add_multiply_shift_and_narrow_s16(s2_21, s3_26, cospi_16_64);
+  s1[21] = sub_multiply_shift_and_narrow_s16(s3[26], s2[21], cospi_16_64);
+  s1[26] = add_multiply_shift_and_narrow_s16(s2[21], s3[26], cospi_16_64);
 
-  s2_22 = sub_multiply_shift_and_narrow_s16(s3_25, s1_22, cospi_16_64);
-  s1_25 = add_multiply_shift_and_narrow_s16(s1_22, s3_25, cospi_16_64);
+  s2[22] = sub_multiply_shift_and_narrow_s16(s3[25], s1[22], cospi_16_64);
+  s1[25] = add_multiply_shift_and_narrow_s16(s1[22], s3[25], cospi_16_64);
 
-  s2_23 = sub_multiply_shift_and_narrow_s16(s3_24, s1_23, cospi_16_64);
-  s1_24 = add_multiply_shift_and_narrow_s16(s1_23, s3_24, cospi_16_64);
+  s2[23] = sub_multiply_shift_and_narrow_s16(s3[24], s1[23], cospi_16_64);
+  s1[24] = add_multiply_shift_and_narrow_s16(s1[23], s3[24], cospi_16_64);
 
   // final stage
-  out0 = vaddq_s16(s1_0, s2_31);
-  out1 = vaddq_s16(s1_1, s2_30);
-  out2 = vaddq_s16(s1_2, s2_29);
-  out3 = vaddq_s16(s1_3, s2_28);
-  out4 = vaddq_s16(s1_4, s1_27);
-  out5 = vaddq_s16(s1_5, s1_26);
-  out6 = vaddq_s16(s1_6, s1_25);
-  out7 = vaddq_s16(s1_7, s1_24);
+  out[0] = vaddq_s16(s1[0], s2[31]);
+  out[1] = vaddq_s16(s1[1], s2[30]);
+  out[2] = vaddq_s16(s1[2], s2[29]);
+  out[3] = vaddq_s16(s1[3], s2[28]);
+  out[4] = vaddq_s16(s1[4], s1[27]);
+  out[5] = vaddq_s16(s1[5], s1[26]);
+  out[6] = vaddq_s16(s1[6], s1[25]);
+  out[7] = vaddq_s16(s1[7], s1[24]);
 
-  add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, output,
-                       stride);
+  add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6],
+                       out[7], output, stride);
 
-  out0 = vaddq_s16(s1_8, s2_23);
-  out1 = vaddq_s16(s1_9, s2_22);
-  out2 = vaddq_s16(s1_10, s1_21);
-  out3 = vaddq_s16(s1_11, s1_20);
-  out4 = vaddq_s16(s1_12, s2_19);
-  out5 = vaddq_s16(s1_13, s2_18);
-  out6 = vaddq_s16(s1_14, s1_17);
-  out7 = vaddq_s16(s1_15, s1_16);
+  out[0] = vaddq_s16(s1[8], s2[23]);
+  out[1] = vaddq_s16(s1[9], s2[22]);
+  out[2] = vaddq_s16(s1[10], s1[21]);
+  out[3] = vaddq_s16(s1[11], s1[20]);
+  out[4] = vaddq_s16(s1[12], s2[19]);
+  out[5] = vaddq_s16(s1[13], s2[18]);
+  out[6] = vaddq_s16(s1[14], s1[17]);
+  out[7] = vaddq_s16(s1[15], s1[16]);
 
-  add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
-                       output + (8 * stride), stride);
+  add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6],
+                       out[7], output + (8 * stride), stride);
 
-  out0 = vsubq_s16(s1_15, s1_16);
-  out1 = vsubq_s16(s1_14, s1_17);
-  out2 = vsubq_s16(s1_13, s2_18);
-  out3 = vsubq_s16(s1_12, s2_19);
-  out4 = vsubq_s16(s1_11, s1_20);
-  out5 = vsubq_s16(s1_10, s1_21);
-  out6 = vsubq_s16(s1_9, s2_22);
-  out7 = vsubq_s16(s1_8, s2_23);
+  out[0] = vsubq_s16(s1[15], s1[16]);
+  out[1] = vsubq_s16(s1[14], s1[17]);
+  out[2] = vsubq_s16(s1[13], s2[18]);
+  out[3] = vsubq_s16(s1[12], s2[19]);
+  out[4] = vsubq_s16(s1[11], s1[20]);
+  out[5] = vsubq_s16(s1[10], s1[21]);
+  out[6] = vsubq_s16(s1[9], s2[22]);
+  out[7] = vsubq_s16(s1[8], s2[23]);
 
-  add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
-                       output + (16 * stride), stride);
+  add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6],
+                       out[7], output + (16 * stride), stride);
 
-  out0 = vsubq_s16(s1_7, s1_24);
-  out1 = vsubq_s16(s1_6, s1_25);
-  out2 = vsubq_s16(s1_5, s1_26);
-  out3 = vsubq_s16(s1_4, s1_27);
-  out4 = vsubq_s16(s1_3, s2_28);
-  out5 = vsubq_s16(s1_2, s2_29);
-  out6 = vsubq_s16(s1_1, s2_30);
-  out7 = vsubq_s16(s1_0, s2_31);
+  out[0] = vsubq_s16(s1[7], s1[24]);
+  out[1] = vsubq_s16(s1[6], s1[25]);
+  out[2] = vsubq_s16(s1[5], s1[26]);
+  out[3] = vsubq_s16(s1[4], s1[27]);
+  out[4] = vsubq_s16(s1[3], s2[28]);
+  out[5] = vsubq_s16(s1[2], s2[29]);
+  out[6] = vsubq_s16(s1[1], s2[30]);
+  out[7] = vsubq_s16(s1[0], s2[31]);
 
-  add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
-                       output + (24 * stride), stride);
+  add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6],
+                       out[7], output + (24 * stride), stride);
 }
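
The final stage above hands its eight result vectors to add_and_store_u8_s16(), which applies the inverse transform's output rounding and adds the residual into the destination pixels. A minimal single-row sketch of that step, under the same assumptions as above (add_row_u8_s16 is a hypothetical name; the real helper in idct_neon.h processes eight rows per call):

#include <arm_neon.h>

// Hypothetical single-row illustration of the final add-and-store step.
static inline void add_row_u8_s16(const int16x8_t res, uint8_t *dest) {
  const uint8x8_t d = vld1_u8(dest);
  // The 32x32 inverse transform output is scaled by 64, so round-shift by 6,
  // add to the widened destination pixels, then saturate back to 8 bits.
  const int16x8_t shifted = vrshrq_n_s16(res, 6);
  const int16x8_t sum =
      vaddq_s16(shifted, vreinterpretq_s16_u16(vmovl_u8(d)));
  vst1_u8(dest, vqmovun_s16(sum));
}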
 
 void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest,