shithub: opus

--- a/dnn/dump_data.c

+++ b/dnn/dump_data.c

@@ -182,8 +182,8 @@

       int ret;

       unsigned char buf[8];

       float features[4][NB_TOTAL_FEATURES];

-      //int c0_id, main_pitch, modulation, corr_id, vq_end[3], vq_mid, interp_id;

-      //ret = fscanf(f1, "%d %d %d %d %d %d %d %d %d\n", &c0_id, &main_pitch, &modulation, &corr_id, &vq_end[0], &vq_end[1], &vq_end[2], &vq_mid, &interp_id);

+      /*int c0_id, main_pitch, modulation, corr_id, vq_end[3], vq_mid, interp_id;*/

+      /*ret = fscanf(f1, "%d %d %d %d %d %d %d %d %d\n", &c0_id, &main_pitch, &modulation, &corr_id, &vq_end[0], &vq_end[1], &vq_end[2], &vq_mid, &interp_id);*/

       ret = fread(buf, 1, 8, f1);

       if (ret != 8) break;

       decode_packet(features, vq_mem, buf);

@@ -279,7 +279,7 @@

       st->pcount = 0;

-    //if (fpcm) fwrite(pcm, sizeof(short), FRAME_SIZE, fpcm);

+    /*if (fpcm) fwrite(pcm, sizeof(short), FRAME_SIZE, fpcm);*/

     for (i=0;i<TRAINING_OFFSET;i++) pcm[i] = float2short(x[i+FRAME_SIZE-TRAINING_OFFSET]);

     old_speech_gain = speech_gain;

     count++;

--- a/dnn/lpcnet.c

+++ b/dnn/lpcnet.c

@@ -56,21 +56,22 @@

 #ifdef END2END

 void rc2lpc(float *lpc, const float *rc)

+  int i, j, k;

   float tmp[LPC_ORDER];

   float ntmp[LPC_ORDER] = {0.0};

   RNN_COPY(tmp, rc, LPC_ORDER);

-  for(int i = 0; i < LPC_ORDER ; i++)

+  for(i = 0; i < LPC_ORDER ; i++)

-        for(int j = 0; j <= i-1; j++)

+        for(j = 0; j <= i-1; j++)

             ntmp[j] = tmp[j] + tmp[i]*tmp[i - j - 1];

-        for(int k = 0; k <= i-1; k++)

+        for(k = 0; k <= i-1; k++)

             tmp[k] = ntmp[k];

-  for(int i = 0; i < LPC_ORDER ; i++)

+  for(i = 0; i < LPC_ORDER ; i++)

     lpc[i] = tmp[i];

--- a/dnn/lpcnet_dec.c

+++ b/dnn/lpcnet_dec.c

@@ -92,7 +92,7 @@

   int sub;

   int voiced = 1;

   float frame_corr;

-  ;

+  float sign;

   unpacker bits;

   bits_unpacker_init(&bits, buf, 8);

@@ -105,7 +105,7 @@

   vq_end[2] = bits_unpack(&bits, 10);

   vq_mid = bits_unpack(&bits, 13);

   interp_id = bits_unpack(&bits, 3);

-  //fprintf(stdout, "%d %d %d %d %d %d %d %d %d\n", c0_id, main_pitch, modulation, corr_id, vq_end[0], vq_end[1], vq_end[2], vq_mid, interp_id);

+  /*fprintf(stdout, "%d %d %d %d %d %d %d %d %d\n", c0_id, main_pitch, modulation, corr_id, vq_end[0], vq_end[1], vq_end[2], vq_mid, interp_id);*/

   for (i=0;i<4;i++) RNN_CLEAR(&features[i][0], NB_TOTAL_FEATURES);

@@ -133,7 +133,7 @@

     features[3][i+1] = ceps_codebook1[vq_end[0]*NB_BANDS_1 + i] + ceps_codebook2[vq_end[1]*NB_BANDS_1 + i] + ceps_codebook3[vq_end[2]*NB_BANDS_1 + i];

-  float sign = 1;

+  sign = 1;

   if (vq_mid >= 4096) {

     vq_mid -= 4096;

     sign = -1;

--- a/dnn/lpcnet_enc.c

+++ b/dnn/lpcnet_enc.c

@@ -43,9 +43,6 @@

 #include "lpcnet.h"

-//#define NB_FEATURES (NB_BANDS+2+LPC_ORDER)

 #define SURVIVORS 5

@@ -158,10 +155,10 @@

           index2[m][1] = curr_index[m];

           glob_dist[m] = curr_dist[m];

-        //printf("%f ", glob_dist[0]);

+        /*printf("%f ", glob_dist[0]);*/

       } else if (curr_dist[0] < glob_dist[SURVIVORS-1]) {

-        m=0;

         int pos;

+        m=0;

         for (pos=0;pos<SURVIVORS;pos++) {

           if (curr_dist[m] < glob_dist[pos]) {

             int j;

@@ -192,10 +189,10 @@

           index3[m][2] = curr_index[m];

           glob_dist[m] = curr_dist[m];

-        //printf("%f ", glob_dist[0]);

+        /*printf("%f ", glob_dist[0]);*/

       } else if (curr_dist[0] < glob_dist[SURVIVORS-1]) {

-        m=0;

         int pos;

+        m=0;

         for (pos=0;pos<SURVIVORS;pos++) {

           if (curr_dist[m] < glob_dist[pos]) {

             int j;

@@ -217,7 +214,7 @@

     entry[0] = id = index3[0][0];

     entry[1] = id2 = index3[0][1];

     entry[2] = id3 = index3[0][2];

-    //printf("%f ", glob_dist[0]);

+    /*printf("%f ", glob_dist[0]);*/

     for (i=0;i<NB_BANDS_1;i++) {

         x[i] -= ceps_codebook1[id*NB_BANDS_1 + i];

@@ -224,7 +221,7 @@

     for (i=0;i<NB_BANDS_1;i++) {

         x[i] -= ceps_codebook2[id2*NB_BANDS_1 + i];

-    //id3 = vq_quantize(ceps_codebook3, 1024, x, NB_BANDS_1, NULL);

+    /*id3 = vq_quantize(ceps_codebook3, 1024, x, NB_BANDS_1, NULL);*/

     for (i=0;i<NB_BANDS_1;i++) {

         x[i] = ceps_codebook1[id*NB_BANDS_1 + i] + ceps_codebook2[id2*NB_BANDS_1 + i] + ceps_codebook3[id3*NB_BANDS_1 + i];

@@ -304,7 +301,7 @@

     for (i=0;i<NB_BANDS;i++) {

       x[i] = pred[(id&MULTI_MASK)*NB_BANDS + i] + s*codebook[id*NB_BANDS + i];

-    //printf("%d %f ", id&MULTI_MASK, s);

+    /*printf("%d %f ", id&MULTI_MASK, s);*/

     if (0) {

         float err = 0;

         for (i=0;i<NB_BANDS;i++) {

@@ -362,7 +359,7 @@

         best_pred = k;

-    //printf("%d ", best_pred);

+    /*printf("%d ", best_pred);*/

     for (i=0;i<NB_BANDS;i++) {

       x[i] = pred[best_pred*NB_BANDS + i];

@@ -394,7 +391,7 @@

-    //printf("%d %d %f    %d %f\n", id0, id1, dist[0][id0] + dist[1][id1], best_id, min_dist);

+    /*printf("%d %d %f    %d %f\n", id0, id1, dist[0][id0] + dist[1][id1], best_id, min_dist);*/

     return best_id - (best_id >= FORBIDDEN_INTERP);

@@ -531,7 +528,7 @@

     st->pitch_mem[0] = aligned_in[i];

     st->exc_buf[PITCH_MAX_PERIOD+i] = sum + .7*st->pitch_filt;

     st->pitch_filt = sum;

-    //printf("%f\n", st->exc_buf[PITCH_MAX_PERIOD+i]);

+    /*printf("%f\n", st->exc_buf[PITCH_MAX_PERIOD+i]);*/

   /* Cross-correlation on half-frames. */

   for (sub=0;sub<2;sub++) {

@@ -539,7 +536,7 @@

     celt_pitch_xcorr(&st->exc_buf[PITCH_MAX_PERIOD+off], st->exc_buf+off, xcorr, FRAME_SIZE/2, PITCH_MAX_PERIOD);

     ener0 = celt_inner_prod(&st->exc_buf[PITCH_MAX_PERIOD+off], &st->exc_buf[PITCH_MAX_PERIOD+off], FRAME_SIZE/2);

     st->frame_weight[2+2*st->pcount+sub] = ener0;

-    //printf("%f\n", st->frame_weight[2+2*st->pcount+sub]);

+    /*printf("%f\n", st->frame_weight[2+2*st->pcount+sub]);*/

     for (i=0;i<PITCH_MAX_PERIOD;i++) {

       ener = (1 + ener0 + celt_inner_prod(&st->exc_buf[i+off], &st->exc_buf[i+off], FRAME_SIZE/2));

       st->xc[2+2*st->pcount+sub][i] = 2*xcorr[i] / ener;

@@ -619,8 +616,8 @@

     /* Renormalize. */

     for (i=0;i<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;i++) st->pitch_max_path[1][i] -= max_path_all;

-    //for (i=0;i<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;i++) printf("%f ", st->pitch_max_path[1][i]);

-    //printf("\n");

+    /*for (i=0;i<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;i++) printf("%f ", st->pitch_max_path[1][i]);

+    printf("\n");*/

     RNN_COPY(&st->pitch_max_path[0][0], &st->pitch_max_path[1][0], PITCH_MAX_PERIOD);

     st->pitch_max_path_all = max_path_all;

     st->best_i = best_i;

@@ -636,9 +633,9 @@

   frame_corr /= 8;

   if (quantize && frame_corr < 0) frame_corr = 0;

   for (sub=0;sub<8;sub++) {

-    //printf("%d %f\n", best[2+sub], frame_corr);

+    /*printf("%d %f\n", best[2+sub], frame_corr);*/

-  //printf("\n");

+  /*printf("\n");*/

   for (sub=2;sub<10;sub++) {

     w = st->frame_weight[sub];

     sw += w;

@@ -663,7 +660,7 @@

     corr_id = (int)floor(frame_corr/.075f);

     if (quantize) frame_corr = 0.0375f + .075f*corr_id;

-  //best_b = (sxx*sy - sx*sxy)/(sw*sxx - sx*sx);

+  /*best_b = (sxx*sy - sx*sxy)/(sw*sxx - sx*sx);*/

   best_b = (sy - best_a*sx)/sw;

   /* Quantizing the pitch as "main" pitch + slope. */

   center_pitch = best_b+5.5*best_a;

@@ -671,9 +668,9 @@

   main_pitch = IMAX(0, IMIN(63, main_pitch));

   modulation = (int)floor(.5 + 16*7*best_a/center_pitch);

   modulation = IMAX(-3, IMIN(3, modulation));

-  //printf("%d %d\n", main_pitch, modulation);

-  //printf("%f %f\n", best_a/center_pitch, best_corr);

-  //for (sub=2;sub<10;sub++) printf("%f %d %f\n", best_b + sub*best_a, best[sub], best_corr);

+  /*printf("%d %d\n", main_pitch, modulation);*/

+  /*printf("%f %f\n", best_a/center_pitch, best_corr);*/

+  /*for (sub=2;sub<10;sub++) printf("%f %d %f\n", best_b + sub*best_a, best[sub], best_corr);*/

   for (sub=0;sub<4;sub++) {

     if (quantize) {

       float p = pow(2.f, main_pitch/21.)*PITCH_MIN_PERIOD;

@@ -685,13 +682,13 @@

       st->features[sub][NB_BANDS] = .01*(IMAX(66, IMIN(510, best[2+2*sub]+best[2+2*sub+1]))-200);

       st->features[sub][NB_BANDS + 1] = frame_corr-.5;

-    //printf("%f %d %f\n", st->features[sub][NB_BANDS], best[2+2*sub], frame_corr);

+    /*printf("%f %d %f\n", st->features[sub][NB_BANDS], best[2+2*sub], frame_corr);*/

-  //printf("%d %f %f %f\n", best_period, best_a, best_b, best_corr);

+  /*printf("%d %f %f %f\n", best_period, best_a, best_b, best_corr);*/

   RNN_COPY(&st->xc[0][0], &st->xc[8][0], PITCH_MAX_PERIOD);

   RNN_COPY(&st->xc[1][0], &st->xc[9][0], PITCH_MAX_PERIOD);

   if (quantize) {

-    //printf("%f\n", st->features[3][0]);

+    /*printf("%f\n", st->features[3][0]);*/

     c0_id = (int)floor(.5 + st->features[3][0]*4);

     c0_id = IMAX(-64, IMIN(63, c0_id));

     st->features[3][0] = c0_id/4.;

@@ -705,11 +702,11 @@

     lpc_from_cepstrum(st->lpc, st->features[sub]);

     for (i=0;i<LPC_ORDER;i++) st->features[sub][NB_BANDS+2+i] = st->lpc[i];

-  //printf("\n");

+  /*printf("\n");*/

   RNN_COPY(st->vq_mem, &st->features[3][0], NB_BANDS);

   if (encode) {

     packer bits;

-    //fprintf(stdout, "%d %d %d %d %d %d %d %d %d\n", c0_id+64, main_pitch, voiced ? modulation+4 : 0, corr_id, vq_end[0], vq_end[1], vq_end[2], vq_mid, interp_id);

+    /*fprintf(stdout, "%d %d %d %d %d %d %d %d %d\n", c0_id+64, main_pitch, voiced ? modulation+4 : 0, corr_id, vq_end[0], vq_end[1], vq_end[2], vq_mid, interp_id);*/

     bits_packer_init(&bits, buf, 8);

     bits_pack(&bits, c0_id+64, 7);

     bits_pack(&bits, main_pitch, 6);

@@ -765,8 +762,8 @@

     /* Renormalize. */

     for (i=0;i<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;i++) st->pitch_max_path[1][i] -= max_path_all;

-    //for (i=0;i<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;i++) printf("%f ", st->pitch_max_path[1][i]);

-    //printf("\n");

+    /*for (i=0;i<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;i++) printf("%f ", st->pitch_max_path[1][i]);

+    printf("\n");*/

     RNN_COPY(&st->pitch_max_path[0][0], &st->pitch_max_path[1][0], PITCH_MAX_PERIOD);

     st->pitch_max_path_all = max_path_all;

     st->best_i = best_i;

@@ -783,12 +780,12 @@

   for (sub=0;sub<4;sub++) {

     st->features[sub][NB_BANDS] = .01*(IMAX(66, IMIN(510, best[2+2*sub]+best[2+2*sub+1]))-200);

     st->features[sub][NB_BANDS + 1] = frame_corr-.5;

-    //printf("%f %d %f\n", st->features[sub][NB_BANDS], best[2+2*sub], frame_corr);

+    /*printf("%f %d %f\n", st->features[sub][NB_BANDS], best[2+2*sub], frame_corr);*/

-  //printf("%d %f %f %f\n", best_period, best_a, best_b, best_corr);

+  /*printf("%d %f %f %f\n", best_period, best_a, best_b, best_corr);*/

   RNN_COPY(&st->xc[0][0], &st->xc[8][0], PITCH_MAX_PERIOD);

   RNN_COPY(&st->xc[1][0], &st->xc[9][0], PITCH_MAX_PERIOD);

-  //printf("\n");

+  /*printf("\n");*/

   RNN_COPY(st->vq_mem, &st->features[3][0], NB_BANDS);

   if (ffeat) {

     for (i=0;i<4;i++) {

@@ -833,8 +830,8 @@

     /* Renormalize. */

     for (i=0;i<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;i++) st->pitch_max_path[1][i] -= max_path_all;

-    //for (i=0;i<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;i++) printf("%f ", st->pitch_max_path[1][i]);

-    //printf("\n");

+    /*for (i=0;i<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;i++) printf("%f ", st->pitch_max_path[1][i]);

+    printf("\n");*/

     RNN_COPY(&st->pitch_max_path[0][0], &st->pitch_max_path[1][0], PITCH_MAX_PERIOD);

     st->pitch_max_path_all = max_path_all;

     st->best_i = best_i;

--- a/dnn/lpcnet_plc.c

+++ b/dnn/lpcnet_plc.c

@@ -57,7 +57,7 @@

   short output[FRAME_SIZE];

   st->enc.pcount = 0;

   if (st->skip_analysis) {

-    //fprintf(stderr, "skip update\n");

+    /*fprintf(stderr, "skip update\n");*/

     if (st->blend) {

       short tmp[FRAME_SIZE-TRAINING_OFFSET];

       lpcnet_synthesize_tail_impl(&st->lpcnet, tmp, FRAME_SIZE-TRAINING_OFFSET, 0);

@@ -73,10 +73,10 @@

       RNN_COPY(&st->pcm[st->pcm_fill], pcm, FRAME_SIZE);

       st->pcm_fill += FRAME_SIZE;

-    //fprintf(stderr, "fill at %d\n", st->pcm_fill);

+    /*fprintf(stderr, "fill at %d\n", st->pcm_fill);*/

   /* Update state. */

-  //fprintf(stderr, "update state\n");

+  /*fprintf(stderr, "update state\n");*/

   for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i];

   preemphasis(x, &st->enc.mem_preemph, x, PREEMPHASIS, FRAME_SIZE);

   compute_frame_features(&st->enc, x);

@@ -105,7 +105,7 @@

   /* If we concealed the previous frame, finish synthesizing the rest of the samples. */

   /* FIXME: Copy/predict features. */

   while (st->pcm_fill > 0) {

-    //fprintf(stderr, "update state for PLC %d\n", st->pcm_fill);

+    /*fprintf(stderr, "update state for PLC %d\n", st->pcm_fill);*/

     int update_count;

     update_count = IMIN(st->pcm_fill, FRAME_SIZE);

     RNN_COPY(output, &st->pcm[0], update_count);

--- a/dnn/nnet.c

+++ b/dnn/nnet.c

@@ -144,6 +144,8 @@

 int sample_mdense(const MDenseLayer *layer, const float *input, const float *sampling_logit_table, kiss99_ctx *rng)

    int b, j, N, M, C, stride;

+   int val=0;

+   float thresholds[8];

    M = layer->nb_inputs;

    N = layer->nb_neurons;

    C = layer->nb_channels;

@@ -151,8 +153,6 @@

    stride = M*C;

    celt_assert(N <= DUAL_FC_OUT_SIZE);

-   int val=0;

-   float thresholds[8];

    /* Computing all the random thresholds in advance. These thresholds are directly

       based on the logit to avoid computing the sigmoid.*/

@@ -181,7 +181,7 @@

       sum1 = layer->factor[i]*tanh_approx(sum1);

       sum2 = layer->factor[N + i]*tanh_approx(sum2);

       sum1 += sum2;

-      //sum1 = 1.f/(1 + exp(-sum1));

+      /*sum1 = 1.f/(1 + exp(-sum1));*/

 #if 1 /* Sample the decision based on the logit. */

       bit = thresholds[b] < sum1;

 #else

--- a/dnn/pitch.c

+++ b/dnn/pitch.c

@@ -37,9 +37,6 @@

 #include "pitch.h"

 #include "common.h"

-//#include "modes.h"

-//#include "stack_alloc.h"

-//#include "mathops.h"

 #include "celt_lpc.h"

 #include "math.h"

--- a/dnn/pitch.h

+++ b/dnn/pitch.h

@@ -34,8 +34,6 @@

 #ifndef PITCH_H

 #define PITCH_H

-//#include "modes.h"

-//#include "cpu_support.h"

 #include "arch.h"

 /* OPT: This is the kernel you really want to optimize. It gets used a lot

--- a/dnn/vec.h

+++ b/dnn/vec.h

@@ -47,7 +47,7 @@

 #ifndef DISABLE_DOT_PROD

 #define DOT_PROD

-//#define USE_SU_BIAS

+/*#define USE_SU_BIAS*/

 #endif

 #ifdef DOT_PROD

--- a/dnn/vec_avx.h

+++ b/dnn/vec_avx.h

@@ -320,8 +320,6 @@

        __m256 xf;

        __m256i xi;

        xf = _mm256_loadu_ps(&_x[i]);

-       //xf = _mm256_mul_ps(xf, const127);

-       //xf = _mm256_add_ps(xf, const127);

        xf = _mm256_fmadd_ps(xf, const127, const127);

        xi = _mm256_cvtps_epi32(xf);

        xi = _mm256_packus_epi32(xi,  _mm256_setzero_si256());

@@ -328,7 +326,6 @@

        xi = _mm256_permute4x64_epi64(xi, 0xD8);

        xi = _mm256_packus_epi16(xi, _mm256_setzero_si256());

        xi = _mm256_permutevar8x32_epi32(xi, _mm256_setr_epi32(0,1, 0,0, 0,0, 0,0));

-       //xi = _mm256_permute4x64_epi64(xi, 0x);

        _mm256_storeu_si256 ((__m256i *)&x[i], xi);

@@ -618,7 +615,7 @@

    int i, j;

    for (i=0;i<rows;i+=16)

-      float * restrict y;

+      float *y;

       __m256 vy0, vy8;

       y = &out[i];

       vy0 = _mm256_loadu_ps(&y[0]);

@@ -644,7 +641,7 @@

    int i, j;

    for (i=0;i<rows;i+=16)

-      float * restrict y;

+      float *y;

       int cols;

       __m256 vy0, vy8;

       y = &out[i];

@@ -692,7 +689,7 @@

    unsigned char x[MAX_INPUTS];

    (void)col_stride;

    ones = _mm256_set1_epi16(1);

-   //for (i=0;i<cols;i++) x[i] = 127+floor(.5+127*_x[i]);

+   /*for (i=0;i<cols;i++) x[i] = 127+floor(.5+127*_x[i]);*/

    vector_ps_to_epi8(x, _x, cols);

    for (i=0;i<rows;i+=8)

@@ -709,26 +706,26 @@

          __m256i vxj;

          __m256i vw;

          vxj = _mm256_set1_epi32(*(int*)&x[j]);

-         vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?

-         tmp = _mm256_maddubs_epi16(vxj, vw); //swap?

+         vw = _mm256_loadu_si256((const __m256i *)w);

+         tmp = _mm256_maddubs_epi16(vxj, vw);

          tmp = _mm256_madd_epi16(tmp, ones);

          vy0 = _mm256_add_epi32(vy0, tmp);

          w += 32;

          vxj = _mm256_set1_epi32(*(int*)&x[j+4]);

-         vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?

-         tmp = _mm256_maddubs_epi16(vxj, vw); //swap?

+         vw = _mm256_loadu_si256((const __m256i *)w);

+         tmp = _mm256_maddubs_epi16(vxj, vw);

          tmp = _mm256_madd_epi16(tmp, ones);

          vy0 = _mm256_add_epi32(vy0, tmp);

          w += 32;

          vxj = _mm256_set1_epi32(*(int*)&x[j+8]);

-         vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?

-         tmp = _mm256_maddubs_epi16(vxj, vw); //swap?

+         vw = _mm256_loadu_si256((const __m256i *)w);

+         tmp = _mm256_maddubs_epi16(vxj, vw);

          tmp = _mm256_madd_epi16(tmp, ones);

          vy0 = _mm256_add_epi32(vy0, tmp);

          w += 32;

          vxj = _mm256_set1_epi32(*(int*)&x[j+12]);

-         vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?

-         tmp = _mm256_maddubs_epi16(vxj, vw); //swap?

+         vw = _mm256_loadu_si256((const __m256i *)w);

+         tmp = _mm256_maddubs_epi16(vxj, vw);

          tmp = _mm256_madd_epi16(tmp, ones);

          vy0 = _mm256_add_epi32(vy0, tmp);

          w += 32;

@@ -740,8 +737,8 @@

          __m256i vxj;

          __m256i vw;

          vxj = _mm256_set1_epi32(*(int*)&x[j]);

-         vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?

-         tmp = _mm256_maddubs_epi16(vxj, vw); //swap?

+         vw = _mm256_loadu_si256((const __m256i *)w);

+         tmp = _mm256_maddubs_epi16(vxj, vw);

          tmp = _mm256_madd_epi16(tmp, ones);

          vy0 = _mm256_add_epi32(vy0, tmp);

          w += 32;

@@ -763,7 +760,7 @@

       for (j=0;j<cols;j+=4)

-         float * restrict y;

+         float *y;

          float xj0, xj1, xj2, xj3;

          xj0 = x[j+0];

          xj1 = x[j+1];

@@ -791,7 +788,7 @@

    int i, j;

    unsigned char x[MAX_INPUTS];

    ones = _mm256_set1_epi16(1);

-   //for (i=0;i<cols;i++) x[i] = 127+floor(.5+127*_x[i]);

+   /*for (i=0;i<cols;i++) x[i] = 127+floor(.5+127*_x[i]);*/

    vector_ps_to_epi8(x, _x, cols);

    for (i=0;i<rows;i+=8)

@@ -810,26 +807,26 @@

          __m256i vxj;

          __m256i vw;

          vxj = _mm256_set1_epi32(*(int*)&x[*idx++]);

-         vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?

-         tmp = _mm256_maddubs_epi16(vxj, vw); //swap?

+         vw = _mm256_loadu_si256((const __m256i *)w);

+         tmp = _mm256_maddubs_epi16(vxj, vw);

          tmp = _mm256_madd_epi16(tmp, ones);

          vy0 = _mm256_add_epi32(vy0, tmp);

          w += 32;

          vxj = _mm256_set1_epi32(*(int*)&x[*idx++]);

-         vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?

-         tmp = _mm256_maddubs_epi16(vxj, vw); //swap?

+         vw = _mm256_loadu_si256((const __m256i *)w);

+         tmp = _mm256_maddubs_epi16(vxj, vw);

          tmp = _mm256_madd_epi16(tmp, ones);

          vy0 = _mm256_add_epi32(vy0, tmp);

          w += 32;

          vxj = _mm256_set1_epi32(*(int*)&x[*idx++]);

-         vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?

-         tmp = _mm256_maddubs_epi16(vxj, vw); //swap?

+         vw = _mm256_loadu_si256((const __m256i *)w);

+         tmp = _mm256_maddubs_epi16(vxj, vw);

          tmp = _mm256_madd_epi16(tmp, ones);

          vy0 = _mm256_add_epi32(vy0, tmp);

          w += 32;

          vxj = _mm256_set1_epi32(*(int*)&x[*idx++]);

-         vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?

-         tmp = _mm256_maddubs_epi16(vxj, vw); //swap?

+         vw = _mm256_loadu_si256((const __m256i *)w);

+         tmp = _mm256_maddubs_epi16(vxj, vw);

          tmp = _mm256_madd_epi16(tmp, ones);

          vy0 = _mm256_add_epi32(vy0, tmp);

          w += 32;

@@ -843,8 +840,8 @@

          int pos;

          pos = (*idx++);

          vxj = _mm256_set1_epi32(*(int*)&x[pos]);

-         vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?

-         tmp = _mm256_maddubs_epi16(vxj, vw); //swap?

+         vw = _mm256_loadu_si256((const __m256i *)w);

+         tmp = _mm256_maddubs_epi16(vxj, vw);

          tmp = _mm256_madd_epi16(tmp, ones);

          vy0 = _mm256_add_epi32(vy0, tmp);

          w += 32;

@@ -866,7 +863,7 @@

    (void)ignore;

    for (i=0;i<rows;i+=8)

-      float * restrict y;

+      float *y;

       int cols;

       __m256 vy0;

       y = &out[i];

--

⑨