shithub: sox

Download patch

ref: 3efafed515a2aa862e36d62e99b7d81e9a34b1d1
parent: b93f6ec1744a597a46183f56d5bc3f127fabeaca
author: robs <robs>
date: Sun Jun 14 07:32:34 EDT 2009

doc updates

--- a/ChangeLog
+++ b/ChangeLog
@@ -85,7 +85,7 @@
   o New `fir' filter effect using external coefficients/file.  (robs)
   o New `biquad' filter effect using external coefficients.  (robs)
   o New `overdrive' effect.  (robs)
-  o New `vad' Voice Activity Detector effect (undocumented as yet).  (robs)
+  o New `vad' Voice Activity Detector effect.  (robs)
   o `synth' enhancements: can now set common parameters for multiple
     channels, new `pluck' and `tpdf' types, `scientific' note
     notation, [2778142] just intonation.  (robs)
--- a/NEWS
+++ b/NEWS
@@ -6,12 +6,12 @@
 Release highlights include:
  
   o New filter effects: `sinc', `fir', `biquad'.
-  o Other new effects: `stats', `overdrive'.
+  o Other new effects: `stats', `overdrive', `vad'.
   o New audio device handler for OpenBSD.
   o Fixed problems with temporary file on Windows.
   o Can now enable automated clipping protection for most effects.
   o Automatically `dither' as needed.
-  o Improvements to AIFF, WAV, FLAC, MP3 handlers
+  o Improvements to AIFF, WAV, FLAC, MP3 handlers.
   o ALSA driver now supports 24-bit.
   o `spectrogram' effect enhancements including multi-channel support.
   o `synth' effect enhancements including new `pluck' type.
--- a/scripts/synth.sh
+++ b/scripts/synth.sh
@@ -16,6 +16,11 @@
 # with this program; if not, write to the Free Software Foundation, Inc.,
 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
+# Demonstrates the use of some of the features new in SoX 14.3.0, viz
+# nested SoX commands, the synth `pluck' type, and the overdrive
+# effect (also used are several other effects).
+# Music (c) 2008 robs@users.sourceforge.net.  All rights reserved.
+
 sox=../src/sox
 
 G0="pl %-26 pl %-19 pl %-14 pl %-10 pl %-7 pl %-2"
--- a/sox.1
+++ b/sox.1
@@ -3891,6 +3891,100 @@
 it.  A value of 8000s will wait until 8000 samples are read before
 starting to process audio.
 .TP
+\fBvad \fR[\fIoptions\fR]
+Voice Activity Detector.  Attempts to trim silence and quiet
+background sounds from the ends of (fairly high resolution
+i.e. 16-bit, 44-48kHz) recordings of speech.  The algorithm currently
+uses a simple cepstral power measurement to detect voice, so may be
+fooled by other things, especially music.  The effect can trim only
+from the front of the audio, so in order to trim from the back, the
+.B reverse
+effect must also be used.  E.g.
+.EX
+   play speech.wav norm vad
+.EE
+to trim from the front,
+.EX
+   play speech.wav norm reverse vad reverse
+.EE
+to trim from the back, and
+.EX
+   play speech.wav norm vad reverse vad reverse
+.EE
+to trim from both ends.  The use of the
+.B norm
+effect is recommended, but remember that neither
+.B reverse
+nor
+.B norm
+is suitable for use with streamed audio.
+.SP
+.I Options:
+.br
+Default values are shown in parentheses.
+.RS
+.IP \fB\-t\ \fInum\fR\ (7)
+The measurement level used to trigger activity detection.  This might
+need to be changed depending on the noise level, signal level and
+other characteristics of the input audio.
+.IP \fB\-T\ \fInum\fR\ (0.25)
+The time constant (in seconds) used to help ignore short bursts of
+sound.
+.IP \fB\-s\ \fInum\fR\ (1)
+The amount of audio (in seconds) to search for quieter/shorter bursts
+of audio to include prior to the detected trigger point.
+.IP \fB\-g\ \fInum\fR\ (0.25)
+Allowed gap (in seconds) between quieter/shorter bursts of audio to
+include prior to the detected trigger point.
+.IP \fB\-p\ \fInum\fR\ (0)
+The amount of audio (in seconds) to preserve before the trigger point
+and any found quieter/shorter bursts.
+.RE
+.TP
+\ 
+.I Advanced Options:
+.br
+These allow fine tuning of the algorithm's internal parameters.
+.RS
+.IP \fB\-b\ \fInum\fR
+The algorithm (internally) uses adaptive noise estimation/reduction in
+order to detect the start of the wanted audio.  This option sets the
+time for the initial noise estimate.
+.IP \fB\-N\ \fInum\fR
+Time constant used by the adaptive noise estimator for when the noise
+level is increasing.
+.IP \fB\-n\ \fInum\fR
+Time constant used by the adaptive noise estimator for when the noise
+level is decreasing.
+.IP \fB\-r\ \fInum\fR
+Amount of noise reduction to use in the detection algorithm (e.g. 0,
+0.5, ...).
+.IP \fB\-f\ \fInum\fR
+Frequency of the algorithm's processing/measurements.
+.IP \fB\-m\ \fInum\fR
+Measurement duration; by default, twice the measurement period; i.e.
+with overlap.
+.IP \fB\-M\ \fInum\fR
+Time constant used to smooth spectral measurements.
+.IP \fB\-h\ \fInum\fR
+`Brick-wall' frequency of high-pass filter applied at the input to the
+detector algorithm.
+.IP \fB\-l\ \fInum\fR
+`Brick-wall' frequency of low-pass filter applied at the input to the
+detector algorithm.
+.IP \fB\-H\ \fInum\fR
+`Brick-wall' quefrency of high-pass lifter used in the detector
+algorithm.
+.IP \fB\-L\ \fInum\fR
+`Brick-wall' quefrency of low-pass lifter used in the detector
+algorithm.
+.RE
+.TP
+\ 
+See also the
+.B silence
+effect.
+.TP
 \fBvol \fIgain\fR [\fItype\fR [\fIlimitergain\fR]]
 Apply an amplification or an attenuation to the audio signal.
 Unlike the
--- a/src/vad.c
+++ b/src/vad.c
@@ -35,8 +35,8 @@
   unsigned  measureTimer_ns, measureLen_ws, measureLen_ns;
   unsigned  spectrumStart, spectrumEnd, cepstrumStart, cepstrumEnd; /* bins */
   int       bootCountMax, bootCount;
-  double    measureTcMult, triggerMeasTcMult;
   double    noiseTcUpMult, noiseTcDownMult;
+  double    measureTcMult, triggerMeasTcMult;
   double    * spectrumWindow, * cepstrumWindow;
   chan_t    * channels;
 } priv_t;
@@ -155,7 +155,7 @@
 
   p->bootCountMax = p->bootTime * p->measureFreq - .5;
   p->measureTimer_ns = p->measureLen_ns;
-  p->flushedLen_ns = p->samplesIndex_ns = 0;
+  p->bootCount = p->measuresIndex = p->flushedLen_ns = p->samplesIndex_ns = 0;
   return SOX_SUCCESS;
 }
 
@@ -302,6 +302,12 @@
   };
   static char const * lines[] = {
     "[options]",
+    "\t-t trigger-level                (7)",
+    "\t-T trigger-time-constant        (0.25 s)",
+    "\t-s search-time                  (1 s)",
+    "\t-g allowed-gap                  (0.25 s)",
+    "\t-p pre-trigger-time             (0 s)",
+    "Advanced options:",
     "\t-b noise-est-boot-time          (0.35 s)",
     "\t-N noise-est-time-constant-up   (0.1 s)",
     "\t-n noise-est-time-constant-down (0.01 s)",
@@ -313,11 +319,6 @@
     "\t-l low-pass-filter              (6000 Hz)",
     "\t-H high-pass-lifter             (150 Hz)",
     "\t-L low-pass-lifter              (2000 Hz)",
-    "\t-T trigger-time-constant        (0.25 s)",
-    "\t-t trigger-level                (7)",
-    "\t-s search-time                  (1 s)",
-    "\t-g allowed-gap                  (0.25 s)",
-    "\t-p pre-trigger-time             (0 s)",
   };
   static char * usage;
   handler.usage = lsx_usage_lines(&usage, lines, array_length(lines));