ref: 3efafed515a2aa862e36d62e99b7d81e9a34b1d1
parent: b93f6ec1744a597a46183f56d5bc3f127fabeaca
author: robs <robs>
date: Sun Jun 14 07:32:34 EDT 2009
doc updates
--- a/ChangeLog
+++ b/ChangeLog
@@ -85,7 +85,7 @@
o New `fir' filter effect using external coefficients/file. (robs)
o New `biquad' filter effect using external coefficients. (robs)
o New `overdrive' effect. (robs)
- o New `vad' Voice Activity Detector effect (undocumented as yet). (robs)
+ o New `vad' Voice Activity Detector effect. (robs)
o `synth' enhancements: can now set common parameters for multiple
channels, new `pluck' and `tpdf' types, `scientific' note
notation, [2778142] just intonation. (robs)
--- a/NEWS
+++ b/NEWS
@@ -6,12 +6,12 @@
Release highlights include:
o New filter effects: `sinc', `fir', `biquad'.
- o Other new effects: `stats', `overdrive'.
+ o Other new effects: `stats', `overdrive', `vad'.
o New audio device handler for OpenBSD.
o Fixed problems with temporary file on Windows.
o Can now enable automated clipping protection for most effects.
o Automatically `dither' as needed.
- o Improvements to AIFF, WAV, FLAC, MP3 handlers
+ o Improvements to AIFF, WAV, FLAC, MP3 handlers.
o ALSA driver now supports 24-bit.
o `spectrogram' effect enhancements including multi-channel support.
o `synth' effect enhancements including new `pluck' type.
--- a/scripts/synth.sh
+++ b/scripts/synth.sh
@@ -16,6 +16,11 @@
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+# Demonstrates the use of some of the features new in SoX 14.3.0, viz
+# nested SoX commands, the synth `pluck' type, and the overdrive
+# effect (also used are several other effects).
+# Music (c) 2008 robs@users.sourceforge.net. All rights reserved.
+
sox=../src/sox
G0="pl %-26 pl %-19 pl %-14 pl %-10 pl %-7 pl %-2"
--- a/sox.1
+++ b/sox.1
@@ -3891,6 +3891,100 @@
it. A value of 8000s will wait until 8000 samples are read before
starting to process audio.
.TP
+\fBvad \fR[\fIoptions\fR]
+Voice Activity Detector. Attempts to trim silence and quiet
+background sounds from the ends of (fairly high resolution
+i.e. 16-bit, 44-48kHz) recordings of speech. The algorithm currently
+uses a simple cepstral power measurement to detect voice, so may be
+fooled by other things, especially music. The effect can trim only
+from the front of the audio, so in order to trim from the back, the
+.B reverse
+effect must also be used. E.g.
+.EX
+ play speech.wav norm vad
+.EE
+to trim from the front,
+.EX
+ play speech.wav norm reverse vad reverse
+.EE
+to trim from the back, and
+.EX
+ play speech.wav norm vad reverse vad reverse
+.EE
+to trim from both ends. The use of the
+.B norm
+effect is recommended, but remember that neither
+.B reverse
+nor
+.B norm
+is suitable for use with streamed audio.
+.SP
+.I Options:
+.br
+Default values are shown in parentheses.
+.RS
+.IP \fB\-t\ \fInum\fR\ (7)
+The measurement level used to trigger activity detection. This might
+need to be changed depending on the noise level, signal level and
+other characteristics of the input audio.
+.IP \fB\-T\ \fInum\fR\ (0.25)
+The time constant (in seconds) used to help ignore short bursts of
+sound.
+.IP \fB\-s\ \fInum\fR\ (1)
+The amount of audio (in seconds) to search for quieter/shorter bursts
+of audio to include prior to the detected trigger point.
+.IP \fB\-g\ \fInum\fR\ (0.25)
+Allowed gap (in seconds) between quieter/shorter bursts of audio to
+include prior to the detected trigger point.
+.IP \fB\-p\ \fInum\fR\ (0)
+The amount of audio (in seconds) to preserve before the trigger point
+and any found quieter/shorter bursts.
+.RE
+.TP
+\
+.I Advanced Options:
+.br
+These allow fine tuning of the algorithm's internal parameters.
+.RS
+.IP \fB\-b\ \fInum\fR
+The algorithm (internally) uses adaptive noise estimation/reduction in
+order to detect the start of the wanted audio. This option sets the
+time for the initial noise estimate.
+.IP \fB\-N\ \fInum\fR
+Time constant used by the adaptive noise estimator for when the noise
+level is increasing.
+.IP \fB\-n\ \fInum\fR
+Time constant used by the adaptive noise estimator for when the noise
+level is decreasing.
+.IP \fB\-r\ \fInum\fR
+Amount of noise reduction to use in the detection algorithm (e.g. 0,
+0.5, ...).
+.IP \fB\-f\ \fInum\fR
+Frequency of the algorithm's processing/measurements.
+.IP \fB\-m\ \fInum\fR
+Measurement duration; by default, twice the measurement period; i.e.
+with overlap.
+.IP \fB\-M\ \fInum\fR
+Time constant used to smooth spectral measurements.
+.IP \fB\-h\ \fInum\fR
+`Brick-wall' frequency of high-pass filter applied at the input to the
+detector algorithm.
+.IP \fB\-l\ \fInum\fR
+`Brick-wall' frequency of low-pass filter applied at the input to the
+detector algorithm.
+.IP \fB\-H\ \fInum\fR
+`Brick-wall' quefrency of high-pass lifter used in the detector
+algorithm.
+.IP \fB\-L\ \fInum\fR
+`Brick-wall' quefrency of low-pass lifter used in the detector
+algorithm.
+.RE
+.TP
+\
+See also the
+.B silence
+effect.
+.TP
\fBvol \fIgain\fR [\fItype\fR [\fIlimitergain\fR]]
Apply an amplification or an attenuation to the audio signal.
Unlike the
--- a/src/vad.c
+++ b/src/vad.c
@@ -35,8 +35,8 @@
unsigned measureTimer_ns, measureLen_ws, measureLen_ns;
unsigned spectrumStart, spectrumEnd, cepstrumStart, cepstrumEnd; /* bins */
int bootCountMax, bootCount;
- double measureTcMult, triggerMeasTcMult;
double noiseTcUpMult, noiseTcDownMult;
+ double measureTcMult, triggerMeasTcMult;
double * spectrumWindow, * cepstrumWindow;
chan_t * channels;
} priv_t;
@@ -155,7 +155,7 @@
p->bootCountMax = p->bootTime * p->measureFreq - .5;
p->measureTimer_ns = p->measureLen_ns;
- p->flushedLen_ns = p->samplesIndex_ns = 0;
+ p->bootCount = p->measuresIndex = p->flushedLen_ns = p->samplesIndex_ns = 0;
return SOX_SUCCESS;
}
@@ -302,6 +302,12 @@
};
static char const * lines[] = {
"[options]",
+ "\t-t trigger-level (7)",
+ "\t-T trigger-time-constant (0.25 s)",
+ "\t-s search-time (1 s)",
+ "\t-g allowed-gap (0.25 s)",
+ "\t-p pre-trigger-time (0 s)",
+ "Advanced options:",
"\t-b noise-est-boot-time (0.35 s)",
"\t-N noise-est-time-constant-up (0.1 s)",
"\t-n noise-est-time-constant-down (0.01 s)",
@@ -313,11 +319,6 @@
"\t-l low-pass-filter (6000 Hz)",
"\t-H high-pass-lifter (150 Hz)",
"\t-L low-pass-lifter (2000 Hz)",
- "\t-T trigger-time-constant (0.25 s)",
- "\t-t trigger-level (7)",
- "\t-s search-time (1 s)",
- "\t-g allowed-gap (0.25 s)",
- "\t-p pre-trigger-time (0 s)",
};
static char * usage;
handler.usage = lsx_usage_lines(&usage, lines, array_length(lines));