ref: 0962cbe2ae535f8432fec37461ca006f113f200d
parent: 8bca154ba09d6cf10f0c92e1acca303f76a66b04
author: Felicia Lim <flim@google.com>
date: Thu Oct 27 13:03:36 EDT 2016
Support encoding 80/100/120 ms frame lengths Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca>
--- a/include/opus_defines.h
+++ b/include/opus_defines.h
@@ -209,6 +209,9 @@
#define OPUS_FRAMESIZE_20_MS 5004 /**< Use 20 ms frames */
#define OPUS_FRAMESIZE_40_MS 5005 /**< Use 40 ms frames */
#define OPUS_FRAMESIZE_60_MS 5006 /**< Use 60 ms frames */
+#define OPUS_FRAMESIZE_80_MS 5007 /**< Use 80 ms frames */
+#define OPUS_FRAMESIZE_100_MS 5008 /**< Use 100 ms frames */
+#define OPUS_FRAMESIZE_120_MS 5009 /**< Use 120 ms frames */
/**@}*/
@@ -567,6 +570,9 @@
* <dt>OPUS_FRAMESIZE_20_MS</dt><dd>Use 20 ms frames.</dd>
* <dt>OPUS_FRAMESIZE_40_MS</dt><dd>Use 40 ms frames.</dd>
* <dt>OPUS_FRAMESIZE_60_MS</dt><dd>Use 60 ms frames.</dd>
+ * <dt>OPUS_FRAMESIZE_80_MS</dt><dd>Use 80 ms frames.</dd>
+ * <dt>OPUS_FRAMESIZE_100_MS</dt><dd>Use 100 ms frames.</dd>
+ * <dt>OPUS_FRAMESIZE_120_MS</dt><dd>Use 120 ms frames.</dd>
* <dt>OPUS_FRAMESIZE_VARIABLE</dt><dd>Optimize the frame size dynamically.</dd>
* </dl>
* @hideinitializer */
@@ -582,6 +588,9 @@
* <dt>OPUS_FRAMESIZE_20_MS</dt><dd>Use 20 ms frames.</dd>
* <dt>OPUS_FRAMESIZE_40_MS</dt><dd>Use 40 ms frames.</dd>
* <dt>OPUS_FRAMESIZE_60_MS</dt><dd>Use 60 ms frames.</dd>
+ * <dt>OPUS_FRAMESIZE_80_MS</dt><dd>Use 80 ms frames.</dd>
+ * <dt>OPUS_FRAMESIZE_100_MS</dt><dd>Use 100 ms frames.</dd>
+ * <dt>OPUS_FRAMESIZE_120_MS</dt><dd>Use 120 ms frames.</dd>
* <dt>OPUS_FRAMESIZE_VARIABLE</dt><dd>Optimize the frame size dynamically.</dd>
* </dl>
* @hideinitializer */
--- a/src/opus_demo.c
+++ b/src/opus_demo.c
@@ -57,7 +57,7 @@
fprintf(stderr, "-variable-duration : enable frames of variable duration (experimental, experts only); default: disabled\n" );
fprintf(stderr, "-delayed-decision : use look-ahead for speech/music detection (experts only); default: disabled\n" );
fprintf(stderr, "-bandwidth <NB|MB|WB|SWB|FB> : audio bandwidth (from narrowband to fullband); default: sampling rate\n" );
- fprintf(stderr, "-framesize <2.5|5|10|20|40|60> : frame size in ms; default: 20 \n" );
+ fprintf(stderr, "-framesize <2.5|5|10|20|40|60|80|100|120> : frame size in ms; default: 20 \n" );
fprintf(stderr, "-max_payload <bytes> : maximum payload size in bytes, default: 1024\n" );
fprintf(stderr, "-complexity <comp> : complexity, 0 (lowest) ... 10 (highest); default: 10\n" );
fprintf(stderr, "-inbandfec : enable SILK inband FEC\n" );
@@ -383,9 +383,15 @@
frame_size = sampling_rate/25;
else if (strcmp(argv[ args + 1 ], "60")==0)
frame_size = 3*sampling_rate/50;
+ else if (strcmp(argv[ args + 1 ], "80")==0)
+ frame_size = 4*sampling_rate/50;
+ else if (strcmp(argv[ args + 1 ], "100")==0)
+ frame_size = 5*sampling_rate/50;
+ else if (strcmp(argv[ args + 1 ], "120")==0)
+ frame_size = 6*sampling_rate/50;
else {
fprintf(stderr, "Unsupported frame size: %s ms. "
- "Supported are 2.5, 5, 10, 20, 40, 60.\n",
+ "Supported are 2.5, 5, 10, 20, 40, 60, 80, 100, 120.\n",
argv[ args + 1 ]);
return EXIT_FAILURE;
}
@@ -612,8 +618,14 @@
variable_duration = OPUS_FRAMESIZE_20_MS;
else if (frame_size==sampling_rate/25)
variable_duration = OPUS_FRAMESIZE_40_MS;
- else
+ else if (frame_size==3*sampling_rate/50)
variable_duration = OPUS_FRAMESIZE_60_MS;
+ else if (frame_size==4*sampling_rate/50)
+ variable_duration = OPUS_FRAMESIZE_80_MS;
+ else if (frame_size==5*sampling_rate/50)
+ variable_duration = OPUS_FRAMESIZE_100_MS;
+ else
+ variable_duration = OPUS_FRAMESIZE_120_MS;
opus_encoder_ctl(enc, OPUS_SET_EXPERT_FRAME_DURATION(variable_duration));
}
frame_size = 2*48000;
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -115,6 +115,7 @@
int nb_no_activity_frames;
opus_val32 peak_signal_energy;
#endif
+ int nonfinal_frame; /* current frame is not the final in a packet */
opus_uint32 rangeFinal;
};
@@ -863,14 +864,20 @@
new_size = frame_size;
else if (variable_duration == OPUS_FRAMESIZE_VARIABLE)
new_size = Fs/50;
- else if (variable_duration >= OPUS_FRAMESIZE_2_5_MS && variable_duration <= OPUS_FRAMESIZE_60_MS)
- new_size = IMIN(3*Fs/50, (Fs/400)<<(variable_duration-OPUS_FRAMESIZE_2_5_MS));
+ else if (variable_duration >= OPUS_FRAMESIZE_2_5_MS && variable_duration <= OPUS_FRAMESIZE_120_MS)
+ {
+ if (variable_duration <= OPUS_FRAMESIZE_40_MS)
+ new_size = (Fs/400)<<(variable_duration-OPUS_FRAMESIZE_2_5_MS);
+ else
+ new_size = (variable_duration-OPUS_FRAMESIZE_2_5_MS-2)*Fs/50;
+ }
else
return -1;
if (new_size>frame_size)
return -1;
- if (400*new_size!=Fs && 200*new_size!=Fs && 100*new_size!=Fs &&
- 50*new_size!=Fs && 25*new_size!=Fs && 50*new_size!=3*Fs)
+ if (400*new_size!=Fs && 200*new_size!=Fs && 100*new_size!=Fs &&
+ 50*new_size!=Fs && 25*new_size!=Fs && 50*new_size!=3*Fs &&
+ 50*new_size!=4*Fs && 50*new_size!=5*Fs && 50*new_size!=6*Fs)
return -1;
return new_size;
}
@@ -1212,6 +1219,7 @@
VARDECL(unsigned char, tmp_data);
int bak_mode, bak_bandwidth, bak_channels, bak_to_mono;
VARDECL(OpusRepacketizer, rp);
+ int max_header_bytes;
opus_int32 bytes_per_frame;
opus_int32 cbr_bytes;
opus_int32 repacketize_len;
@@ -1218,9 +1226,20 @@
int tmp_len;
ALLOC_STACK;
- bytes_per_frame = IMIN(1276, (out_data_bytes-3)/nb_frames);
- ALLOC(tmp_data, nb_frames*bytes_per_frame, unsigned char);
+ /* Worst cases:
+ * 2 frames: Code 2 with different compressed sizes
+ * >2 frames: Code 3 VBR */
+ max_header_bytes = nb_frames == 2 ? 3 : (2+(nb_frames-1)*2);
+ if (st->use_vbr || st->user_bitrate_bps==OPUS_BITRATE_MAX)
+ repacketize_len = out_data_bytes;
+ else {
+ cbr_bytes = 3*st->bitrate_bps/(3*8*st->Fs/(frame_size*nb_frames));
+ repacketize_len = IMIN(cbr_bytes, out_data_bytes);
+ }
+ bytes_per_frame = IMIN(1276, 1+(repacketize_len-max_header_bytes)/nb_frames);
+
+ ALLOC(tmp_data, nb_frames*bytes_per_frame, unsigned char);
ALLOC(rp, 1, OpusRepacketizer);
opus_repacketizer_init(rp);
@@ -1231,8 +1250,8 @@
st->user_forced_mode = st->mode;
st->user_bandwidth = st->bandwidth;
st->force_channels = st->stream_channels;
- bak_to_mono = st->silk_mode.toMono;
+ bak_to_mono = st->silk_mode.toMono;
if (bak_to_mono)
st->force_channels = 1;
else
@@ -1241,6 +1260,7 @@
for (i=0;i<nb_frames;i++)
{
st->silk_mode.toMono = 0;
+ st->nonfinal_frame = i<(nb_frames-1);
/* When switching from SILK/Hybrid to CELT, only ask for a switch at the last frame */
if (to_celt && i==nb_frames-1)
@@ -1265,14 +1285,7 @@
}
}
- if (st->use_vbr)
- repacketize_len = out_data_bytes;
- else {
- /* Multiply by 3 to avoid inexact division */
- cbr_bytes = 3*st->bitrate_bps/(3*8*st->Fs/(frame_size*nb_frames));
- repacketize_len = IMIN(cbr_bytes, out_data_bytes);
- }
-
+ /* If encoding multiframes recursively, the true number of frames is rp->nb_frames. */
ret = opus_repacketizer_out_range_impl(rp, 0, nb_frames, data, repacketize_len, 0, !st->use_vbr);
if (ret<0)
@@ -1338,7 +1351,8 @@
st->rangeFinal = 0;
if ((!st->variable_duration && 400*frame_size != st->Fs && 200*frame_size != st->Fs && 100*frame_size != st->Fs &&
- 50*frame_size != st->Fs && 25*frame_size != st->Fs && 50*frame_size != 3*st->Fs)
+ 50*frame_size != st->Fs && 25*frame_size != st->Fs && 50*frame_size != 3*st->Fs && 50*frame_size != 4*st->Fs &&
+ 50*frame_size != 5*st->Fs && 50*frame_size != 6*st->Fs)
|| (400*frame_size < st->Fs)
|| max_data_bytes<=0
)
@@ -1426,10 +1440,10 @@
{
int cbrBytes;
- /* Multiply by 3 to make sure the division is exact. */
+ /* Multiply by 6 to make sure the division is exact. */
- int frame_rate3 = 3*st->Fs/frame_size;
+ int frame_rate6 = 6*st->Fs/frame_size;
/* We need to make sure that "int" values always fit in 16 bits. */
- cbrBytes = IMIN( (3*st->bitrate_bps/8 + frame_rate3/2)/frame_rate3, max_data_bytes);
- st->bitrate_bps = cbrBytes*(opus_int32)frame_rate3*8/3;
+ cbrBytes = IMIN( (6*st->bitrate_bps/8 + frame_rate6/2)/frame_rate6, max_data_bytes);
+ st->bitrate_bps = cbrBytes*(opus_int32)frame_rate6*8/6;
/* Make sure we provide at least one byte to avoid failing. */
max_data_bytes = IMAX(1, cbrBytes);
}
@@ -1571,6 +1585,10 @@
if (st->silk_mode.useDTX && voice_est > 100)
st->mode = MODE_SILK_ONLY;
#endif
+
+ /* If max_data_bytes represents less than 6 kb/s, switch to CELT-only mode */
+ if (max_data_bytes < (frame_rate > 50 ? 9000 : 6000)*frame_size / (st->Fs * 8))
+ st->mode = MODE_CELT_ONLY;
} else {
st->mode = st->user_forced_mode;
}
@@ -1580,20 +1598,7 @@
st->mode = MODE_CELT_ONLY;
if (st->lfe)
st->mode = MODE_CELT_ONLY;
- /* If max_data_bytes represents less than 6 kb/s, switch to CELT-only mode */
- if (max_data_bytes < (frame_rate > 50 ? 9000 : 6000)*frame_size / (st->Fs * 8))
- st->mode = MODE_CELT_ONLY;
- if (st->stream_channels == 1 && st->prev_channels ==2 && st->silk_mode.toMono==0
- && st->mode != MODE_CELT_ONLY && st->prev_mode != MODE_CELT_ONLY)
- {
- /* Delay stereo->mono transition by two frames so that SILK can do a smooth downmix */
- st->silk_mode.toMono = 1;
- st->stream_channels = 2;
- } else {
- st->silk_mode.toMono = 0;
- }
-
if (st->prev_mode > 0 &&
((st->mode != MODE_CELT_ONLY && st->prev_mode == MODE_CELT_ONLY) ||
(st->mode == MODE_CELT_ONLY && st->prev_mode != MODE_CELT_ONLY)))
@@ -1613,6 +1618,18 @@
}
}
+ /* When encoding multiframes, we can ask for a switch to CELT only in the last frame. This switch
+ * is processed above as the requested mode shouldn't interrupt stereo->mono transition. */
+ if (st->stream_channels == 1 && st->prev_channels ==2 && st->silk_mode.toMono==0
+ && st->mode != MODE_CELT_ONLY && st->prev_mode != MODE_CELT_ONLY)
+ {
+ /* Delay stereo->mono transition by two frames so that SILK can do a smooth downmix */
+ st->silk_mode.toMono = 1;
+ st->stream_channels = 2;
+ } else {
+ st->silk_mode.toMono = 0;
+ }
+
/* Update equivalent rate with mode decision. */
equiv_rate = compute_equiv_rate(st->bitrate_bps, st->stream_channels, st->Fs/frame_size,
st->use_vbr, st->mode, st->silk_mode.complexity, st->silk_mode.packetLossPercentage);
@@ -1740,16 +1757,35 @@
if (st->lfe)
st->bandwidth = OPUS_BANDWIDTH_NARROWBAND;
- /* Can't support higher than wideband for >20 ms frames */
- if (frame_size > st->Fs/50 && (st->mode == MODE_CELT_ONLY || st->bandwidth > OPUS_BANDWIDTH_WIDEBAND))
+ curr_bandwidth = st->bandwidth;
+
+ /* Chooses the appropriate mode for speech
+ *NEVER* switch to/from CELT-only mode here as this will invalidate some assumptions */
+ if (st->mode == MODE_SILK_ONLY && curr_bandwidth > OPUS_BANDWIDTH_WIDEBAND)
+ st->mode = MODE_HYBRID;
+ if (st->mode == MODE_HYBRID && curr_bandwidth <= OPUS_BANDWIDTH_WIDEBAND)
+ st->mode = MODE_SILK_ONLY;
+
+ /* Can't support frames longer than 60 ms, or longer than 20 ms when in Hybrid or CELT-only modes */
+ if ((frame_size > st->Fs/50 && (st->mode != MODE_SILK_ONLY)) || frame_size > 3*st->Fs/50)
{
int enc_frame_size;
int nb_frames;
- /* CELT can only support up to 20 ms */
- enc_frame_size = st->Fs/50;
- nb_frames = frame_size > st->Fs/25 ? 3 : 2;
+ if (st->mode == MODE_SILK_ONLY) /* pick the sub-frame size for SILK-only packets */
+ {
+ if (frame_size == 2*st->Fs/25) /* 80 ms -> 2x 40 ms */
+ enc_frame_size = st->Fs/25;
+ else if (frame_size == 3*st->Fs/25) /* 120 ms -> 2x 60 ms */
+ enc_frame_size = 3*st->Fs/50;
+ else /* 100 ms -> 5x 20 ms */
+ enc_frame_size = st->Fs/50;
+ }
+ else /* Hybrid and CELT-only are always split into 20 ms sub-frames */
+ enc_frame_size = st->Fs/50;
+ nb_frames = frame_size/enc_frame_size;
+
#ifndef DISABLE_FLOAT_API
if (analysis_read_pos_bak!= -1)
{
@@ -1764,14 +1800,7 @@
RESTORE_STACK;
return ret;
}
- curr_bandwidth = st->bandwidth;
- /* Chooses the appropriate mode for speech
- *NEVER* switch to/from CELT-only mode here as this will invalidate some assumptions */
- if (st->mode == MODE_SILK_ONLY && curr_bandwidth > OPUS_BANDWIDTH_WIDEBAND)
- st->mode = MODE_HYBRID;
- if (st->mode == MODE_HYBRID && curr_bandwidth <= OPUS_BANDWIDTH_WIDEBAND)
- st->mode = MODE_SILK_ONLY;
/* If we decided to go with CELT, make sure redundancy is off, no matter what
we decided earlier. */
if (st->mode == MODE_CELT_ONLY)
@@ -2017,7 +2046,7 @@
silk_assert( st->silk_mode.internalSampleRate == 16000 );
}
- st->silk_mode.opusCanSwitch = st->silk_mode.switchReady;
+ st->silk_mode.opusCanSwitch = st->silk_mode.switchReady && !st->nonfinal_frame;
/* FIXME: How do we allocate the redundancy for CBR? */
if (st->silk_mode.opusCanSwitch)
{
@@ -2801,10 +2830,12 @@
case OPUS_SET_EXPERT_FRAME_DURATION_REQUEST:
{
opus_int32 value = va_arg(ap, opus_int32);
- if (value != OPUS_FRAMESIZE_ARG && value != OPUS_FRAMESIZE_2_5_MS &&
- value != OPUS_FRAMESIZE_5_MS && value != OPUS_FRAMESIZE_10_MS &&
- value != OPUS_FRAMESIZE_20_MS && value != OPUS_FRAMESIZE_40_MS &&
- value != OPUS_FRAMESIZE_60_MS && value != OPUS_FRAMESIZE_VARIABLE)
+ if (value != OPUS_FRAMESIZE_ARG && value != OPUS_FRAMESIZE_2_5_MS &&
+ value != OPUS_FRAMESIZE_5_MS && value != OPUS_FRAMESIZE_10_MS &&
+ value != OPUS_FRAMESIZE_20_MS && value != OPUS_FRAMESIZE_40_MS &&
+ value != OPUS_FRAMESIZE_60_MS && value != OPUS_FRAMESIZE_80_MS &&
+ value != OPUS_FRAMESIZE_100_MS && value != OPUS_FRAMESIZE_120_MS &&
+ value != OPUS_FRAMESIZE_VARIABLE)
{
goto bad_arg;
}
--- a/src/opus_multistream_encoder.c
+++ b/src/opus_multistream_encoder.c
@@ -835,8 +835,8 @@
return rate_sum;
}
-/* Max size in case the encoder decides to return three frames */
-#define MS_FRAME_TMP (3*1275+7)
+/* Max size in case the encoder decides to return six frames (6 x 20 ms = 120 ms) */
+#define MS_FRAME_TMP (6*1275+12)
static int opus_multistream_encode_native
(
OpusMSEncoder *st,
@@ -903,9 +903,11 @@
}
/* Validate frame_size before using it to allocate stack space.
This mirrors the checks in opus_encode[_float](). */
- if (400*frame_size != Fs && 200*frame_size != Fs &&
- 100*frame_size != Fs && 50*frame_size != Fs &&
- 25*frame_size != Fs && 50*frame_size != 3*Fs)
+ if (400*frame_size != Fs && 200*frame_size != Fs &&
+ 100*frame_size != Fs && 50*frame_size != Fs &&
+ 25*frame_size != Fs && 50*frame_size != 3*Fs &&
+ 50*frame_size != 4*Fs && 50*frame_size != 5*Fs &&
+ 50*frame_size != 6*Fs)
{
RESTORE_STACK;
return OPUS_BAD_ARG;
--- a/tests/test_opus_api.c
+++ b/tests/test_opus_api.c
@@ -1383,6 +1383,15 @@
err=opus_encoder_ctl(enc,OPUS_SET_EXPERT_FRAME_DURATION(OPUS_FRAMESIZE_60_MS));
if(err!=OPUS_OK)test_failed();
cfgs++;
+ err=opus_encoder_ctl(enc,OPUS_SET_EXPERT_FRAME_DURATION(OPUS_FRAMESIZE_80_MS));
+ if(err!=OPUS_OK)test_failed();
+ cfgs++;
+ err=opus_encoder_ctl(enc,OPUS_SET_EXPERT_FRAME_DURATION(OPUS_FRAMESIZE_100_MS));
+ if(err!=OPUS_OK)test_failed();
+ cfgs++;
+ err=opus_encoder_ctl(enc,OPUS_SET_EXPERT_FRAME_DURATION(OPUS_FRAMESIZE_120_MS));
+ if(err!=OPUS_OK)test_failed();
+ cfgs++;
CHECK_SETGET(OPUS_SET_EXPERT_FRAME_DURATION(i),OPUS_GET_EXPERT_FRAME_DURATION(&i),0,-1,
OPUS_FRAMESIZE_60_MS,OPUS_FRAMESIZE_ARG,
" OPUS_SET_EXPERT_FRAME_DURATION ............... OK.\n",
--- a/tests/test_opus_encode.c
+++ b/tests/test_opus_encode.c
@@ -128,6 +128,12 @@
frame_size_enum = OPUS_FRAMESIZE_40_MS;
else if(frame_size==3*sampling_rate/50)
frame_size_enum = OPUS_FRAMESIZE_60_MS;
+ else if(frame_size==4*sampling_rate/50)
+ frame_size_enum = OPUS_FRAMESIZE_80_MS;
+ else if(frame_size==5*sampling_rate/50)
+ frame_size_enum = OPUS_FRAMESIZE_100_MS;
+ else if(frame_size==6*sampling_rate/50)
+ frame_size_enum = OPUS_FRAMESIZE_120_MS;
else
test_failed();
@@ -189,7 +195,9 @@
int use_vbr[3] = {0, 1, 1};
int vbr_constraints[3] = {0, 1, 1};
int complexities[11] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
- int max_bandwidths[6] = {OPUS_BANDWIDTH_NARROWBAND, OPUS_BANDWIDTH_MEDIUMBAND, OPUS_BANDWIDTH_WIDEBAND, OPUS_BANDWIDTH_SUPERWIDEBAND, OPUS_BANDWIDTH_FULLBAND, OPUS_BANDWIDTH_FULLBAND};
+ int max_bandwidths[6] = {OPUS_BANDWIDTH_NARROWBAND, OPUS_BANDWIDTH_MEDIUMBAND,
+ OPUS_BANDWIDTH_WIDEBAND, OPUS_BANDWIDTH_SUPERWIDEBAND,
+ OPUS_BANDWIDTH_FULLBAND, OPUS_BANDWIDTH_FULLBAND};
int signals[4] = {OPUS_AUTO, OPUS_AUTO, OPUS_SIGNAL_VOICE, OPUS_SIGNAL_MUSIC};
int inband_fecs[3] = {0, 0, 1};
int packet_loss_perc[4] = {0, 1, 2, 5};
@@ -196,7 +204,7 @@
int lsb_depths[2] = {8, 24};
int prediction_disabled[3] = {0, 0, 1};
int use_dtx[2] = {0, 1};
- int frame_sizes_ms_x2[6] = {5, 10, 20, 40, 80, 120}; /* x2 to avoid 2.5 ms */
+ int frame_sizes_ms_x2[9] = {5, 10, 20, 40, 80, 120, 160, 200, 240}; /* x2 to avoid 2.5 ms */
char debug_info[512];
for (i=0; i<num_encoders; i++) {
@@ -227,6 +235,12 @@
int frame_size = frame_size_ms_x2*sampling_rate/2000;
int frame_size_enum = get_frame_size_enum(frame_size, sampling_rate);
force_channel = IMIN(force_channel, num_channels);
+
+ /* Todo: remove when a fix is available for coding SILK in DTX mode for >60 ms.
+ * Currently, SILK may internally adjust the bandwidth leading to mismatching
+ * bandwidths within a packet. */
+ if (frame_size_ms_x2 > 120)
+ dtx = 0;
sprintf(debug_info,
"fuzz_encoder_settings: %d kHz, %d ch, application: %d, "