/*
 *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "modules/audio_coding/codecs/opus/opus_interface.h"

#include <cstdlib>

#include <numeric>

#include "api/array_view.h"
#include "rtc_base/checks.h"
#include "system_wrappers/include/field_trial.h"

enum {
#if WEBRTC_OPUS_SUPPORT_120MS_PTIME
  /* Maximum supported frame size in WebRTC is 120 ms. */
  kWebRtcOpusMaxEncodeFrameSizeMs = 120,
#else
  /* Maximum supported frame size in WebRTC is 60 ms. */
  kWebRtcOpusMaxEncodeFrameSizeMs = 60,
#endif

  /* The format allows up to 120 ms frames. Since we don't control the other
   * side, we must allow for packets of that size. NetEq is currently limited
   * to 60 ms on the receive side. */
  kWebRtcOpusMaxDecodeFrameSizeMs = 120,

  // Duration of audio that each call to packet loss concealment covers.
  kWebRtcOpusPlcFrameSizeMs = 10,
};

constexpr char kPlcUsePrevDecodedSamplesFieldTrial[] =
    "WebRTC-Audio-OpusPlcUsePrevDecodedSamples";

constexpr char kAvoidNoisePumpingDuringDtxFieldTrial[] =
    "WebRTC-Audio-OpusAvoidNoisePumpingDuringDtx";

static int FrameSizePerChannel(int frame_size_ms, int sample_rate_hz) {
  RTC_DCHECK_GT(frame_size_ms, 0);
  RTC_DCHECK_EQ(frame_size_ms % 10, 0);
  RTC_DCHECK_GT(sample_rate_hz, 0);
  RTC_DCHECK_EQ(sample_rate_hz % 1000, 0);
  return frame_size_ms * (sample_rate_hz / 1000);
}

// Maximum sample count per channel.
static int MaxFrameSizePerChannel(int sample_rate_hz) {
  return FrameSizePerChannel(kWebRtcOpusMaxDecodeFrameSizeMs, sample_rate_hz);
}

// Default sample count per channel.
static int DefaultFrameSizePerChannel(int sample_rate_hz) {
  return FrameSizePerChannel(20, sample_rate_hz);
}
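
// Worked example (illustrative only, not part of the original source): at a
// 48 kHz clock rate the helpers above evaluate to
//   FrameSizePerChannel(20, 48000)    -> 20 * (48000 / 1000) = 960 samples
//   MaxFrameSizePerChannel(48000)     -> 120 * 48            = 5760 samples
//   DefaultFrameSizePerChannel(48000) -> 960 samples (one 20 ms frame)
// so a stereo 20 ms frame at 48 kHz holds 2 * 960 = 1920 interleaved samples.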

// Returns true if the `encoded` payload corresponds to a refresh DTX packet
// whose energy is larger than expected for non-activity packets.
static bool WebRtcOpus_IsHighEnergyRefreshDtxPacket(
    OpusEncInst* inst,
    rtc::ArrayView<const int16_t> frame,
    rtc::ArrayView<const uint8_t> encoded) {
  if (encoded.size() <= 2) {
    return false;
  }
  int number_frames =
      frame.size() / DefaultFrameSizePerChannel(inst->sample_rate_hz);
  if (number_frames > 0 &&
      WebRtcOpus_PacketHasVoiceActivity(encoded.data(), encoded.size()) == 0) {
    const float average_frame_energy =
        std::accumulate(frame.begin(), frame.end(), 0.0f,
                        [](float a, int32_t b) { return a + b * b; }) /
        number_frames;
    if (WebRtcOpus_GetInDtx(inst) == 1 &&
        average_frame_energy >= inst->smooth_energy_non_active_frames * 0.5f) {
      // This is a refresh DTX packet, since the encoder is in DTX and has
      // produced a payload > 2 bytes. Its energy is higher than the smoothed
      // energy of non-activity frames (with a 3 dB negative margin), so it is
      // flagged as a high energy refresh DTX packet.
      return true;
    }
    // The average energy is tracked in a similar way as the modeling of the
    // comfort noise in the Silk decoder in Opus
    // (third_party/opus/src/silk/CNG.c).
    if (average_frame_energy < inst->smooth_energy_non_active_frames * 0.5f) {
      inst->smooth_energy_non_active_frames = average_frame_energy;
    } else {
      inst->smooth_energy_non_active_frames +=
          (average_frame_energy - inst->smooth_energy_non_active_frames) *
          0.25f;
    }
  }
  return false;
}
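
// Summary of the energy tracking above, restated as standalone arithmetic
// (derived directly from the code, included for clarity):
//   smooth += (avg - smooth) * 0.25f   when avg >= smooth * 0.5f
//   smooth  = avg                      when avg <  smooth * 0.5f
// and a refresh DTX payload (> 2 bytes, no voice activity) is flagged as
// "high energy" when avg >= smooth * 0.5f, i.e. when it is no more than
// 3 dB below the tracked non-activity energy level.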

int16_t WebRtcOpus_EncoderCreate(OpusEncInst** inst,
                                 size_t channels,
                                 int32_t application,
                                 int sample_rate_hz) {
  int opus_app;
  if (!inst)
    return -1;

  switch (application) {
    case 0:
      opus_app = OPUS_APPLICATION_VOIP;
      break;
    case 1:
      opus_app = OPUS_APPLICATION_AUDIO;
      break;
    default:
      return -1;
  }

  OpusEncInst* state =
      reinterpret_cast<OpusEncInst*>(calloc(1, sizeof(OpusEncInst)));
  RTC_DCHECK(state);

  int error;
  state->encoder = opus_encoder_create(
      sample_rate_hz, static_cast<int>(channels), opus_app, &error);

  if (error != OPUS_OK || (!state->encoder && !state->multistream_encoder)) {
    WebRtcOpus_EncoderFree(state);
    return -1;
  }

  state->in_dtx_mode = 0;
  state->channels = channels;
  state->sample_rate_hz = sample_rate_hz;
  state->smooth_energy_non_active_frames = 0.0f;
  state->avoid_noise_pumping_during_dtx =
      webrtc::field_trial::IsEnabled(kAvoidNoisePumpingDuringDtxFieldTrial);

  *inst = state;
  return 0;
}
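
// Example usage (illustrative sketch only, not part of the original file):
// creating a mono VoIP-mode encoder at 48 kHz and configuring it through the
// setters defined below. Error handling is reduced to a single check.
//
//   OpusEncInst* enc = nullptr;
//   if (WebRtcOpus_EncoderCreate(&enc, /*channels=*/1, /*application=*/0,
//                                /*sample_rate_hz=*/48000) == 0) {
//     WebRtcOpus_SetBitRate(enc, 32000);
//     WebRtcOpus_EnableFec(enc);
//     WebRtcOpus_EnableDtx(enc);
//     // ... encode frames ...
//     WebRtcOpus_EncoderFree(enc);
//   }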

int16_t WebRtcOpus_MultistreamEncoderCreate(
    OpusEncInst** inst,
    size_t channels,
    int32_t application,
    size_t streams,
    size_t coupled_streams,
    const unsigned char* channel_mapping) {
  int opus_app;
  if (!inst)
    return -1;

  switch (application) {
    case 0:
      opus_app = OPUS_APPLICATION_VOIP;
      break;
    case 1:
      opus_app = OPUS_APPLICATION_AUDIO;
      break;
    default:
      return -1;
  }

  OpusEncInst* state =
      reinterpret_cast<OpusEncInst*>(calloc(1, sizeof(OpusEncInst)));
  RTC_DCHECK(state);

  int error;
  const int sample_rate_hz = 48000;
  state->multistream_encoder = opus_multistream_encoder_create(
      sample_rate_hz, channels, streams, coupled_streams, channel_mapping,
      opus_app, &error);

  if (error != OPUS_OK || (!state->encoder && !state->multistream_encoder)) {
    WebRtcOpus_EncoderFree(state);
    return -1;
  }

  state->in_dtx_mode = 0;
  state->channels = channels;
  state->sample_rate_hz = sample_rate_hz;
  state->smooth_energy_non_active_frames = 0.0f;
  state->avoid_noise_pumping_during_dtx = false;

  *inst = state;
  return 0;
}

int16_t WebRtcOpus_EncoderFree(OpusEncInst* inst) {
  if (inst) {
    if (inst->encoder) {
      opus_encoder_destroy(inst->encoder);
    } else {
      opus_multistream_encoder_destroy(inst->multistream_encoder);
    }
    free(inst);
    return 0;
  } else {
    return -1;
  }
}

int WebRtcOpus_Encode(OpusEncInst* inst,
                      const int16_t* audio_in,
                      size_t samples,
                      size_t length_encoded_buffer,
                      uint8_t* encoded) {
  int res;

  if (samples > 48 * kWebRtcOpusMaxEncodeFrameSizeMs) {
    return -1;
  }

  if (inst->encoder) {
    res = opus_encode(inst->encoder, (const opus_int16*)audio_in,
                      static_cast<int>(samples), encoded,
                      static_cast<opus_int32>(length_encoded_buffer));
  } else {
    res = opus_multistream_encode(
        inst->multistream_encoder, (const opus_int16*)audio_in,
        static_cast<int>(samples), encoded,
        static_cast<opus_int32>(length_encoded_buffer));
  }

  if (res <= 0) {
    return -1;
  }

  if (res <= 2) {
    // A payload of at most 2 bytes indicates DTX, since the packet contains
    // nothing but a header. In principle, there is no need to send this
    // packet. However, we do transmit the first occurrence to let the decoder
    // know that the encoder has entered DTX mode.
    if (inst->in_dtx_mode) {
      return 0;
    } else {
      inst->in_dtx_mode = 1;
      return res;
    }
  }

  if (inst->avoid_noise_pumping_during_dtx && WebRtcOpus_GetUseDtx(inst) == 1 &&
      WebRtcOpus_IsHighEnergyRefreshDtxPacket(
          inst, rtc::MakeArrayView(audio_in, samples),
          rtc::MakeArrayView(encoded, res))) {
    // This packet is a high energy refresh DTX packet. To avoid an increase
    // of the energy in the DTX region at the decoder, this packet is
    // substituted by a TOC byte with one empty frame.
    // The number of frames described in the TOC byte
    // (https://tools.ietf.org/html/rfc6716#section-3.1) is overwritten to
    // always indicate one frame (last two bits equal to 0).
    encoded[0] = encoded[0] & 0b11111100;
    inst->in_dtx_mode = 1;
    // The payload is just the TOC byte and has a length of 1 byte.
    return 1;
  }
  inst->in_dtx_mode = 0;
  return res;
}
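
// Example usage (illustrative sketch only, not part of the original file):
// encoding one 20 ms mono frame at 48 kHz and interpreting the return value.
// The 1500-byte payload buffer is an assumption for the example.
//
//   int16_t pcm[960];  // 20 ms of mono audio at 48 kHz.
//   uint8_t payload[1500];
//   int bytes = WebRtcOpus_Encode(enc, pcm, 960, sizeof(payload), payload);
//   if (bytes < 0) {
//     // Encoding failed.
//   } else if (bytes == 0) {
//     // DTX: nothing to transmit for this frame.
//   } else {
//     // `bytes` of payload to packetize; 1-2 bytes is a DTX indication.
//   }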

#define ENCODER_CTL(inst, vargs)                \
  (inst->encoder                                \
       ? opus_encoder_ctl(inst->encoder, vargs) \
       : opus_multistream_encoder_ctl(inst->multistream_encoder, vargs))

int16_t WebRtcOpus_SetBitRate(OpusEncInst* inst, int32_t rate) {
  if (inst) {
    return ENCODER_CTL(inst, OPUS_SET_BITRATE(rate));
  } else {
    return -1;
  }
}

int16_t WebRtcOpus_SetPacketLossRate(OpusEncInst* inst, int32_t loss_rate) {
  if (inst) {
    return ENCODER_CTL(inst, OPUS_SET_PACKET_LOSS_PERC(loss_rate));
  } else {
    return -1;
  }
}

int16_t WebRtcOpus_SetMaxPlaybackRate(OpusEncInst* inst, int32_t frequency_hz) {
  opus_int32 set_bandwidth;

  if (!inst)
    return -1;

  if (frequency_hz <= 8000) {
    set_bandwidth = OPUS_BANDWIDTH_NARROWBAND;
  } else if (frequency_hz <= 12000) {
    set_bandwidth = OPUS_BANDWIDTH_MEDIUMBAND;
  } else if (frequency_hz <= 16000) {
    set_bandwidth = OPUS_BANDWIDTH_WIDEBAND;
  } else if (frequency_hz <= 24000) {
    set_bandwidth = OPUS_BANDWIDTH_SUPERWIDEBAND;
  } else {
    set_bandwidth = OPUS_BANDWIDTH_FULLBAND;
  }
  return ENCODER_CTL(inst, OPUS_SET_MAX_BANDWIDTH(set_bandwidth));
}

int16_t WebRtcOpus_GetMaxPlaybackRate(OpusEncInst* const inst,
                                      int32_t* result_hz) {
  if (inst->encoder) {
    if (opus_encoder_ctl(inst->encoder, OPUS_GET_MAX_BANDWIDTH(result_hz)) ==
        OPUS_OK) {
      return 0;
    }
    return -1;
  }

  opus_int32 max_bandwidth;
  int s;
  int ret;

  max_bandwidth = 0;
  ret = OPUS_OK;
  s = 0;
  while (ret == OPUS_OK) {
    OpusEncoder* enc;
    opus_int32 bandwidth;

    ret = ENCODER_CTL(inst, OPUS_MULTISTREAM_GET_ENCODER_STATE(s, &enc));
    if (ret == OPUS_BAD_ARG)
      break;
    if (ret != OPUS_OK)
      return -1;
    if (opus_encoder_ctl(enc, OPUS_GET_MAX_BANDWIDTH(&bandwidth)) != OPUS_OK)
      return -1;

    if (max_bandwidth != 0 && max_bandwidth != bandwidth)
      return -1;

    max_bandwidth = bandwidth;
    s++;
  }
  *result_hz = max_bandwidth;
  return 0;
}

int16_t WebRtcOpus_EnableFec(OpusEncInst* inst) {
  if (inst) {
    return ENCODER_CTL(inst, OPUS_SET_INBAND_FEC(1));
  } else {
    return -1;
  }
}

int16_t WebRtcOpus_DisableFec(OpusEncInst* inst) {
  if (inst) {
    return ENCODER_CTL(inst, OPUS_SET_INBAND_FEC(0));
  } else {
    return -1;
  }
}

int16_t WebRtcOpus_EnableDtx(OpusEncInst* inst) {
  if (!inst) {
    return -1;
  }

  // Prevent Opus from entering CELT-only mode by forcing the signal type to
  // voice, to make sure that DTX behaves correctly. Currently, DTX does not
  // last long during pure silence if the signal type is not forced.
  // TODO(minyue): Remove the signal type forcing when Opus DTX works properly
  // without it.
  int ret = ENCODER_CTL(inst, OPUS_SET_SIGNAL(OPUS_SIGNAL_VOICE));
  if (ret != OPUS_OK)
    return ret;

  return ENCODER_CTL(inst, OPUS_SET_DTX(1));
}

int16_t WebRtcOpus_DisableDtx(OpusEncInst* inst) {
  if (inst) {
    int ret = ENCODER_CTL(inst, OPUS_SET_SIGNAL(OPUS_AUTO));
    if (ret != OPUS_OK)
      return ret;
    return ENCODER_CTL(inst, OPUS_SET_DTX(0));
  } else {
    return -1;
  }
}

int16_t WebRtcOpus_GetUseDtx(OpusEncInst* inst) {
  if (inst) {
    opus_int32 use_dtx;
    if (ENCODER_CTL(inst, OPUS_GET_DTX(&use_dtx)) == 0) {
      return use_dtx;
    }
  }
  return -1;
}

int16_t WebRtcOpus_EnableCbr(OpusEncInst* inst) {
  if (inst) {
    return ENCODER_CTL(inst, OPUS_SET_VBR(0));
  } else {
    return -1;
  }
}

int16_t WebRtcOpus_DisableCbr(OpusEncInst* inst) {
  if (inst) {
    return ENCODER_CTL(inst, OPUS_SET_VBR(1));
  } else {
    return -1;
  }
}

int16_t WebRtcOpus_SetComplexity(OpusEncInst* inst, int32_t complexity) {
  if (inst) {
    return ENCODER_CTL(inst, OPUS_SET_COMPLEXITY(complexity));
  } else {
    return -1;
  }
}

int32_t WebRtcOpus_GetBandwidth(OpusEncInst* inst) {
  if (!inst) {
    return -1;
  }
  int32_t bandwidth;
  if (ENCODER_CTL(inst, OPUS_GET_BANDWIDTH(&bandwidth)) == 0) {
    return bandwidth;
  } else {
    return -1;
  }
}

int16_t WebRtcOpus_SetBandwidth(OpusEncInst* inst, int32_t bandwidth) {
  if (inst) {
    return ENCODER_CTL(inst, OPUS_SET_BANDWIDTH(bandwidth));
  } else {
    return -1;
  }
}

int16_t WebRtcOpus_SetForceChannels(OpusEncInst* inst, size_t num_channels) {
  if (!inst)
    return -1;
  if (num_channels == 0) {
    return ENCODER_CTL(inst, OPUS_SET_FORCE_CHANNELS(OPUS_AUTO));
  } else if (num_channels == 1 || num_channels == 2) {
    return ENCODER_CTL(inst, OPUS_SET_FORCE_CHANNELS(num_channels));
  } else {
    return -1;
  }
}

int32_t WebRtcOpus_GetInDtx(OpusEncInst* inst) {
  if (!inst) {
    return -1;
  }
#ifdef OPUS_GET_IN_DTX
  int32_t in_dtx;
  if (ENCODER_CTL(inst, OPUS_GET_IN_DTX(&in_dtx)) == 0) {
    return in_dtx;
  }
#endif
  return -1;
}

int16_t WebRtcOpus_DecoderCreate(OpusDecInst** inst,
                                 size_t channels,
                                 int sample_rate_hz) {
  int error;
  OpusDecInst* state;

  if (inst != NULL) {
    // Create Opus decoder state.
    state = reinterpret_cast<OpusDecInst*>(calloc(1, sizeof(OpusDecInst)));
    if (state == NULL) {
      return -1;
    }

    state->decoder =
        opus_decoder_create(sample_rate_hz, static_cast<int>(channels), &error);
    if (error == OPUS_OK && state->decoder) {
      // Creation of the decoder memory succeeded.
      state->channels = channels;
      state->sample_rate_hz = sample_rate_hz;
      state->plc_use_prev_decoded_samples =
          webrtc::field_trial::IsEnabled(kPlcUsePrevDecodedSamplesFieldTrial);
      if (state->plc_use_prev_decoded_samples) {
        state->prev_decoded_samples =
            DefaultFrameSizePerChannel(state->sample_rate_hz);
      }
      state->in_dtx_mode = 0;
      *inst = state;
      return 0;
    }

    // If memory allocation was unsuccessful, free the entire state.
    if (state->decoder) {
      opus_decoder_destroy(state->decoder);
    }
    free(state);
  }
  return -1;
}
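
// Example usage (illustrative sketch only, not part of the original file):
// creating a stereo decoder at 48 kHz and releasing it.
//
//   OpusDecInst* dec = nullptr;
//   if (WebRtcOpus_DecoderCreate(&dec, /*channels=*/2,
//                                /*sample_rate_hz=*/48000) == 0) {
//     // ... decode packets ...
//     WebRtcOpus_DecoderFree(dec);
//   }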

int16_t WebRtcOpus_MultistreamDecoderCreate(
    OpusDecInst** inst,
    size_t channels,
    size_t streams,
    size_t coupled_streams,
    const unsigned char* channel_mapping) {
  int error;
  OpusDecInst* state;

  if (inst != NULL) {
    // Create Opus decoder state.
    state = reinterpret_cast<OpusDecInst*>(calloc(1, sizeof(OpusDecInst)));
    if (state == NULL) {
      return -1;
    }

    // Create new memory, always at 48000 Hz.
    state->multistream_decoder = opus_multistream_decoder_create(
        48000, channels, streams, coupled_streams, channel_mapping, &error);

    if (error == OPUS_OK && state->multistream_decoder) {
      // Creation of the decoder memory succeeded.
      state->channels = channels;
      state->sample_rate_hz = 48000;
      state->plc_use_prev_decoded_samples =
          webrtc::field_trial::IsEnabled(kPlcUsePrevDecodedSamplesFieldTrial);
      if (state->plc_use_prev_decoded_samples) {
        state->prev_decoded_samples =
            DefaultFrameSizePerChannel(state->sample_rate_hz);
      }
      state->in_dtx_mode = 0;
      *inst = state;
      return 0;
    }

    // If memory allocation was unsuccessful, free the entire state.
    opus_multistream_decoder_destroy(state->multistream_decoder);
    free(state);
  }
  return -1;
}

int16_t WebRtcOpus_DecoderFree(OpusDecInst* inst) {
  if (inst) {
    if (inst->decoder) {
      opus_decoder_destroy(inst->decoder);
    } else if (inst->multistream_decoder) {
      opus_multistream_decoder_destroy(inst->multistream_decoder);
    }
    free(inst);
    return 0;
  } else {
    return -1;
  }
}

size_t WebRtcOpus_DecoderChannels(OpusDecInst* inst) {
  return inst->channels;
}

void WebRtcOpus_DecoderInit(OpusDecInst* inst) {
  if (inst->decoder) {
    opus_decoder_ctl(inst->decoder, OPUS_RESET_STATE);
  } else {
    opus_multistream_decoder_ctl(inst->multistream_decoder, OPUS_RESET_STATE);
  }
  inst->in_dtx_mode = 0;
}

/* For the decoder to determine whether to output speech or comfort noise. */
static int16_t DetermineAudioType(OpusDecInst* inst, size_t encoded_bytes) {
  // The audio type becomes comfort noise when `encoded_bytes` is 1 or 2, and
  // remains so as long as the following payloads are 0, 1 or 2 bytes long.
  if (encoded_bytes == 0 && inst->in_dtx_mode) {
    return 2;  // Comfort noise.
  } else if (encoded_bytes == 1 || encoded_bytes == 2) {
    // TODO(henrik.lundin): There is a slight risk that a 2-byte payload is in
    // fact a 1-byte TOC with a 1-byte payload. That will be erroneously
    // interpreted as comfort noise output, but such a payload is probably
    // faulty anyway.

    // TODO(webrtc:10218): This is wrong for multistream opus. There, several
    // single-stream packets are glued together with some packet size bytes in
    // between. See https://tools.ietf.org/html/rfc6716#appendix-B
    inst->in_dtx_mode = 1;
    return 2;  // Comfort noise.
  } else {
    inst->in_dtx_mode = 0;
    return 0;  // Speech.
  }
}

/* `frame_size` is set to the maximum Opus frame size in the normal case, and
 * to the number of samples needed for PLC in case of losses.
 * It is up to the caller to make sure the value is correct. */
static int DecodeNative(OpusDecInst* inst,
                        const uint8_t* encoded,
                        size_t encoded_bytes,
                        int frame_size,
                        int16_t* decoded,
                        int16_t* audio_type,
                        int decode_fec) {
  int res = -1;
  if (inst->decoder) {
    res = opus_decode(
        inst->decoder, encoded, static_cast<opus_int32>(encoded_bytes),
        reinterpret_cast<opus_int16*>(decoded), frame_size, decode_fec);
  } else {
    res = opus_multistream_decode(inst->multistream_decoder, encoded,
                                  static_cast<opus_int32>(encoded_bytes),
                                  reinterpret_cast<opus_int16*>(decoded),
                                  frame_size, decode_fec);
  }

  if (res <= 0)
    return -1;

  *audio_type = DetermineAudioType(inst, encoded_bytes);

  return res;
}

static int DecodePlc(OpusDecInst* inst, int16_t* decoded) {
  int16_t audio_type = 0;
  int decoded_samples;
  int plc_samples =
      FrameSizePerChannel(kWebRtcOpusPlcFrameSizeMs, inst->sample_rate_hz);

  if (inst->plc_use_prev_decoded_samples) {
    /* The number of samples we ask for is `number_of_lost_frames` times
     * `prev_decoded_samples_`. Limit the number of samples to maximum
     * `MaxFrameSizePerChannel()`. */
    plc_samples = inst->prev_decoded_samples;
    const int max_samples_per_channel =
        MaxFrameSizePerChannel(inst->sample_rate_hz);
    plc_samples = plc_samples <= max_samples_per_channel
                      ? plc_samples
                      : max_samples_per_channel;
  }
  decoded_samples =
      DecodeNative(inst, NULL, 0, plc_samples, decoded, &audio_type, 0);
  if (decoded_samples < 0) {
    return -1;
  }

  return decoded_samples;
}

int WebRtcOpus_Decode(OpusDecInst* inst,
                      const uint8_t* encoded,
                      size_t encoded_bytes,
                      int16_t* decoded,
                      int16_t* audio_type) {
  int decoded_samples;

  if (encoded_bytes == 0) {
    *audio_type = DetermineAudioType(inst, encoded_bytes);
    decoded_samples = DecodePlc(inst, decoded);
  } else {
    decoded_samples = DecodeNative(inst, encoded, encoded_bytes,
                                   MaxFrameSizePerChannel(inst->sample_rate_hz),
                                   decoded, audio_type, 0);
  }
  if (decoded_samples < 0) {
    return -1;
  }

  if (inst->plc_use_prev_decoded_samples) {
    /* Update decoded sample memory, to be used by the PLC in case of losses. */
    inst->prev_decoded_samples = decoded_samples;
  }

  return decoded_samples;
}
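
// Example usage (illustrative sketch only, not part of the original file):
// decoding one received packet into a buffer large enough for the maximum
// 120 ms frame at 48 kHz stereo (120 * 48 * 2 = 11520 samples). Passing
// `encoded_bytes` of 0 triggers packet loss concealment instead of decoding.
//
//   int16_t pcm[120 * 48 * 2];
//   int16_t audio_type = 0;
//   int samples_per_channel =
//       WebRtcOpus_Decode(dec, payload, payload_bytes, pcm, &audio_type);
//   if (samples_per_channel > 0 && audio_type == 2) {
//     // The decoder produced comfort noise (DTX region).
//   }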

int WebRtcOpus_DecodeFec(OpusDecInst* inst,
                         const uint8_t* encoded,
                         size_t encoded_bytes,
                         int16_t* decoded,
                         int16_t* audio_type) {
  int decoded_samples;
  int fec_samples;

  if (WebRtcOpus_PacketHasFec(encoded, encoded_bytes) != 1) {
    return 0;
  }

  fec_samples =
      opus_packet_get_samples_per_frame(encoded, inst->sample_rate_hz);

  decoded_samples = DecodeNative(inst, encoded, encoded_bytes, fec_samples,
                                 decoded, audio_type, 1);
  if (decoded_samples < 0) {
    return -1;
  }

  return decoded_samples;
}

int WebRtcOpus_DurationEst(OpusDecInst* inst,
                           const uint8_t* payload,
                           size_t payload_length_bytes) {
  if (payload_length_bytes == 0) {
    // WebRtcOpus_Decode calls PLC when the payload length is zero, so we
    // return the corresponding PLC duration.
    return WebRtcOpus_PlcDuration(inst);
  }

  int frames, samples;
  frames = opus_packet_get_nb_frames(
      payload, static_cast<opus_int32>(payload_length_bytes));
  if (frames < 0) {
    /* Invalid payload data. */
    return 0;
  }
  samples =
      frames * opus_packet_get_samples_per_frame(payload, inst->sample_rate_hz);
  if (samples > 120 * inst->sample_rate_hz / 1000) {
    // More than 120 ms' worth of samples.
    return 0;
  }
  return samples;
}
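
// Worked example (illustrative only, not part of the original source): a
// packet whose TOC indicates 20 ms frames and that contains 3 frames gives,
// at 48 kHz,
//   samples = 3 * 960 = 2880   (60 ms of audio per channel)
// which is below the 120 * 48000 / 1000 = 5760 sample cap, so 2880 is
// returned.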

int WebRtcOpus_PlcDuration(OpusDecInst* inst) {
  if (inst->plc_use_prev_decoded_samples) {
    /* The number of samples we ask for is `number_of_lost_frames` times
     * `prev_decoded_samples_`. Limit the number of samples to maximum
     * `MaxFrameSizePerChannel()`. */
    const int plc_samples = inst->prev_decoded_samples;
    const int max_samples_per_channel =
        MaxFrameSizePerChannel(inst->sample_rate_hz);
    return plc_samples <= max_samples_per_channel ? plc_samples
                                                  : max_samples_per_channel;
  }
  return FrameSizePerChannel(kWebRtcOpusPlcFrameSizeMs, inst->sample_rate_hz);
}

int WebRtcOpus_FecDurationEst(const uint8_t* payload,
                              size_t payload_length_bytes,
                              int sample_rate_hz) {
  if (WebRtcOpus_PacketHasFec(payload, payload_length_bytes) != 1) {
    return 0;
  }
  const int samples =
      opus_packet_get_samples_per_frame(payload, sample_rate_hz);
  const int samples_per_ms = sample_rate_hz / 1000;
  if (samples < 10 * samples_per_ms || samples > 120 * samples_per_ms) {
    /* Invalid payload duration. */
    return 0;
  }
  return samples;
}

int WebRtcOpus_NumSilkFrames(const uint8_t* payload) {
  // For computing the payload length in ms, the sample rate is not important
  // since it cancels out. We use 48 kHz, but any valid sample rate would work.
  int payload_length_ms =
      opus_packet_get_samples_per_frame(payload, 48000) / 48;
  if (payload_length_ms < 10)
    payload_length_ms = 10;

  int silk_frames;
  switch (payload_length_ms) {
    case 10:
    case 20:
      silk_frames = 1;
      break;
    case 40:
      silk_frames = 2;
      break;
    case 60:
      silk_frames = 3;
      break;
    default:
      return 0;  // The packet is in fact invalid.
  }
  return silk_frames;
}
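
// Worked example (illustrative only, not part of the original source): for a
// packet whose TOC indicates 40 ms frames,
// opus_packet_get_samples_per_frame(payload, 48000) returns 1920, so
// payload_length_ms = 1920 / 48 = 40 and the function reports 2 SILK frames.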

// This method is based on Definition of the Opus Audio Codec
// (https://tools.ietf.org/html/rfc6716). Basically, this method is based on
// parsing the LP layer of an Opus packet, particularly the LBRR flag.
int WebRtcOpus_PacketHasFec(const uint8_t* payload,
                            size_t payload_length_bytes) {
  if (payload == NULL || payload_length_bytes == 0)
    return 0;

  // In CELT_ONLY mode, packets should not have FEC.
  if (payload[0] & 0x80)
    return 0;

  int silk_frames = WebRtcOpus_NumSilkFrames(payload);
  if (silk_frames == 0)
    return 0;  // Not valid.

  const int channels = opus_packet_get_nb_channels(payload);
  RTC_DCHECK(channels == 1 || channels == 2);

  // The maximum number of frames in an Opus packet is 48.
  opus_int16 frame_sizes[48];
  const unsigned char* frame_data[48];

  // Parse the packet to get the frames. We only care about the first frame,
  // since we can only decode the FEC from the first one.
  if (opus_packet_parse(payload, static_cast<opus_int32>(payload_length_bytes),
                        NULL, frame_data, frame_sizes, NULL) < 0) {
    return 0;
  }

  if (frame_sizes[0] < 1) {
    return 0;
  }

  // A frame starts with the LP layer. The LP layer begins with two to eight
  // header bits. These consist of one VAD bit per SILK frame (up to 3),
  // followed by a single flag indicating the presence of LBRR frames.
  // For a stereo packet, these first flags correspond to the mid channel, and
  // a second set of flags is included for the side channel. Because these are
  // the first symbols decoded by the range coder and because they are coded
  // as binary values with uniform probability, they can be extracted directly
  // from the most significant bits of the first byte of compressed data.
  for (int n = 0; n < channels; n++) {
    // The LBRR bit for channel 1 is the (`silk_frames` + 1)-th header bit,
    // and that of channel 2 is the ((`silk_frames` + 1) * 2)-th header bit.
    if (frame_data[0][0] & (0x80 >> ((n + 1) * (silk_frames + 1) - 1)))
      return 1;
  }

  return 0;
}
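
// Worked example (illustrative only, not part of the original source): for a
// mono 20 ms packet, `silk_frames` is 1, so the header bits of the first
// frame are [VAD, LBRR, ...] and the loop tests 0x80 >> 1 = 0x40, i.e. the
// second most significant bit of the first compressed byte. For a stereo
// 60 ms packet, `silk_frames` is 3 and the two LBRR flags tested are
// 0x80 >> 3 = 0x10 (mid channel) and 0x80 >> 7 = 0x01 (side channel).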

int WebRtcOpus_PacketHasVoiceActivity(const uint8_t* payload,
                                      size_t payload_length_bytes) {
  if (payload == NULL || payload_length_bytes == 0)
    return 0;

  // In CELT_ONLY mode we cannot determine whether there is voice activity.
  if (payload[0] & 0x80)
    return -1;

  int silk_frames = WebRtcOpus_NumSilkFrames(payload);
  if (silk_frames == 0)
    return -1;

  const int channels = opus_packet_get_nb_channels(payload);
  RTC_DCHECK(channels == 1 || channels == 2);

  // The maximum number of frames in an Opus packet is 48.
  opus_int16 frame_sizes[48];
  const unsigned char* frame_data[48];

  // Parse the packet to get the frames.
  int frames =
      opus_packet_parse(payload, static_cast<opus_int32>(payload_length_bytes),
                        NULL, frame_data, frame_sizes, NULL);
  if (frames < 0)
    return -1;

  // Iterate over all Opus frames, which may contain multiple SILK frames.
  for (int frame = 0; frame < frames; frame++) {
    if (frame_sizes[frame] < 1) {
      continue;
    }
    if (frame_data[frame][0] >> (8 - silk_frames))
      return 1;
    if (channels == 2 &&
        (frame_data[frame][0] << (silk_frames + 1)) >> (8 - silk_frames))
      return 1;
  }

  return 0;
}
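
// Worked example (illustrative only, not part of the original source): for a
// mono 20 ms packet, `silk_frames` is 1 and the mid-channel VAD flag is the
// most significant header bit, so `frame_data[frame][0] >> 7` is non-zero
// exactly when voice activity is signalled for that frame. The second check
// is the stereo counterpart, aimed at the side-channel VAD flags that follow
// the mid-channel flags in the header bits.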