• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_MAIN_INTERFACE_AUDIO_PROCESSING_H_
12 #define WEBRTC_MODULES_AUDIO_PROCESSING_MAIN_INTERFACE_AUDIO_PROCESSING_H_
13 
14 #include "typedefs.h"
15 #include "module.h"
16 
17 namespace webrtc {
18 
19 class AudioFrame;
20 class EchoCancellation;
21 class EchoControlMobile;
22 class GainControl;
23 class HighPassFilter;
24 class LevelEstimator;
25 class NoiseSuppression;
26 class VoiceDetection;
27 
28 // The Audio Processing Module (APM) provides a collection of voice processing
29 // components designed for real-time communications software.
30 //
31 // APM operates on two audio streams on a frame-by-frame basis. Frames of the
32 // primary stream, on which all processing is applied, are passed to
33 // |ProcessStream()|. Frames of the reverse direction stream, which are used for
34 // analysis by some components, are passed to |AnalyzeReverseStream()|. On the
35 // client-side, this will typically be the near-end (capture) and far-end
36 // (render) streams, respectively. APM should be placed in the signal chain as
37 // close to the audio hardware abstraction layer (HAL) as possible.
38 //
39 // On the server-side, the reverse stream will normally not be used, with
40 // processing occurring on each incoming stream.
41 //
42 // Component interfaces follow a similar pattern and are accessed through
43 // corresponding getters in APM. All components are disabled at create-time,
44 // with default settings that are recommended for most situations. New settings
45 // can be applied without enabling a component. Enabling a component triggers
46 // memory allocation and initialization to allow it to start processing the
47 // streams.
48 //
49 // Thread safety is provided with the following assumptions to reduce locking
50 // overhead:
51 //   1. The stream getters and setters are called from the same thread as
52 //      ProcessStream(). More precisely, stream functions are never called
53 //      concurrently with ProcessStream().
54 //   2. Parameter getters are never called concurrently with the corresponding
55 //      setter.
56 //
57 // APM accepts only 16-bit linear PCM audio data in frames of 10 ms. Multiple
58 // channels should be interleaved.
59 //
60 // Usage example, omitting error checking:
61 // AudioProcessing* apm = AudioProcessing::Create(0);
62 // apm->set_sample_rate_hz(32000); // Super-wideband processing.
63 //
64 // // Mono capture and stereo render.
65 // apm->set_num_channels(1, 1);
66 // apm->set_num_reverse_channels(2);
67 //
68 // apm->high_pass_filter()->Enable(true);
69 //
70 // apm->echo_cancellation()->enable_drift_compensation(false);
71 // apm->echo_cancellation()->Enable(true);
72 //
// apm->noise_suppression()->set_level(kHigh);
// apm->noise_suppression()->Enable(true);
75 //
76 // apm->gain_control()->set_analog_level_limits(0, 255);
77 // apm->gain_control()->set_mode(kAdaptiveAnalog);
78 // apm->gain_control()->Enable(true);
79 //
80 // apm->voice_detection()->Enable(true);
81 //
82 // // Start a voice call...
83 //
84 // // ... Render frame arrives bound for the audio HAL ...
85 // apm->AnalyzeReverseStream(render_frame);
86 //
87 // // ... Capture frame arrives from the audio HAL ...
88 // // Call required set_stream_ functions.
89 // apm->set_stream_delay_ms(delay_ms);
90 // apm->gain_control()->set_stream_analog_level(analog_level);
91 //
92 // apm->ProcessStream(capture_frame);
93 //
94 // // Call required stream_ functions.
95 // analog_level = apm->gain_control()->stream_analog_level();
96 // has_voice = apm->stream_has_voice();
97 //
// // Repeat render and capture processing for the duration of the call...
99 // // Start a new call...
100 // apm->Initialize();
101 //
102 // // Close the application...
103 // AudioProcessing::Destroy(apm);
104 // apm = NULL;
105 //
106 class AudioProcessing : public Module {
107  public:
108   // Creates a APM instance, with identifier |id|. Use one instance for every
109   // primary audio stream requiring processing. On the client-side, this would
110   // typically be one instance for the near-end stream, and additional instances
111   // for each far-end stream which requires processing. On the server-side,
112   // this would typically be one instance for every incoming stream.
113   static AudioProcessing* Create(int id);
114 
115   // Destroys a |apm| instance.
116   static void Destroy(AudioProcessing* apm);
117 
118   // Initializes internal states, while retaining all user settings. This
119   // should be called before beginning to process a new audio stream. However,
120   // it is not necessary to call before processing the first stream after
121   // creation.
122   virtual int Initialize() = 0;
123 
124   // Sets the sample |rate| in Hz for both the primary and reverse audio
125   // streams. 8000, 16000 or 32000 Hz are permitted.
126   virtual int set_sample_rate_hz(int rate) = 0;
127   virtual int sample_rate_hz() const = 0;
128 
129   // Sets the number of channels for the primary audio stream. Input frames must
130   // contain a number of channels given by |input_channels|, while output frames
131   // will be returned with number of channels given by |output_channels|.
132   virtual int set_num_channels(int input_channels, int output_channels) = 0;
133   virtual int num_input_channels() const = 0;
134   virtual int num_output_channels() const = 0;
135 
136   // Sets the number of channels for the reverse audio stream. Input frames must
137   // contain a number of channels given by |channels|.
138   virtual int set_num_reverse_channels(int channels) = 0;
139   virtual int num_reverse_channels() const = 0;
140 
141   // Processes a 10 ms |frame| of the primary audio stream. On the client-side,
142   // this is the near-end (or captured) audio.
143   //
144   // If needed for enabled functionality, any function with the set_stream_ tag
145   // must be called prior to processing the current frame. Any getter function
146   // with the stream_ tag which is needed should be called after processing.
147   //
148   // The |_frequencyInHz|, |_audioChannel|, and |_payloadDataLengthInSamples|
149   // members of |frame| must be valid, and correspond to settings supplied
150   // to APM.
151   virtual int ProcessStream(AudioFrame* frame) = 0;
152 
153   // Analyzes a 10 ms |frame| of the reverse direction audio stream. The frame
154   // will not be modified. On the client-side, this is the far-end (or to be
155   // rendered) audio.
156   //
157   // It is only necessary to provide this if echo processing is enabled, as the
158   // reverse stream forms the echo reference signal. It is recommended, but not
159   // necessary, to provide if gain control is enabled. On the server-side this
160   // typically will not be used. If you're not sure what to pass in here,
161   // chances are you don't need to use it.
162   //
163   // The |_frequencyInHz|, |_audioChannel|, and |_payloadDataLengthInSamples|
164   // members of |frame| must be valid.
165   //
166   // TODO(ajm): add const to input; requires an implementation fix.
167   virtual int AnalyzeReverseStream(AudioFrame* frame) = 0;
168 
169   // This must be called if and only if echo processing is enabled.
170   //
171   // Sets the |delay| in ms between AnalyzeReverseStream() receiving a far-end
172   // frame and ProcessStream() receiving a near-end frame containing the
173   // corresponding echo. On the client-side this can be expressed as
174   //   delay = (t_render - t_analyze) + (t_process - t_capture)
175   // where,
176   //   - t_analyze is the time a frame is passed to AnalyzeReverseStream() and
177   //     t_render is the time the first sample of the same frame is rendered by
178   //     the audio hardware.
179   //   - t_capture is the time the first sample of a frame is captured by the
180   //     audio hardware and t_pull is the time the same frame is passed to
181   //     ProcessStream().
182   virtual int set_stream_delay_ms(int delay) = 0;
183   virtual int stream_delay_ms() const = 0;
184 
185   // Starts recording debugging information to a file specified by |filename|,
186   // a NULL-terminated string. If there is an ongoing recording, the old file
187   // will be closed, and recording will continue in the newly specified file.
188   // An already existing file will be overwritten without warning.
189   static const int kMaxFilenameSize = 1024;
190   virtual int StartDebugRecording(const char filename[kMaxFilenameSize]) = 0;
191 
192   // Stops recording debugging information, and closes the file. Recording
193   // cannot be resumed in the same file (without overwriting it).
194   virtual int StopDebugRecording() = 0;
195 
196   // These provide access to the component interfaces and should never return
197   // NULL. The pointers will be valid for the lifetime of the APM instance.
198   // The memory for these objects is entirely managed internally.
199   virtual EchoCancellation* echo_cancellation() const = 0;
200   virtual EchoControlMobile* echo_control_mobile() const = 0;
201   virtual GainControl* gain_control() const = 0;
202   virtual HighPassFilter* high_pass_filter() const = 0;
203   virtual LevelEstimator* level_estimator() const = 0;
204   virtual NoiseSuppression* noise_suppression() const = 0;
205   virtual VoiceDetection* voice_detection() const = 0;
206 
207   struct Statistic {
208     int instant;  // Instantaneous value.
209     int average;  // Long-term average.
210     int maximum;  // Long-term maximum.
211     int minimum;  // Long-term minimum.
212   };
213 
214   // Fatal errors.
215   enum Errors {
216     kNoError = 0,
217     kUnspecifiedError = -1,
218     kCreationFailedError = -2,
219     kUnsupportedComponentError = -3,
220     kUnsupportedFunctionError = -4,
221     kNullPointerError = -5,
222     kBadParameterError = -6,
223     kBadSampleRateError = -7,
224     kBadDataLengthError = -8,
225     kBadNumberChannelsError = -9,
226     kFileError = -10,
227     kStreamParameterNotSetError = -11,
228     kNotEnabledError = -12
229   };
230 
231   // Warnings are non-fatal.
232   enum Warnings {
233     // This results when a set_stream_ parameter is out of range. Processing
234     // will continue, but the parameter may have been truncated.
235     kBadStreamParameterWarning = -13,
236   };
237 
238   // Inherited from Module.
TimeUntilNextProcess()239   virtual WebRtc_Word32 TimeUntilNextProcess() { return -1; };
Process()240   virtual WebRtc_Word32 Process() { return -1; };
241 
242  protected:
~AudioProcessing()243   virtual ~AudioProcessing() {};
244 };
245 
246 // The acoustic echo cancellation (AEC) component provides better performance
247 // than AECM but also requires more processing power and is dependent on delay
248 // stability and reporting accuracy. As such it is well-suited and recommended
249 // for PC and IP phone applications.
250 //
251 // Not recommended to be enabled on the server-side.
252 class EchoCancellation {
253  public:
254   // EchoCancellation and EchoControlMobile may not be enabled simultaneously.
255   // Enabling one will disable the other.
256   virtual int Enable(bool enable) = 0;
257   virtual bool is_enabled() const = 0;
258 
259   // Differences in clock speed on the primary and reverse streams can impact
260   // the AEC performance. On the client-side, this could be seen when different
261   // render and capture devices are used, particularly with webcams.
262   //
263   // This enables a compensation mechanism, and requires that
264   // |set_device_sample_rate_hz()| and |set_stream_drift_samples()| be called.
265   virtual int enable_drift_compensation(bool enable) = 0;
266   virtual bool is_drift_compensation_enabled() const = 0;
267 
268   // Provides the sampling rate of the audio devices. It is assumed the render
269   // and capture devices use the same nominal sample rate. Required if and only
270   // if drift compensation is enabled.
271   virtual int set_device_sample_rate_hz(int rate) = 0;
272   virtual int device_sample_rate_hz() const = 0;
273 
274   // Sets the difference between the number of samples rendered and captured by
275   // the audio devices since the last call to |ProcessStream()|. Must be called
276   // if and only if drift compensation is enabled, prior to |ProcessStream()|.
277   virtual int set_stream_drift_samples(int drift) = 0;
278   virtual int stream_drift_samples() const = 0;
279 
280   enum SuppressionLevel {
281     kLowSuppression,
282     kModerateSuppression,
283     kHighSuppression
284   };
285 
286   // Sets the aggressiveness of the suppressor. A higher level trades off
287   // double-talk performance for increased echo suppression.
288   virtual int set_suppression_level(SuppressionLevel level) = 0;
289   virtual SuppressionLevel suppression_level() const = 0;
290 
291   // Returns false if the current frame almost certainly contains no echo
292   // and true if it _might_ contain echo.
293   virtual bool stream_has_echo() const = 0;
294 
295   // Enables the computation of various echo metrics. These are obtained
296   // through |GetMetrics()|.
297   virtual int enable_metrics(bool enable) = 0;
298   virtual bool are_metrics_enabled() const = 0;
299 
300   // Each statistic is reported in dB.
301   // P_far:  Far-end (render) signal power.
302   // P_echo: Near-end (capture) echo signal power.
303   // P_out:  Signal power at the output of the AEC.
304   // P_a:    Internal signal power at the point before the AEC's non-linear
305   //         processor.
306   struct Metrics {
307     // RERL = ERL + ERLE
308     AudioProcessing::Statistic residual_echo_return_loss;
309 
310     // ERL = 10log_10(P_far / P_echo)
311     AudioProcessing::Statistic echo_return_loss;
312 
313     // ERLE = 10log_10(P_echo / P_out)
314     AudioProcessing::Statistic echo_return_loss_enhancement;
315 
316     // (Pre non-linear processing suppression) A_NLP = 10log_10(P_echo / P_a)
317     AudioProcessing::Statistic a_nlp;
318   };
319 
320   // TODO(ajm): discuss the metrics update period.
321   virtual int GetMetrics(Metrics* metrics) = 0;
322 
323  protected:
~EchoCancellation()324   virtual ~EchoCancellation() {};
325 };
326 
// The acoustic echo control for mobile (AECM) component is a low complexity
// robust option intended for use on mobile devices.
//
// Not recommended to be enabled on the server-side.
class EchoControlMobile {
 public:
  // EchoCancellation and EchoControlMobile may not be enabled simultaneously.
  // Enabling one will disable the other.
  virtual int Enable(bool enable) = 0;
  virtual bool is_enabled() const = 0;

  // Recommended settings for particular audio routes. In general, the louder
  // the echo is expected to be, the higher this value should be set. The
  // preferred setting may vary from device to device.
  enum RoutingMode {
    kQuietEarpieceOrHeadset,
    kEarpiece,
    kLoudEarpiece,
    kSpeakerphone,
    kLoudSpeakerphone
  };

  // Sets echo control appropriate for the audio routing |mode| on the device.
  // It can and should be updated during a call if the audio routing changes.
  virtual int set_routing_mode(RoutingMode mode) = 0;
  virtual RoutingMode routing_mode() const = 0;

  // Comfort noise replaces suppressed background noise to maintain a
  // consistent signal level.
  virtual int enable_comfort_noise(bool enable) = 0;
  virtual bool is_comfort_noise_enabled() const = 0;

 protected:
  // Lifetime is managed by the owning AudioProcessing instance.
  virtual ~EchoControlMobile() {}
};
362 
// The automatic gain control (AGC) component brings the signal to an
// appropriate range. This is done by applying a digital gain directly and, in
// the analog mode, prescribing an analog gain to be applied at the audio HAL.
//
// Recommended to be enabled on the client-side.
class GainControl {
 public:
  virtual int Enable(bool enable) = 0;
  virtual bool is_enabled() const = 0;

  // When an analog mode is set, this must be called prior to |ProcessStream()|
  // to pass the current analog level from the audio HAL. Must be within the
  // range provided to |set_analog_level_limits()|.
  virtual int set_stream_analog_level(int level) = 0;

  // When an analog mode is set, this should be called after |ProcessStream()|
  // to obtain the recommended new analog level for the audio HAL. It is the
  // user's responsibility to apply this level.
  virtual int stream_analog_level() = 0;

  enum Mode {
    // Adaptive mode intended for use if an analog volume control is available
    // on the capture device. It will require the user to provide coupling
    // between the OS mixer controls and AGC through the |stream_analog_level()|
    // functions.
    //
    // It consists of an analog gain prescription for the audio device and a
    // digital compression stage.
    kAdaptiveAnalog,

    // Adaptive mode intended for situations in which an analog volume control
    // is unavailable. It operates in a similar fashion to the adaptive analog
    // mode, but with scaling instead applied in the digital domain. As with
    // the analog mode, it additionally uses a digital compression stage.
    kAdaptiveDigital,

    // Fixed mode which enables only the digital compression stage also used by
    // the two adaptive modes.
    //
    // It is distinguished from the adaptive modes by considering only a
    // short time-window of the input signal. It applies a fixed gain through
    // most of the input level range, and compresses (gradually reduces gain
    // with increasing level) the input signal at higher levels. This mode is
    // preferred on embedded devices where the capture signal level is
    // predictable, so that a known gain can be applied.
    kFixedDigital
  };

  virtual int set_mode(Mode mode) = 0;
  virtual Mode mode() const = 0;

  // Sets the target peak |level| (or envelope) of the AGC in dBFs (decibels
  // from digital full-scale). The convention is to use positive values. For
  // instance, passing in a value of 3 corresponds to -3 dBFs, or a target
  // level 3 dB below full-scale. Limited to [0, 31].
  //
  // TODO(ajm): use a negative value here instead, if/when VoE will similarly
  //            update its interface.
  virtual int set_target_level_dbfs(int level) = 0;
  virtual int target_level_dbfs() const = 0;

  // Sets the maximum |gain| the digital compression stage may apply, in dB. A
  // higher number corresponds to greater compression, while a value of 0 will
  // leave the signal uncompressed. Limited to [0, 90].
  virtual int set_compression_gain_db(int gain) = 0;
  virtual int compression_gain_db() const = 0;

  // When enabled, the compression stage will hard limit the signal to the
  // target level. Otherwise, the signal will be compressed but not limited
  // above the target level.
  virtual int enable_limiter(bool enable) = 0;
  virtual bool is_limiter_enabled() const = 0;

  // Sets the |minimum| and |maximum| analog levels of the audio capture device.
  // Must be set if and only if an analog mode is used. Limited to [0, 65535].
  virtual int set_analog_level_limits(int minimum,
                                      int maximum) = 0;
  virtual int analog_level_minimum() const = 0;
  virtual int analog_level_maximum() const = 0;

  // Returns true if the AGC has detected a saturation event (period where the
  // signal reaches digital full-scale) in the current frame and the analog
  // level cannot be reduced.
  //
  // This could be used as an indicator to reduce or disable analog mic gain at
  // the audio HAL.
  virtual bool stream_is_saturated() const = 0;

 protected:
  // Lifetime is managed by the owning AudioProcessing instance.
  virtual ~GainControl() {}
};
454 
// A filtering component which removes DC offset and low-frequency noise.
// Recommended to be enabled on the client-side.
class HighPassFilter {
 public:
  virtual int Enable(bool enable) = 0;
  virtual bool is_enabled() const = 0;

 protected:
  // Lifetime is managed by the owning AudioProcessing instance.
  virtual ~HighPassFilter() {}
};
465 
466 // An estimation component used to retrieve level metrics.
467 class LevelEstimator {
468  public:
469   virtual int Enable(bool enable) = 0;
470   virtual bool is_enabled() const = 0;
471 
472   // The metrics are reported in dBFs calculated as:
473   //   Level = 10log_10(P_s / P_max) [dBFs], where
474   //   P_s is the signal power and P_max is the maximum possible (or peak)
475   //   power. With 16-bit signals, P_max = (2^15)^2.
476   struct Metrics {
477     AudioProcessing::Statistic signal;  // Overall signal level.
478     AudioProcessing::Statistic speech;  // Speech level.
479     AudioProcessing::Statistic noise;   // Noise level.
480   };
481 
482   virtual int GetMetrics(Metrics* metrics, Metrics* reverse_metrics) = 0;
483 
484   //virtual int enable_noise_warning(bool enable) = 0;
485   //bool is_noise_warning_enabled() const = 0;
486   //virtual bool stream_has_high_noise() const = 0;
487 
488  protected:
~LevelEstimator()489   virtual ~LevelEstimator() {};
490 };
491 
// The noise suppression (NS) component attempts to remove noise while
// retaining speech.
//
// Recommended to be enabled on the client-side.
class NoiseSuppression {
 public:
  virtual int Enable(bool enable) = 0;
  virtual bool is_enabled() const = 0;

  // Determines the aggressiveness of the suppression. Increasing the level
  // will reduce the noise level at the expense of a higher speech distortion.
  enum Level {
    kLow,
    kModerate,
    kHigh,
    kVeryHigh
  };

  virtual int set_level(Level level) = 0;
  virtual Level level() const = 0;

 protected:
  // Lifetime is managed by the owning AudioProcessing instance.
  virtual ~NoiseSuppression() {}
};
516 
// The voice activity detection (VAD) component analyzes the stream to
// determine if voice is present. A facility is also provided to pass in an
// external VAD decision.
class VoiceDetection {
 public:
  virtual int Enable(bool enable) = 0;
  virtual bool is_enabled() const = 0;

  // Returns true if voice is detected in the current frame. Should be called
  // after |ProcessStream()|.
  virtual bool stream_has_voice() const = 0;

  // Some of the APM functionality requires a VAD decision. In the case that
  // a decision is externally available for the current frame, it can be passed
  // in here, before |ProcessStream()| is called.
  //
  // VoiceDetection does _not_ need to be enabled to use this. If it happens to
  // be enabled, detection will be skipped for any frame in which an external
  // VAD decision is provided.
  virtual int set_stream_has_voice(bool has_voice) = 0;

  // Specifies the likelihood that a frame will be declared to contain voice.
  // A higher value makes it more likely that speech will not be clipped, at
  // the expense of more noise being detected as voice.
  enum Likelihood {
    kVeryLowLikelihood,
    kLowLikelihood,
    kModerateLikelihood,
    kHighLikelihood
  };

  virtual int set_likelihood(Likelihood likelihood) = 0;
  virtual Likelihood likelihood() const = 0;

  // Sets the |size| of the frames in ms on which the VAD will operate. Larger
  // frames will improve detection accuracy, but reduce the frequency of
  // updates.
  //
  // This does not impact the size of frames passed to |ProcessStream()|.
  virtual int set_frame_size_ms(int size) = 0;
  virtual int frame_size_ms() const = 0;

 protected:
  // Lifetime is managed by the owning AudioProcessing instance.
  virtual ~VoiceDetection() {}
};
562 }  // namespace webrtc
563 
564 #endif  // WEBRTC_MODULES_AUDIO_PROCESSING_MAIN_INTERFACE_AUDIO_PROCESSING_H_
565