• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "modules/audio_processing/vad/vad_audio_proc.h"
12 
13 #include <math.h>
14 #include <stdio.h>
15 #include <string.h>
16 
17 #include "common_audio/third_party/ooura/fft_size_256/fft4g.h"
18 #include "modules/audio_processing/vad/pitch_internal.h"
19 #include "modules/audio_processing/vad/pole_zero_filter.h"
20 #include "modules/audio_processing/vad/vad_audio_proc_internal.h"
21 #include "rtc_base/checks.h"
22 extern "C" {
23 #include "modules/audio_coding/codecs/isac/main/source/filter_functions.h"
24 #include "modules/audio_coding/codecs/isac/main/source/isac_vad.h"
25 #include "modules/audio_coding/codecs/isac/main/source/pitch_estimator.h"
26 #include "modules/audio_coding/codecs/isac/main/source/structs.h"
27 }
28 
29 namespace webrtc {
30 
31 // The following structures are declared anonymous in iSAC's structs.h. To
32 // forward declare them, we use this derived class trick.
33 struct VadAudioProc::PitchAnalysisStruct : public ::PitchAnalysisStruct {};
34 struct VadAudioProc::PreFiltBankstr : public ::PreFiltBankstr {};
35 
36 static constexpr float kFrequencyResolution =
37     kSampleRateHz / static_cast<float>(VadAudioProc::kDftSize);
38 static constexpr int kSilenceRms = 5;
39 
40 // TODO(turajs): Make a Create or Init for VadAudioProc.
VadAudioProc()41 VadAudioProc::VadAudioProc()
42     : audio_buffer_(),
43       num_buffer_samples_(kNumPastSignalSamples),
44       log_old_gain_(-2),
45       old_lag_(50),  // Arbitrary but valid as pitch-lag (in samples).
46       pitch_analysis_handle_(new PitchAnalysisStruct),
47       pre_filter_handle_(new PreFiltBankstr),
48       high_pass_filter_(PoleZeroFilter::Create(kCoeffNumerator,
49                                                kFilterOrder,
50                                                kCoeffDenominator,
51                                                kFilterOrder)) {
52   static_assert(kNumPastSignalSamples + kNumSubframeSamples ==
53                     sizeof(kLpcAnalWin) / sizeof(kLpcAnalWin[0]),
54                 "lpc analysis window incorrect size");
55   static_assert(kLpcOrder + 1 == sizeof(kCorrWeight) / sizeof(kCorrWeight[0]),
56                 "correlation weight incorrect size");
57 
58   // TODO(turajs): Are we doing too much in the constructor?
59   float data[kDftSize];
60   // Make FFT to initialize.
61   ip_[0] = 0;
62   WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
63   // TODO(turajs): Need to initialize high-pass filter.
64 
65   // Initialize iSAC components.
66   WebRtcIsac_InitPreFilterbank(pre_filter_handle_.get());
67   WebRtcIsac_InitPitchAnalysis(pitch_analysis_handle_.get());
68 }
69 
~VadAudioProc()70 VadAudioProc::~VadAudioProc() {}
71 
ResetBuffer()72 void VadAudioProc::ResetBuffer() {
73   memcpy(audio_buffer_, &audio_buffer_[kNumSamplesToProcess],
74          sizeof(audio_buffer_[0]) * kNumPastSignalSamples);
75   num_buffer_samples_ = kNumPastSignalSamples;
76 }
77 
ExtractFeatures(const int16_t * frame,size_t length,AudioFeatures * features)78 int VadAudioProc::ExtractFeatures(const int16_t* frame,
79                                   size_t length,
80                                   AudioFeatures* features) {
81   features->num_frames = 0;
82   if (length != kNumSubframeSamples) {
83     return -1;
84   }
85 
86   // High-pass filter to remove the DC component and very low frequency content.
87   // We have experienced that this high-pass filtering improves voice/non-voiced
88   // classification.
89   if (high_pass_filter_->Filter(frame, kNumSubframeSamples,
90                                 &audio_buffer_[num_buffer_samples_]) != 0) {
91     return -1;
92   }
93 
94   num_buffer_samples_ += kNumSubframeSamples;
95   if (num_buffer_samples_ < kBufferLength) {
96     return 0;
97   }
98   RTC_DCHECK_EQ(num_buffer_samples_, kBufferLength);
99   features->num_frames = kNum10msSubframes;
100   features->silence = false;
101 
102   Rms(features->rms, kMaxNumFrames);
103   for (size_t i = 0; i < kNum10msSubframes; ++i) {
104     if (features->rms[i] < kSilenceRms) {
105       // PitchAnalysis can cause NaNs in the pitch gain if it's fed silence.
106       // Bail out here instead.
107       features->silence = true;
108       ResetBuffer();
109       return 0;
110     }
111   }
112 
113   PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz,
114                 kMaxNumFrames);
115   FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames);
116   ResetBuffer();
117   return 0;
118 }
119 
120 // Computes |kLpcOrder + 1| correlation coefficients.
SubframeCorrelation(double * corr,size_t length_corr,size_t subframe_index)121 void VadAudioProc::SubframeCorrelation(double* corr,
122                                        size_t length_corr,
123                                        size_t subframe_index) {
124   RTC_DCHECK_GE(length_corr, kLpcOrder + 1);
125   double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples];
126   size_t buffer_index = subframe_index * kNumSubframeSamples;
127 
128   for (size_t n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++)
129     windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n];
130 
131   WebRtcIsac_AutoCorr(corr, windowed_audio,
132                       kNumSubframeSamples + kNumPastSignalSamples, kLpcOrder);
133 }
134 
135 // Compute |kNum10msSubframes| sets of LPC coefficients, one per 10 ms input.
136 // The analysis window is 15 ms long and it is centered on the first half of
137 // each 10ms sub-frame. This is equivalent to computing LPC coefficients for the
138 // first half of each 10 ms subframe.
GetLpcPolynomials(double * lpc,size_t length_lpc)139 void VadAudioProc::GetLpcPolynomials(double* lpc, size_t length_lpc) {
140   RTC_DCHECK_GE(length_lpc, kNum10msSubframes * (kLpcOrder + 1));
141   double corr[kLpcOrder + 1];
142   double reflec_coeff[kLpcOrder];
143   for (size_t i = 0, offset_lpc = 0; i < kNum10msSubframes;
144        i++, offset_lpc += kLpcOrder + 1) {
145     SubframeCorrelation(corr, kLpcOrder + 1, i);
146     corr[0] *= 1.0001;
147     // This makes Lev-Durb a bit more stable.
148     for (size_t k = 0; k < kLpcOrder + 1; k++) {
149       corr[k] *= kCorrWeight[k];
150     }
151     WebRtcIsac_LevDurb(&lpc[offset_lpc], reflec_coeff, corr, kLpcOrder);
152   }
153 }
154 
155 // Fit a second order curve to these 3 points and find the location of the
156 // extremum. The points are inverted before curve fitting.
QuadraticInterpolation(float prev_val,float curr_val,float next_val)157 static float QuadraticInterpolation(float prev_val,
158                                     float curr_val,
159                                     float next_val) {
160   // Doing the interpolation in |1 / A(z)|^2.
161   float fractional_index = 0;
162   next_val = 1.0f / next_val;
163   prev_val = 1.0f / prev_val;
164   curr_val = 1.0f / curr_val;
165 
166   fractional_index =
167       -(next_val - prev_val) * 0.5f / (next_val + prev_val - 2.f * curr_val);
168   RTC_DCHECK_LT(fabs(fractional_index), 1);
169   return fractional_index;
170 }
171 
172 // 1 / A(z), where A(z) is defined by |lpc| is a model of the spectral envelope
173 // of the input signal. The local maximum of the spectral envelope corresponds
174 // with the local minimum of A(z). It saves complexity, as we save one
175 // inversion. Furthermore, we find the first local maximum of magnitude squared,
176 // to save on one square root.
FindFirstSpectralPeaks(double * f_peak,size_t length_f_peak)177 void VadAudioProc::FindFirstSpectralPeaks(double* f_peak,
178                                           size_t length_f_peak) {
179   RTC_DCHECK_GE(length_f_peak, kNum10msSubframes);
180   double lpc[kNum10msSubframes * (kLpcOrder + 1)];
181   // For all sub-frames.
182   GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1));
183 
184   const size_t kNumDftCoefficients = kDftSize / 2 + 1;
185   float data[kDftSize];
186 
187   for (size_t i = 0; i < kNum10msSubframes; i++) {
188     // Convert to float with zero pad.
189     memset(data, 0, sizeof(data));
190     for (size_t n = 0; n < kLpcOrder + 1; n++) {
191       data[n] = static_cast<float>(lpc[i * (kLpcOrder + 1) + n]);
192     }
193     // Transform to frequency domain.
194     WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
195 
196     size_t index_peak = 0;
197     float prev_magn_sqr = data[0] * data[0];
198     float curr_magn_sqr = data[2] * data[2] + data[3] * data[3];
199     float next_magn_sqr;
200     bool found_peak = false;
201     for (size_t n = 2; n < kNumDftCoefficients - 1; n++) {
202       next_magn_sqr =
203           data[2 * n] * data[2 * n] + data[2 * n + 1] * data[2 * n + 1];
204       if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
205         found_peak = true;
206         index_peak = n - 1;
207         break;
208       }
209       prev_magn_sqr = curr_magn_sqr;
210       curr_magn_sqr = next_magn_sqr;
211     }
212     float fractional_index = 0;
213     if (!found_peak) {
214       // Checking if |kNumDftCoefficients - 1| is the local minimum.
215       next_magn_sqr = data[1] * data[1];
216       if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
217         index_peak = kNumDftCoefficients - 1;
218       }
219     } else {
220       // A peak is found, do a simple quadratic interpolation to get a more
221       // accurate estimate of the peak location.
222       fractional_index =
223           QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr, next_magn_sqr);
224     }
225     f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution;
226   }
227 }
228 
229 // Using iSAC functions to estimate pitch gains & lags.
PitchAnalysis(double * log_pitch_gains,double * pitch_lags_hz,size_t length)230 void VadAudioProc::PitchAnalysis(double* log_pitch_gains,
231                                  double* pitch_lags_hz,
232                                  size_t length) {
233   // TODO(turajs): This can be "imported" from iSAC & and the next two
234   // constants.
235   RTC_DCHECK_GE(length, kNum10msSubframes);
236   const int kNumPitchSubframes = 4;
237   double gains[kNumPitchSubframes];
238   double lags[kNumPitchSubframes];
239 
240   const int kNumSubbandFrameSamples = 240;
241   const int kNumLookaheadSamples = 24;
242 
243   float lower[kNumSubbandFrameSamples];
244   float upper[kNumSubbandFrameSamples];
245   double lower_lookahead[kNumSubbandFrameSamples];
246   double upper_lookahead[kNumSubbandFrameSamples];
247   double lower_lookahead_pre_filter[kNumSubbandFrameSamples +
248                                     kNumLookaheadSamples];
249 
250   // Split signal to lower and upper bands
251   WebRtcIsac_SplitAndFilterFloat(&audio_buffer_[kNumPastSignalSamples], lower,
252                                  upper, lower_lookahead, upper_lookahead,
253                                  pre_filter_handle_.get());
254   WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter,
255                            pitch_analysis_handle_.get(), lags, gains);
256 
257   // Lags are computed on lower-band signal with sampling rate half of the
258   // input signal.
259   GetSubframesPitchParameters(
260       kSampleRateHz / 2, gains, lags, kNumPitchSubframes, kNum10msSubframes,
261       &log_old_gain_, &old_lag_, log_pitch_gains, pitch_lags_hz);
262 }
263 
Rms(double * rms,size_t length_rms)264 void VadAudioProc::Rms(double* rms, size_t length_rms) {
265   RTC_DCHECK_GE(length_rms, kNum10msSubframes);
266   size_t offset = kNumPastSignalSamples;
267   for (size_t i = 0; i < kNum10msSubframes; i++) {
268     rms[i] = 0;
269     for (size_t n = 0; n < kNumSubframeSamples; n++, offset++)
270       rms[i] += audio_buffer_[offset] * audio_buffer_[offset];
271     rms[i] = sqrt(rms[i] / kNumSubframeSamples);
272   }
273 }
274 
275 }  // namespace webrtc
276