/*
 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "common_audio/vad/vad_core.h"

#include "rtc_base/sanitizer.h"
#include "common_audio/signal_processing/include/signal_processing_library.h"
#include "common_audio/vad/vad_filterbank.h"
#include "common_audio/vad/vad_gmm.h"
#include "common_audio/vad/vad_sp.h"

// Spectrum Weighting
static const int16_t kSpectrumWeight[kNumChannels] = { 6, 8, 10, 12, 14, 16 };
static const int16_t kNoiseUpdateConst = 655;  // Q15
static const int16_t kSpeechUpdateConst = 6554;  // Q15
static const int16_t kBackEta = 154;  // Q8
// Minimum difference between the two models, Q5
static const int16_t kMinimumDifference[kNumChannels] = {
    544, 544, 576, 576, 576, 576 };
// Upper limit of mean value for speech model, Q7
static const int16_t kMaximumSpeech[kNumChannels] = {
    11392, 11392, 11520, 11520, 11520, 11520 };
// Minimum value for mean value
static const int16_t kMinimumMean[kNumGaussians] = { 640, 768 };
// Upper limit of mean value for noise model, Q7
static const int16_t kMaximumNoise[kNumChannels] = {
    9216, 9088, 8960, 8832, 8704, 8576 };
// Start values for the Gaussian models, Q7
// Weights for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataWeights[kTableSize] = {
    34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103 };
// Weights for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataWeights[kTableSize] = {
    48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81 };
// Means for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataMeans[kTableSize] = {
    6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362 };
// Means for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataMeans[kTableSize] = {
    8306, 10085, 10078, 11823, 11843, 6309, 9473, 9571, 10879, 7581, 8180, 7483
};
// Stds for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataStds[kTableSize] = {
    378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455 };
// Stds for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataStds[kTableSize] = {
    555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850 };
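// Note on layout: each `kTableSize` (= kNumChannels * kNumGaussians) table
// above stores the first Gaussian for the six channels in entries 0..5 and
// the second Gaussian in entries 6..11, matching the
// `channel + k * kNumChannels` indexing used throughout this file.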

// Constants used in GmmProbability().
//
// Maximum number of counted speech (VAD = 1) frames in a row.
static const int16_t kMaxSpeechFrames = 6;
// Minimum standard deviation for both speech and noise.
static const int16_t kMinStd = 384;

// Constants in WebRtcVad_InitCore().
// Default aggressiveness mode.
static const short kDefaultMode = 0;
static const int kInitCheck = 42;

// Constants used in WebRtcVad_set_mode_core().
//
// Thresholds for different frame lengths (10 ms, 20 ms and 30 ms).
//
// Mode 0, Quality.
static const int16_t kOverHangMax1Q[3] = { 8, 4, 3 };
static const int16_t kOverHangMax2Q[3] = { 14, 7, 5 };
static const int16_t kLocalThresholdQ[3] = { 24, 21, 24 };
static const int16_t kGlobalThresholdQ[3] = { 57, 48, 57 };
// Mode 1, Low bitrate.
static const int16_t kOverHangMax1LBR[3] = { 8, 4, 3 };
static const int16_t kOverHangMax2LBR[3] = { 14, 7, 5 };
static const int16_t kLocalThresholdLBR[3] = { 37, 32, 37 };
static const int16_t kGlobalThresholdLBR[3] = { 100, 80, 100 };
// Mode 2, Aggressive.
static const int16_t kOverHangMax1AGG[3] = { 6, 3, 2 };
static const int16_t kOverHangMax2AGG[3] = { 9, 5, 3 };
static const int16_t kLocalThresholdAGG[3] = { 82, 78, 82 };
static const int16_t kGlobalThresholdAGG[3] = { 285, 260, 285 };
// Mode 3, Very aggressive.
static const int16_t kOverHangMax1VAG[3] = { 6, 3, 2 };
static const int16_t kOverHangMax2VAG[3] = { 9, 5, 3 };
static const int16_t kLocalThresholdVAG[3] = { 94, 94, 94 };
static const int16_t kGlobalThresholdVAG[3] = { 1100, 1050, 1100 };

// Calculates the weighted average w.r.t. number of Gaussians. The `data` are
// updated with an `offset` before averaging.
//
// - data     [i/o] : Data to average.
// - offset   [i]   : An offset added to `data`.
// - weights  [i]   : Weights used for averaging.
//
// returns          : The weighted average.
static int32_t WeightedAverage(int16_t* data, int16_t offset,
                               const int16_t* weights) {
  int k;
  int32_t weighted_average = 0;

  for (k = 0; k < kNumGaussians; k++) {
    data[k * kNumChannels] += offset;
    weighted_average += data[k * kNumChannels] * weights[k * kNumChannels];
  }
  return weighted_average;
}
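// Example (illustrative): for channel 0, the two noise means live at
// noise_means[0] and noise_means[kNumChannels], so
//   WeightedAverage(&self->noise_means[0], 0, &kNoiseDataWeights[0])
// returns kNoiseDataWeights[0] * noise_means[0] +
// kNoiseDataWeights[6] * noise_means[6], a Q14 value (Q7 * Q7).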

// An s16 x s32 -> s32 multiplication that's allowed to overflow. (It's still
// undefined behavior, so not a good idea; this just makes UBSan ignore the
// violation, so that our old code can continue to do what it's always been
// doing.)
static inline int32_t RTC_NO_SANITIZE("signed-integer-overflow")
OverflowingMulS16ByS32ToS32(int16_t a, int32_t b) {
  return a * b;
}
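// A portable alternative (a sketch only; not used here, so the original code
// path stays untouched) is to multiply in unsigned arithmetic, where
// wraparound is well defined, and convert back. The final conversion is
// merely implementation-defined rather than undefined:
//
//   static inline int32_t WrappingMulS16ByS32ToS32(int16_t a, int32_t b) {
//     return (int32_t)((uint32_t)a * (uint32_t)b);  // Wraps mod 2^32.
//   }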

// Calculates the probabilities for both speech and background noise using
// Gaussian Mixture Models (GMM). A hypothesis-test is performed to decide
// which type of signal is most probable.
//
// - self           [i/o] : Pointer to VAD instance
// - features       [i]   : Feature vector of length `kNumChannels`
//                          = log10(energy in frequency band)
// - total_power    [i]   : Total power in audio frame.
// - frame_length   [i]   : Number of input samples
//
// - returns              : the VAD decision (0 - noise, 1 - speech).
static int16_t GmmProbability(VadInstT* self, int16_t* features,
                              int16_t total_power, size_t frame_length) {
  int channel, k;
  int16_t feature_minimum;
  int16_t h0, h1;
  int16_t log_likelihood_ratio;
  int16_t vadflag = 0;
  int16_t shifts_h0, shifts_h1;
  int16_t tmp_s16, tmp1_s16, tmp2_s16;
  int16_t diff;
  int gaussian;
  int16_t nmk, nmk2, nmk3, smk, smk2, nsk, ssk;
  int16_t delt, ndelt;
  int16_t maxspe, maxmu;
  int16_t deltaN[kTableSize], deltaS[kTableSize];
  int16_t ngprvec[kTableSize] = { 0 };  // Conditional probability = 0.
  int16_t sgprvec[kTableSize] = { 0 };  // Conditional probability = 0.
  int32_t h0_test, h1_test;
  int32_t tmp1_s32, tmp2_s32;
  int32_t sum_log_likelihood_ratios = 0;
  int32_t noise_global_mean, speech_global_mean;
  int32_t noise_probability[kNumGaussians], speech_probability[kNumGaussians];
  int16_t overhead1, overhead2, individualTest, totalTest;

  // Set various thresholds based on frame lengths (80, 160 or 240 samples).
  if (frame_length == 80) {
    overhead1 = self->over_hang_max_1[0];
    overhead2 = self->over_hang_max_2[0];
    individualTest = self->individual[0];
    totalTest = self->total[0];
  } else if (frame_length == 160) {
    overhead1 = self->over_hang_max_1[1];
    overhead2 = self->over_hang_max_2[1];
    individualTest = self->individual[1];
    totalTest = self->total[1];
  } else {
    overhead1 = self->over_hang_max_1[2];
    overhead2 = self->over_hang_max_2[2];
    individualTest = self->individual[2];
    totalTest = self->total[2];
  }

  if (total_power > kMinEnergy) {
    // The signal power of the current frame is large enough for processing.
    // The processing consists of two parts:
    // 1) Calculating the likelihood of speech and thereby a VAD decision.
    // 2) Updating the underlying model, w.r.t. the decision made.

    // The detection scheme is an LRT with hypothesis
    // H0: Noise
    // H1: Speech
    //
    // We combine a global LRT with local tests, for each frequency sub-band,
    // here defined as `channel`.
    for (channel = 0; channel < kNumChannels; channel++) {
      // For each channel we model the probability with a GMM consisting of
      // `kNumGaussians`, with different means and standard deviations
      // depending on H0 or H1.
      h0_test = 0;
      h1_test = 0;
      for (k = 0; k < kNumGaussians; k++) {
        gaussian = channel + k * kNumChannels;
        // Probability under H0, that is, probability of frame being noise.
        // Value given in Q27 = Q7 * Q20.
        tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
                                                 self->noise_means[gaussian],
                                                 self->noise_stds[gaussian],
                                                 &deltaN[gaussian]);
        noise_probability[k] = kNoiseDataWeights[gaussian] * tmp1_s32;
        h0_test += noise_probability[k];  // Q27

        // Probability under H1, that is, probability of frame being speech.
        // Value given in Q27 = Q7 * Q20.
        tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
                                                 self->speech_means[gaussian],
                                                 self->speech_stds[gaussian],
                                                 &deltaS[gaussian]);
        speech_probability[k] = kSpeechDataWeights[gaussian] * tmp1_s32;
        h1_test += speech_probability[k];  // Q27
      }

      // Calculate the log likelihood ratio: log2(Pr{X|H1} / Pr{X|H0}).
      // Approximation:
      // log2(Pr{X|H1} / Pr{X|H0}) = log2(Pr{X|H1}*2^Q) - log2(Pr{X|H0}*2^Q)
      //                           = log2(h1_test) - log2(h0_test)
      //                           = log2(2^(31-shifts_h1)*(1+b1))
      //                             - log2(2^(31-shifts_h0)*(1+b0))
      //                           = shifts_h0 - shifts_h1
      //                             + log2(1+b1) - log2(1+b0)
      //                          ~= shifts_h0 - shifts_h1
      //
      // Note that b0 and b1 are values less than 1, hence, 0 <= log2(1+b0) < 1.
      // Further, b0 and b1 are independent and on the average the two terms
      // cancel.
      shifts_h0 = WebRtcSpl_NormW32(h0_test);
      shifts_h1 = WebRtcSpl_NormW32(h1_test);
      if (h0_test == 0) {
        shifts_h0 = 31;
      }
      if (h1_test == 0) {
        shifts_h1 = 31;
      }
      log_likelihood_ratio = shifts_h0 - shifts_h1;
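      // Worked example: with h1_test = 2^20 and h0_test = 2^24,
      // WebRtcSpl_NormW32() returns 10 and 6 respectively (the left shifts
      // that place the most significant bit at position 30), so the ratio is
      // approximated as 6 - 10 = -4 ~= log2(2^20 / 2^24).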

      // Update `sum_log_likelihood_ratios` with spectrum weighting. This is
      // used for the global VAD decision.
      sum_log_likelihood_ratios +=
          (int32_t) (log_likelihood_ratio * kSpectrumWeight[channel]);

      // Local VAD decision.
      if ((log_likelihood_ratio * 4) > individualTest) {
        vadflag = 1;
      }
      // TODO(bjornv): The conditional probabilities below assume that the
      // number of Gaussians is hard coded to two. Find a way to generalize.
      // Calculate local noise probabilities used later when updating the GMM.
      h0 = (int16_t) (h0_test >> 12);  // Q15
      if (h0 > 0) {
        // High probability of noise. Assign conditional probabilities for
        // each Gaussian in the GMM.
        tmp1_s32 = (noise_probability[0] & 0xFFFFF000) << 2;  // Q29
        ngprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h0);  // Q14
        ngprvec[channel + kNumChannels] = 16384 - ngprvec[channel];
      } else {
        // Low noise probability. Assign conditional probability 1 to the
        // first Gaussian and 0 to the rest (which is already set at
        // initialization).
        ngprvec[channel] = 16384;
      }

      // Calculate local speech probabilities used later when updating the
      // GMM.
      h1 = (int16_t) (h1_test >> 12);  // Q15
      if (h1 > 0) {
        // High probability of speech. Assign conditional probabilities for
        // each Gaussian in the GMM. Otherwise use the initialized values,
        // i.e., 0.
        tmp1_s32 = (speech_probability[0] & 0xFFFFF000) << 2;  // Q29
        sgprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h1);  // Q14
        sgprvec[channel + kNumChannels] = 16384 - sgprvec[channel];
      }
    }

    // Make a global VAD decision.
    vadflag |= (sum_log_likelihood_ratios >= totalTest);

    // Update the model parameters.
    maxspe = 12800;
    for (channel = 0; channel < kNumChannels; channel++) {

      // Get minimum value in past which is used for long term correction in
      // Q4.
      feature_minimum = WebRtcVad_FindMinimum(self, features[channel], channel);

      // Compute the "global" mean, i.e., the weighted sum of the two means.
      noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
                                          &kNoiseDataWeights[channel]);
      tmp1_s16 = (int16_t) (noise_global_mean >> 6);  // Q8

      for (k = 0; k < kNumGaussians; k++) {
        gaussian = channel + k * kNumChannels;

        nmk = self->noise_means[gaussian];
        smk = self->speech_means[gaussian];
        nsk = self->noise_stds[gaussian];
        ssk = self->speech_stds[gaussian];

        // Update noise mean vector if the frame consists of noise only.
        nmk2 = nmk;
        if (!vadflag) {
          // deltaN = (x-mu)/sigma^2
          // ngprvec[k] = `noise_probability[k]` /
          //   (`noise_probability[0]` + `noise_probability[1]`)

          // (Q14 * Q11 >> 11) = Q14.
          delt = (int16_t)((ngprvec[gaussian] * deltaN[gaussian]) >> 11);
          // Q7 + (Q14 * Q15 >> 22) = Q7.
          nmk2 = nmk + (int16_t)((delt * kNoiseUpdateConst) >> 22);
        }

        // Long term correction of the noise mean.
        // Q8 - Q8 = Q8.
        ndelt = (feature_minimum << 4) - tmp1_s16;
        // Q7 + (Q8 * Q8) >> 9 = Q7.
        nmk3 = nmk2 + (int16_t)((ndelt * kBackEta) >> 9);
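        // E.g., if the tracked minimum (in Q8 after the shift above) lies
        // 512 Q8 units (2.0) below the global noise mean, then ndelt = -512
        // and the correction is (-512 * 154) >> 9 = -154, i.e., the mean is
        // pulled down by about 1.2 in Q7.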

        // Control that the noise mean does not drift too much.
        tmp_s16 = (int16_t) ((k + 5) << 7);
        if (nmk3 < tmp_s16) {
          nmk3 = tmp_s16;
        }
        tmp_s16 = (int16_t) ((72 + k - channel) << 7);
        if (nmk3 > tmp_s16) {
          nmk3 = tmp_s16;
        }
        self->noise_means[gaussian] = nmk3;

        if (vadflag) {
          // Update speech mean vector:
          // `deltaS` = (x-mu)/sigma^2
          // sgprvec[k] = `speech_probability[k]` /
          //   (`speech_probability[0]` + `speech_probability[1]`)

          // (Q14 * Q11) >> 11 = Q14.
          delt = (int16_t)((sgprvec[gaussian] * deltaS[gaussian]) >> 11);
          // Q14 * Q15 >> 21 = Q8.
          tmp_s16 = (int16_t)((delt * kSpeechUpdateConst) >> 21);
          // Q7 + (Q8 >> 1) = Q7. With rounding.
          smk2 = smk + ((tmp_s16 + 1) >> 1);
          // Control that the speech mean does not drift too much.
          maxmu = maxspe + 640;
          if (smk2 < kMinimumMean[k]) {
            smk2 = kMinimumMean[k];
          }
          if (smk2 > maxmu) {
            smk2 = maxmu;
          }
          self->speech_means[gaussian] = smk2;  // Q7.

          // (Q7 >> 3) = Q4. With rounding.
          tmp_s16 = ((smk + 4) >> 3);

          tmp_s16 = features[channel] - tmp_s16;  // Q4
          // (Q11 * Q4 >> 3) = Q12.
          tmp1_s32 = (deltaS[gaussian] * tmp_s16) >> 3;
          tmp2_s32 = tmp1_s32 - 4096;
          tmp_s16 = sgprvec[gaussian] >> 2;
          // (Q14 >> 2) * Q12 = Q24.
          tmp1_s32 = tmp_s16 * tmp2_s32;

          tmp2_s32 = tmp1_s32 >> 4;  // Q20

          // 0.1 * Q20 / Q7 = Q13.
          if (tmp2_s32 > 0) {
            tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp2_s32, ssk * 10);
          } else {
            tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp2_s32, ssk * 10);
            tmp_s16 = -tmp_s16;
          }
          // Divide by 4 giving an update factor of 0.025 (= 0.1 / 4).
          // Note that division by 4 equals shift by 2, hence,
          // (Q13 >> 8) = (Q13 >> 6) / 4 = Q7.
          tmp_s16 += 128;  // Rounding.
          ssk += (tmp_s16 >> 8);
          if (ssk < kMinStd) {
            ssk = kMinStd;
          }
          self->speech_stds[gaussian] = ssk;
        } else {
          // Update GMM variance vectors.
          // deltaN * (features[channel] - nmk) - 1
          // Q4 - (Q7 >> 3) = Q4.
          tmp_s16 = features[channel] - (nmk >> 3);
          // (Q11 * Q4 >> 3) = Q12.
          tmp1_s32 = (deltaN[gaussian] * tmp_s16) >> 3;
          tmp1_s32 -= 4096;

          // (Q14 >> 2) * Q12 = Q24.
          tmp_s16 = (ngprvec[gaussian] + 2) >> 2;
          tmp2_s32 = OverflowingMulS16ByS32ToS32(tmp_s16, tmp1_s32);
          // Q20 * approx 0.001 (2^-10=0.0009766), hence,
          // (Q24 >> 14) = (Q24 >> 4) / 2^10 = Q20.
          tmp1_s32 = tmp2_s32 >> 14;

          // Q20 / Q7 = Q13.
          if (tmp1_s32 > 0) {
            tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, nsk);
          } else {
            tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp1_s32, nsk);
            tmp_s16 = -tmp_s16;
          }
          tmp_s16 += 32;  // Rounding
          nsk += tmp_s16 >> 6;  // Q13 >> 6 = Q7.
          if (nsk < kMinStd) {
            nsk = kMinStd;
          }
          self->noise_stds[gaussian] = nsk;
        }
      }

      // Separate models if they are too close.
      // `noise_global_mean` in Q14 (= Q7 * Q7).
      noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
                                          &kNoiseDataWeights[channel]);

      // `speech_global_mean` in Q14 (= Q7 * Q7).
      speech_global_mean = WeightedAverage(&self->speech_means[channel], 0,
                                           &kSpeechDataWeights[channel]);

      // `diff` = "global" speech mean - "global" noise mean.
      // (Q14 >> 9) - (Q14 >> 9) = Q5.
      diff = (int16_t) (speech_global_mean >> 9) -
             (int16_t) (noise_global_mean >> 9);
      if (diff < kMinimumDifference[channel]) {
        tmp_s16 = kMinimumDifference[channel] - diff;

        // `tmp1_s16` = ~0.8 * (kMinimumDifference - diff) in Q7.
        // `tmp2_s16` = ~0.2 * (kMinimumDifference - diff) in Q7.
        tmp1_s16 = (int16_t)((13 * tmp_s16) >> 2);
        tmp2_s16 = (int16_t)((3 * tmp_s16) >> 2);

        // Move Gaussian means for speech model by `tmp1_s16` and update
        // `speech_global_mean`. Note that `self->speech_means[channel]` is
        // changed after the call.
        speech_global_mean = WeightedAverage(&self->speech_means[channel],
                                             tmp1_s16,
                                             &kSpeechDataWeights[channel]);

        // Move Gaussian means for noise model by -`tmp2_s16` and update
        // `noise_global_mean`. Note that `self->noise_means[channel]` is
        // changed after the call.
        noise_global_mean = WeightedAverage(&self->noise_means[channel],
                                            -tmp2_s16,
                                            &kNoiseDataWeights[channel]);
      }
      // Control that the speech & noise means do not drift too much.
      maxspe = kMaximumSpeech[channel];
      tmp2_s16 = (int16_t) (speech_global_mean >> 7);
      if (tmp2_s16 > maxspe) {
        // Upper limit of speech model.
        tmp2_s16 -= maxspe;

        for (k = 0; k < kNumGaussians; k++) {
          self->speech_means[channel + k * kNumChannels] -= tmp2_s16;
        }
      }

      tmp2_s16 = (int16_t) (noise_global_mean >> 7);
      if (tmp2_s16 > kMaximumNoise[channel]) {
        tmp2_s16 -= kMaximumNoise[channel];

        for (k = 0; k < kNumGaussians; k++) {
          self->noise_means[channel + k * kNumChannels] -= tmp2_s16;
        }
      }
    }
    self->frame_counter++;
  }

  // Smooth with respect to transition hysteresis.
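  // Note: during hangover the flag returned below is 2 + the remaining
  // overhang count rather than 1. Callers that need a binary decision (the
  // public wrapper is expected to clamp positive values to 1) should treat
  // any nonzero value as speech.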
  if (!vadflag) {
    if (self->over_hang > 0) {
      vadflag = 2 + self->over_hang;
      self->over_hang--;
    }
    self->num_of_speech = 0;
  } else {
    self->num_of_speech++;
    if (self->num_of_speech > kMaxSpeechFrames) {
      self->num_of_speech = kMaxSpeechFrames;
      self->over_hang = overhead2;
    } else {
      self->over_hang = overhead1;
    }
  }
  return vadflag;
}

// Initialize the VAD. Set aggressiveness mode to default value.
int WebRtcVad_InitCore(VadInstT* self) {
  int i;

  if (self == NULL) {
    return -1;
  }

  // Initialization of general struct variables.
  self->vad = 1;  // Speech active (=1).
  self->frame_counter = 0;
  self->over_hang = 0;
  self->num_of_speech = 0;

  // Initialization of downsampling filter state.
  memset(self->downsampling_filter_states, 0,
         sizeof(self->downsampling_filter_states));

  // Initialization of 48 to 8 kHz downsampling.
  WebRtcSpl_ResetResample48khzTo8khz(&self->state_48_to_8);

  // Read initial PDF parameters.
  for (i = 0; i < kTableSize; i++) {
    self->noise_means[i] = kNoiseDataMeans[i];
    self->speech_means[i] = kSpeechDataMeans[i];
    self->noise_stds[i] = kNoiseDataStds[i];
    self->speech_stds[i] = kSpeechDataStds[i];
  }

  // Initialize Index and Minimum value vectors.
  for (i = 0; i < 16 * kNumChannels; i++) {
    self->low_value_vector[i] = 10000;
    self->index_vector[i] = 0;
  }

  // Initialize splitting filter states.
  memset(self->upper_state, 0, sizeof(self->upper_state));
  memset(self->lower_state, 0, sizeof(self->lower_state));

  // Initialize high pass filter states.
  memset(self->hp_filter_state, 0, sizeof(self->hp_filter_state));

  // Initialize mean value memory, for WebRtcVad_FindMinimum().
  for (i = 0; i < kNumChannels; i++) {
    self->mean_value[i] = 1600;
  }

  // Set aggressiveness mode to default (=`kDefaultMode`).
  if (WebRtcVad_set_mode_core(self, kDefaultMode) != 0) {
    return -1;
  }

  self->init_flag = kInitCheck;

  return 0;
}
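// Minimal usage sketch of the core API (illustrative only; real callers go
// through the public webrtc_vad.h wrappers, which allocate the instance and
// validate sample rates and frame lengths):
//
//   VadInstT inst;
//   int16_t frame[80];  // 10 ms of 8 kHz audio, filled by the caller.
//   if (WebRtcVad_InitCore(&inst) == 0) {
//     int vadflag = WebRtcVad_CalcVad8khz(&inst, frame, 80);  // 0 == noise.
//   }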

// Set aggressiveness mode
int WebRtcVad_set_mode_core(VadInstT* self, int mode) {
  int return_value = 0;

  switch (mode) {
    case 0:
      // Quality mode.
      memcpy(self->over_hang_max_1, kOverHangMax1Q,
             sizeof(self->over_hang_max_1));
      memcpy(self->over_hang_max_2, kOverHangMax2Q,
             sizeof(self->over_hang_max_2));
      memcpy(self->individual, kLocalThresholdQ,
             sizeof(self->individual));
      memcpy(self->total, kGlobalThresholdQ,
             sizeof(self->total));
      break;
    case 1:
      // Low bitrate mode.
      memcpy(self->over_hang_max_1, kOverHangMax1LBR,
             sizeof(self->over_hang_max_1));
      memcpy(self->over_hang_max_2, kOverHangMax2LBR,
             sizeof(self->over_hang_max_2));
      memcpy(self->individual, kLocalThresholdLBR,
             sizeof(self->individual));
      memcpy(self->total, kGlobalThresholdLBR,
             sizeof(self->total));
      break;
    case 2:
      // Aggressive mode.
      memcpy(self->over_hang_max_1, kOverHangMax1AGG,
             sizeof(self->over_hang_max_1));
      memcpy(self->over_hang_max_2, kOverHangMax2AGG,
             sizeof(self->over_hang_max_2));
      memcpy(self->individual, kLocalThresholdAGG,
             sizeof(self->individual));
      memcpy(self->total, kGlobalThresholdAGG,
             sizeof(self->total));
      break;
    case 3:
      // Very aggressive mode.
      memcpy(self->over_hang_max_1, kOverHangMax1VAG,
             sizeof(self->over_hang_max_1));
      memcpy(self->over_hang_max_2, kOverHangMax2VAG,
             sizeof(self->over_hang_max_2));
      memcpy(self->individual, kLocalThresholdVAG,
             sizeof(self->individual));
      memcpy(self->total, kGlobalThresholdVAG,
             sizeof(self->total));
      break;
    default:
      return_value = -1;
      break;
  }

  return return_value;
}
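// The four modes trade missed detections for false alarms monotonically: the
// Quality thresholds are the lowest (e.g., kGlobalThresholdQ starts at 57)
// and the Very aggressive ones the highest (kGlobalThresholdVAG starts at
// 1100), so higher modes demand stronger evidence before flagging speech.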

// Calculate the VAD decision by first extracting feature values and then
// calculating the probability for both speech and background noise.
int WebRtcVad_CalcVad48khz(VadInstT* inst, const int16_t* speech_frame,
                           size_t frame_length) {
  int vad;
  size_t i;
  int16_t speech_nb[240];  // 30 ms in 8 kHz.
  // `tmp_mem` is temporary memory used by the resampling function; its length
  // is one 10 ms frame at 48 kHz (480 samples) plus 256 extra.
  int32_t tmp_mem[480 + 256] = { 0 };
  const size_t kFrameLen10ms48khz = 480;
  const size_t kFrameLen10ms8khz = 80;
  size_t num_10ms_frames = frame_length / kFrameLen10ms48khz;
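  // E.g., a 30 ms frame has 1440 samples at 48 kHz, so the loop below runs
  // three times and the VAD call at the end gets 1440 / 6 = 240 samples.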

  for (i = 0; i < num_10ms_frames; i++) {
    // Resample each consecutive 10 ms chunk of the input.
    WebRtcSpl_Resample48khzTo8khz(&speech_frame[i * kFrameLen10ms48khz],
                                  &speech_nb[i * kFrameLen10ms8khz],
                                  &inst->state_48_to_8,
                                  tmp_mem);
  }

  // Do VAD on an 8 kHz signal
  vad = WebRtcVad_CalcVad8khz(inst, speech_nb, frame_length / 6);

  return vad;
}

int WebRtcVad_CalcVad32khz(VadInstT* inst, const int16_t* speech_frame,
                           size_t frame_length)
{
  size_t len;
  int vad;
  int16_t speechWB[480];  // Downsampled speech frame: 30 ms at 16 kHz, from
                          // 960 samples (30 ms in SWB).
  int16_t speechNB[240];  // Downsampled speech frame: 30 ms at 8 kHz, from
                          // 480 samples (30 ms in WB).

  // Downsample signal 32->16->8 before doing VAD.
  WebRtcVad_Downsampling(speech_frame, speechWB,
                         &(inst->downsampling_filter_states[2]),
                         frame_length);
  len = frame_length / 2;

  WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states,
                         len);
  len /= 2;
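  // E.g., a 30 ms frame: 960 samples at 32 kHz -> 480 at 16 kHz -> 240 at
  // 8 kHz.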

  // Do VAD on an 8 kHz signal
  vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);

  return vad;
}

int WebRtcVad_CalcVad16khz(VadInstT* inst, const int16_t* speech_frame,
                           size_t frame_length)
{
  size_t len;
  int vad;
  int16_t speechNB[240];  // Downsampled speech frame: 30 ms at 8 kHz, from
                          // 480 samples (30 ms in WB).

  // Wideband: Downsample signal before doing VAD.
  WebRtcVad_Downsampling(speech_frame, speechNB,
                         inst->downsampling_filter_states, frame_length);

  len = frame_length / 2;
  vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);

  return vad;
}

int WebRtcVad_CalcVad8khz(VadInstT* inst, const int16_t* speech_frame,
                          size_t frame_length)
{
  int16_t feature_vector[kNumChannels], total_power;

  // Get power in the bands.
  total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length,
                                            feature_vector);

  // Make a VAD decision.
  inst->vad = GmmProbability(inst, feature_vector, total_power, frame_length);

  return inst->vad;
}