1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 //
// To learn more about the algorithm used and the original code on which this
// is based, see
7 // https://wiki.corp.google.com/twiki/bin/view/Main/ChromeGoogleCodeXRef
8
9 #include "content/browser/speech/endpointer/energy_endpointer.h"
10
11 #include <math.h>
12
13 #include "base/logging.h"
14
15 namespace {
16
17 // Returns the RMS (quadratic mean) of the input signal.
RMS(const int16 * samples,int num_samples)18 float RMS(const int16* samples, int num_samples) {
19 int64 ssq_int64 = 0;
20 int64 sum_int64 = 0;
21 for (int i = 0; i < num_samples; ++i) {
22 sum_int64 += samples[i];
23 ssq_int64 += samples[i] * samples[i];
24 }
25 // now convert to floats.
26 double sum = static_cast<double>(sum_int64);
27 sum /= num_samples;
28 double ssq = static_cast<double>(ssq_int64);
29 return static_cast<float>(sqrt((ssq / num_samples) - (sum * sum)));
30 }
31
Secs2Usecs(float seconds)32 int64 Secs2Usecs(float seconds) {
33 return static_cast<int64>(0.5 + (1.0e6 * seconds));
34 }
35
// Converts a linear amplitude to decibels: 20 * log10(value).
// Inputs that are not greater than the tiny positive cutoff -- zero, negative
// values and NaN -- map to a floor of -2000 instead of -infinity or NaN.
float GetDecibel(float value) {
  // Written as !(a > b) rather than (a <= b) so that NaN takes this branch.
  if (!(value > 1.0e-100))
    return -2000.0;
  return 20 * log10(value);
}
41
42 } // namespace
43
44 namespace content {
45
// Stores threshold-crossing histories for making decisions about the speech
// state. Entries are kept in a fixed-size circular buffer: each insertion
// overwrites the slot at the insertion index, which then advances and wraps.
class EnergyEndpointer::HistoryRing {
 public:
  // Creates a ring that holds no entries until SetRing() is called.
  HistoryRing() : insertion_index_(0) {}

  // Resets the ring to |size| elements each with state |initial_state|
  // and restarts insertion at the first slot.
  void SetRing(int size, bool initial_state);

  // Inserts a new entry into the ring and drops the oldest entry.
  void Insert(int64 time_us, bool decision);

  // Returns the time in microseconds of the most recently added entry.
  int64 EndTime() const;

  // Returns the sum of all intervals during which the stored decision is true
  // within the time in seconds specified by |duration_sec|, measured back
  // from the most recent entry. The returned interval is in seconds.
  float RingSum(float duration_sec);

 private:
  // One history sample: the decision that was made at a given time.
  struct DecisionPoint {
    int64 time_us;  // Timestamp of the entry, in microseconds.
    bool decision;  // Decision value recorded for this entry.
  };

  // Backing storage of the ring; its size is set by SetRing().
  std::vector<DecisionPoint> decision_points_;
  int insertion_index_;  // Index at which the next item gets added/inserted.

  DISALLOW_COPY_AND_ASSIGN(HistoryRing);
};
77
SetRing(int size,bool initial_state)78 void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) {
79 insertion_index_ = 0;
80 decision_points_.clear();
81 DecisionPoint init = { -1, initial_state };
82 decision_points_.resize(size, init);
83 }
84
Insert(int64 time_us,bool decision)85 void EnergyEndpointer::HistoryRing::Insert(int64 time_us, bool decision) {
86 decision_points_[insertion_index_].time_us = time_us;
87 decision_points_[insertion_index_].decision = decision;
88 insertion_index_ = (insertion_index_ + 1) % decision_points_.size();
89 }
90
EndTime() const91 int64 EnergyEndpointer::HistoryRing::EndTime() const {
92 int ind = insertion_index_ - 1;
93 if (ind < 0)
94 ind = decision_points_.size() - 1;
95 return decision_points_[ind].time_us;
96 }
97
RingSum(float duration_sec)98 float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) {
99 if (!decision_points_.size())
100 return 0.0;
101
102 int64 sum_us = 0;
103 int ind = insertion_index_ - 1;
104 if (ind < 0)
105 ind = decision_points_.size() - 1;
106 int64 end_us = decision_points_[ind].time_us;
107 bool is_on = decision_points_[ind].decision;
108 int64 start_us = end_us - static_cast<int64>(0.5 + (1.0e6 * duration_sec));
109 if (start_us < 0)
110 start_us = 0;
111 size_t n_summed = 1; // n points ==> (n-1) intervals
112 while ((decision_points_[ind].time_us > start_us) &&
113 (n_summed < decision_points_.size())) {
114 --ind;
115 if (ind < 0)
116 ind = decision_points_.size() - 1;
117 if (is_on)
118 sum_us += end_us - decision_points_[ind].time_us;
119 is_on = decision_points_[ind].decision;
120 end_us = decision_points_[ind].time_us;
121 n_summed++;
122 }
123
124 return 1.0e-6f * sum_us; // Returns total time that was super threshold.
125 }
126
// Constructs an endpointer in the EP_PRE_SPEECH state with all levels,
// counters and timestamps zeroed and a history ring that has no entries yet.
// Init() installs the parameters and sizes the ring.
EnergyEndpointer::EnergyEndpointer()
    : status_(EP_PRE_SPEECH),
      offset_confirm_dur_sec_(0),
      endpointer_time_us_(0),
      fast_update_frames_(0),
      frame_counter_(0),
      max_window_dur_(4.0),  // Overwritten by Init() with the longest window.
      sample_rate_(0),
      history_(new HistoryRing()),
      decision_threshold_(0),
      estimating_environment_(false),
      noise_level_(0),
      rms_adapt_(0),
      start_lag_(0),
      end_lag_(0),
      user_input_start_time_us_(0) {
}
144
// Nothing is released explicitly here.
// NOTE(review): presumably |history_| is an owning smart pointer declared in
// the header, which would free the HistoryRing -- confirm against the header.
EnergyEndpointer::~EnergyEndpointer() {
}
147
TimeToFrame(float time) const148 int EnergyEndpointer::TimeToFrame(float time) const {
149 return static_cast<int32>(0.5 + (time / params_.frame_period()));
150 }
151
Restart(bool reset_threshold)152 void EnergyEndpointer::Restart(bool reset_threshold) {
153 status_ = EP_PRE_SPEECH;
154 user_input_start_time_us_ = 0;
155
156 if (reset_threshold) {
157 decision_threshold_ = params_.decision_threshold();
158 rms_adapt_ = decision_threshold_;
159 noise_level_ = params_.decision_threshold() / 2.0f;
160 frame_counter_ = 0; // Used for rapid initial update of levels.
161 }
162
163 // Set up the memories to hold the history windows.
164 history_->SetRing(TimeToFrame(max_window_dur_), false);
165
166 // Flag that indicates that current input should be used for
167 // estimating the environment. The user has not yet started input
168 // by e.g. pressed the push-to-talk button. By default, this is
169 // false for backward compatibility.
170 estimating_environment_ = false;
171 }
172
Init(const EnergyEndpointerParams & params)173 void EnergyEndpointer::Init(const EnergyEndpointerParams& params) {
174 params_ = params;
175
176 // Find the longest history interval to be used, and make the ring
177 // large enough to accommodate that number of frames. NOTE: This
178 // depends upon ep_frame_period being set correctly in the factory
179 // that did this instantiation.
180 max_window_dur_ = params_.onset_window();
181 if (params_.speech_on_window() > max_window_dur_)
182 max_window_dur_ = params_.speech_on_window();
183 if (params_.offset_window() > max_window_dur_)
184 max_window_dur_ = params_.offset_window();
185 Restart(true);
186
187 offset_confirm_dur_sec_ = params_.offset_window() -
188 params_.offset_confirm_dur();
189 if (offset_confirm_dur_sec_ < 0.0)
190 offset_confirm_dur_sec_ = 0.0;
191
192 user_input_start_time_us_ = 0;
193
194 // Flag that indicates that current input should be used for
195 // estimating the environment. The user has not yet started input
196 // by e.g. pressed the push-to-talk button. By default, this is
197 // false for backward compatibility.
198 estimating_environment_ = false;
199 // The initial value of the noise and speech levels is inconsequential.
200 // The level of the first frame will overwrite these values.
201 noise_level_ = params_.decision_threshold() / 2.0f;
202 fast_update_frames_ =
203 static_cast<int64>(params_.fast_update_dur() / params_.frame_period());
204
205 frame_counter_ = 0; // Used for rapid initial update of levels.
206
207 sample_rate_ = params_.sample_rate();
208 start_lag_ = static_cast<int>(sample_rate_ /
209 params_.max_fundamental_frequency());
210 end_lag_ = static_cast<int>(sample_rate_ /
211 params_.min_fundamental_frequency());
212 }
213
// Begins a new session by restarting the endpointer with its thresholds and
// level estimates reset.
void EnergyEndpointer::StartSession() {
  Restart(true);
}
217
// Ends the session by setting the status to EP_POST_SPEECH. In this file
// only Restart() moves the status out of that state.
void EnergyEndpointer::EndSession() {
  status_ = EP_POST_SPEECH;
}
221
// Restarts the endpointer with thresholds reset and then marks incoming
// audio as environment (pre-input) audio.
void EnergyEndpointer::SetEnvironmentEstimationMode() {
  Restart(true);
  // Must come after Restart(), which clears this flag.
  estimating_environment_ = true;
}
226
SetUserInputMode()227 void EnergyEndpointer::SetUserInputMode() {
228 estimating_environment_ = false;
229 user_input_start_time_us_ = endpointer_time_us_;
230 }
231
// Processes one frame of audio.
//
// |time_us| is recorded as the current endpointer time. |samples| and
// |num_samples| describe the frame whose level is measured with RMS().
// When |rms_out| is non-NULL it receives that level converted to decibels.
//
// Unless the endpointer is estimating the environment, the frame's
// threshold decision is appended to the history, the speech state machine is
// advanced and the decision threshold is adapted. The noise level estimate
// and the frame counter are updated for every frame.
void EnergyEndpointer::ProcessAudioFrame(int64 time_us,
                                         const int16* samples,
                                         int num_samples,
                                         float* rms_out) {
  endpointer_time_us_ = time_us;
  float rms = RMS(samples, num_samples);

  // Check that this is user input audio vs. pre-input adaptation audio.
  // Input audio starts when the user indicates start of input, by e.g.
  // pressing push-to-talk. Audio received prior to that is used to update
  // noise and speech level estimates.
  if (!estimating_environment_) {
    bool decision = false;
    // Frames inside the contamination rejection period that follows the
    // start of user input never count as above threshold.
    if ((endpointer_time_us_ - user_input_start_time_us_) <
        Secs2Usecs(params_.contamination_rejection_period())) {
      decision = false;
      DVLOG(1) << "decision: forced to false, time: " << endpointer_time_us_;
    } else {
      decision = (rms > decision_threshold_);
    }

    history_->Insert(endpointer_time_us_, decision);

    switch (status_) {
      case EP_PRE_SPEECH:
        // Enough above-threshold time in the onset window: possible onset.
        if (history_->RingSum(params_.onset_window()) >
            params_.onset_detect_dur()) {
          status_ = EP_POSSIBLE_ONSET;
        }
        break;

      case EP_POSSIBLE_ONSET: {
        float tsum = history_->RingSum(params_.onset_window());
        if (tsum > params_.onset_confirm_dur()) {
          status_ = EP_SPEECH_PRESENT;
        } else {  // If signal is not maintained, drop back to pre-speech.
          if (tsum <= params_.onset_detect_dur())
            status_ = EP_PRE_SPEECH;
        }
        break;
      }

      case EP_SPEECH_PRESENT: {
        // To induce hysteresis in the state residency, we allow a
        // smaller residency time in the on_ring, than was required to
        // enter the SPEECH_PRESENT state.
        float on_time = history_->RingSum(params_.speech_on_window());
        if (on_time < params_.on_maintain_dur())
          status_ = EP_POSSIBLE_OFFSET;
        break;
      }

      case EP_POSSIBLE_OFFSET:
        if (history_->RingSum(params_.offset_window()) <=
            offset_confirm_dur_sec_) {
          // Note that this offset time may be beyond the end
          // of the input buffer in a real-time system. It will be up
          // to the RecognizerSession to decide what to do.
          status_ = EP_PRE_SPEECH;  // Automatically reset for next utterance.
        } else {  // If speech picks up again we allow return to SPEECH_PRESENT.
          if (history_->RingSum(params_.speech_on_window()) >=
              params_.on_maintain_dur())
            status_ = EP_SPEECH_PRESENT;
        }
        break;

      default:
        // Any other status (e.g. EP_POST_SPEECH, set by EndSession()) is
        // left unchanged.
        LOG(WARNING) << "Invalid case in switch: " << status_;
        break;
    }

    // If this is a quiet, non-speech region, slowly adapt the detection
    // threshold to be about 6dB above the average RMS.
    if ((!decision) && (status_ == EP_PRE_SPEECH)) {
      decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms);
      rms_adapt_ = decision_threshold_;
    } else {
      // If this is in a speech region, adapt the decision threshold to
      // be about 10dB below the average RMS. If the noise level is high,
      // the threshold is pushed up.
      // Adaptation up to a higher level is 5 times faster than decay to
      // a lower level.
      if ((status_ == EP_SPEECH_PRESENT) && decision) {
        if (rms_adapt_ > rms) {
          rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms);
        } else {
          rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms);
        }
        float target_threshold = 0.3f * rms_adapt_ + noise_level_;
        decision_threshold_ = (.90f * decision_threshold_) +
                              (0.10f * target_threshold);
      }
    }

    // Set a floor
    if (decision_threshold_ < params_.min_decision_threshold())
      decision_threshold_ = params_.min_decision_threshold();
  }

  // Update speech and noise levels.
  UpdateLevels(rms);
  ++frame_counter_;

  // The level is reported in decibels; the caller may pass NULL to skip it.
  if (rms_out)
    *rms_out = GetDecibel(rms);
}
338
// Returns the current noise level estimate converted to decibels.
float EnergyEndpointer::GetNoiseLevelDb() const {
  return GetDecibel(noise_level_);
}
342
UpdateLevels(float rms)343 void EnergyEndpointer::UpdateLevels(float rms) {
344 // Update quickly initially. We assume this is noise and that
345 // speech is 6dB above the noise.
346 if (frame_counter_ < fast_update_frames_) {
347 // Alpha increases from 0 to (k-1)/k where k is the number of time
348 // steps in the initial adaptation period.
349 float alpha = static_cast<float>(frame_counter_) /
350 static_cast<float>(fast_update_frames_);
351 noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms);
352 DVLOG(1) << "FAST UPDATE, frame_counter_ " << frame_counter_
353 << ", fast_update_frames_ " << fast_update_frames_;
354 } else {
355 // Update Noise level. The noise level adapts quickly downward, but
356 // slowly upward. The noise_level_ parameter is not currently used
357 // for threshold adaptation. It is used for UI feedback.
358 if (noise_level_ < rms)
359 noise_level_ = (0.999f * noise_level_) + (0.001f * rms);
360 else
361 noise_level_ = (0.95f * noise_level_) + (0.05f * rms);
362 }
363 if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) {
364 decision_threshold_ = noise_level_ * 2; // 6dB above noise level.
365 // Set a floor
366 if (decision_threshold_ < params_.min_decision_threshold())
367 decision_threshold_ = params_.min_decision_threshold();
368 }
369 }
370
Status(int64 * status_time) const371 EpStatus EnergyEndpointer::Status(int64* status_time) const {
372 *status_time = history_->EndTime();
373 return status_;
374 }
375
376 } // namespace content
377