• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
6 #define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
7 
8 #include "base/basictypes.h"
9 #include "content/browser/speech/endpointer/energy_endpointer.h"
10 #include "content/common/content_export.h"
11 
12 class EpStatus;
13 
14 namespace content {
15 
16 class AudioChunk;
17 
18 // A simple interface to the underlying energy-endpointer implementation, this
19 // class lets callers provide audio as being recorded and let them poll to find
20 // when the user has stopped speaking.
21 //
22 // There are two events that may trigger the end of speech:
23 //
24 // speechInputPossiblyComplete event:
25 //
26 // Signals that silence/noise has  been detected for a *short* amount of
27 // time after some speech has been detected. It can be used for low latency
28 // UI feedback. To disable it, set it to a large amount.
29 //
30 // speechInputComplete event:
31 //
32 // This event is intended to signal end of input and to stop recording.
33 // The amount of time to wait after speech is set by
34 // speech_input_complete_silence_length_ and optionally two other
35 // parameters (see below).
36 // This time can be held constant, or can change as more speech is detected.
37 // In the latter case, the time changes after a set amount of time from the
38 // *beginning* of speech.  This is motivated by the expectation that there
39 // will be two distinct types of inputs: short search queries and longer
40 // dictation style input.
41 //
42 // Three parameters are used to define the piecewise constant timeout function.
43 // The timeout length is speech_input_complete_silence_length until
44 // long_speech_length, when it changes to
45 // long_speech_input_complete_silence_length.
46 class CONTENT_EXPORT Endpointer {
47  public:
48   explicit Endpointer(int sample_rate);
49 
50   // Start the endpointer. This should be called at the beginning of a session.
51   void StartSession();
52 
53   // Stop the endpointer.
54   void EndSession();
55 
56   // Start environment estimation. Audio will be used for environment estimation
57   // i.e. noise level estimation.
58   void SetEnvironmentEstimationMode();
59 
60   // Start user input. This should be called when the user indicates start of
61   // input, e.g. by pressing a button.
62   void SetUserInputMode();
63 
64   // Process a segment of audio, which may be more than one frame.
65   // The status of the last frame will be returned.
66   EpStatus ProcessAudio(const AudioChunk& raw_audio, float* rms_out);
67 
68   // Get the status of the endpointer.
69   EpStatus Status(int64 *time_us);
70 
71   // Returns true if the endpointer detected reasonable audio levels above
72   // background noise which could be user speech, false if not.
DidStartReceivingSpeech()73   bool DidStartReceivingSpeech() const {
74     return speech_previously_detected_;
75   }
76 
IsEstimatingEnvironment()77   bool IsEstimatingEnvironment() const {
78     return energy_endpointer_.estimating_environment();
79   }
80 
set_speech_input_complete_silence_length(int64 time_us)81   void set_speech_input_complete_silence_length(int64 time_us) {
82     speech_input_complete_silence_length_us_ = time_us;
83   }
84 
set_long_speech_input_complete_silence_length(int64 time_us)85   void set_long_speech_input_complete_silence_length(int64 time_us) {
86     long_speech_input_complete_silence_length_us_ = time_us;
87   }
88 
set_speech_input_possibly_complete_silence_length(int64 time_us)89   void set_speech_input_possibly_complete_silence_length(int64 time_us) {
90     speech_input_possibly_complete_silence_length_us_ = time_us;
91   }
92 
set_long_speech_length(int64 time_us)93   void set_long_speech_length(int64 time_us) {
94     long_speech_length_us_ = time_us;
95   }
96 
speech_input_complete()97   bool speech_input_complete() const {
98     return speech_input_complete_;
99   }
100 
101   // RMS background noise level in dB.
NoiseLevelDb()102   float NoiseLevelDb() const { return energy_endpointer_.GetNoiseLevelDb(); }
103 
104  private:
105   // Reset internal states. Helper method common to initial input utterance
106   // and following input utternaces.
107   void Reset();
108 
109   // Minimum allowable length of speech input.
110   int64 speech_input_minimum_length_us_;
111 
112   // The speechInputPossiblyComplete event signals that silence/noise has been
113   // detected for a *short* amount of time after some speech has been detected.
114   // This proporty specifies the time period.
115   int64 speech_input_possibly_complete_silence_length_us_;
116 
117   // The speechInputComplete event signals that silence/noise has been
118   // detected for a *long* amount of time after some speech has been detected.
119   // This property specifies the time period.
120   int64 speech_input_complete_silence_length_us_;
121 
122   // Same as above, this specifies the required silence period after speech
123   // detection. This period is used instead of
124   // speech_input_complete_silence_length_ when the utterance is longer than
125   // long_speech_length_. This parameter is optional.
126   int64 long_speech_input_complete_silence_length_us_;
127 
128   // The period of time after which the endpointer should consider
129   // long_speech_input_complete_silence_length_ as a valid silence period
130   // instead of speech_input_complete_silence_length_. This parameter is
131   // optional.
132   int64 long_speech_length_us_;
133 
134   // First speech onset time, used in determination of speech complete timeout.
135   int64 speech_start_time_us_;
136 
137   // Most recent end time, used in determination of speech complete timeout.
138   int64 speech_end_time_us_;
139 
140   int64 audio_frame_time_us_;
141   EpStatus old_ep_status_;
142   bool waiting_for_speech_possibly_complete_timeout_;
143   bool waiting_for_speech_complete_timeout_;
144   bool speech_previously_detected_;
145   bool speech_input_complete_;
146   EnergyEndpointer energy_endpointer_;
147   int sample_rate_;
148   int32 frame_size_;
149 };
150 
151 }  // namespace content
152 
153 #endif  // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
154