• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef CONTENT_BROWSER_SPEECH_GOOGLE_STREAMING_REMOTE_ENGINE_H_
6 #define CONTENT_BROWSER_SPEECH_GOOGLE_STREAMING_REMOTE_ENGINE_H_
7 
8 #include <string>
9 #include <vector>
10 
11 #include "base/basictypes.h"
12 #include "base/memory/ref_counted.h"
13 #include "base/memory/scoped_ptr.h"
14 #include "base/threading/non_thread_safe.h"
15 #include "content/browser/speech/audio_encoder.h"
16 #include "content/browser/speech/chunked_byte_buffer.h"
17 #include "content/browser/speech/speech_recognition_engine.h"
18 #include "content/common/content_export.h"
19 #include "content/public/common/speech_recognition_error.h"
20 #include "net/url_request/url_fetcher_delegate.h"
21 
22 namespace net {
23 class URLRequestContextGetter;
24 }
25 
26 namespace content {
27 
28 class AudioChunk;
29 struct SpeechRecognitionError;
30 struct SpeechRecognitionResult;
31 
32 // Implements a SpeechRecognitionEngine supporting continuous recognition by
33 // means of interaction with Google streaming speech recognition webservice.
34 // More in details, this class establishes two HTTP(S) connections with the
35 // webservice, for each session, herein called "upstream" and "downstream".
36 // Audio chunks are sent on the upstream by means of a chunked HTTP POST upload.
37 // Recognition results are retrieved in a full-duplex fashion (i.e. while
38 // pushing audio on the upstream) on the downstream by means of a chunked
39 // HTTP GET request. Pairing between the two stream is handled through a
40 // randomly generated key, unique for each request, which is passed in the
41 // &pair= arg to both stream request URLs.
42 // In the case of a regular session, the upstream is closed when the audio
43 // capture ends (notified through a |AudioChunksEnded| call) and the downstream
44 // waits for a corresponding server closure (eventually some late results can
45 // come after closing the upstream).
46 // Both stream are guaranteed to be closed when |EndRecognition| call is issued.
47 class CONTENT_EXPORT GoogleStreamingRemoteEngine
NON_EXPORTED_BASE(SpeechRecognitionEngine)48     : public NON_EXPORTED_BASE(SpeechRecognitionEngine),
49       public net::URLFetcherDelegate,
50       public NON_EXPORTED_BASE(base::NonThreadSafe) {
51  public:
52   // Duration of each audio packet.
53   static const int kAudioPacketIntervalMs;
54 
55   // IDs passed to URLFetcher::Create(). Used for testing.
56   static const int kUpstreamUrlFetcherIdForTesting;
57   static const int kDownstreamUrlFetcherIdForTesting;
58 
59   explicit GoogleStreamingRemoteEngine(net::URLRequestContextGetter* context);
60   virtual ~GoogleStreamingRemoteEngine();
61 
62   // SpeechRecognitionEngine methods.
63   virtual void SetConfig(const SpeechRecognitionEngineConfig& config) OVERRIDE;
64   virtual void StartRecognition() OVERRIDE;
65   virtual void EndRecognition() OVERRIDE;
66   virtual void TakeAudioChunk(const AudioChunk& data) OVERRIDE;
67   virtual void AudioChunksEnded() OVERRIDE;
68   virtual bool IsRecognitionPending() const OVERRIDE;
69   virtual int GetDesiredAudioChunkDurationMs() const OVERRIDE;
70 
71   // net::URLFetcherDelegate methods.
72   virtual void OnURLFetchComplete(const net::URLFetcher* source) OVERRIDE;
73   virtual void OnURLFetchDownloadProgress(const net::URLFetcher* source,
74                                           int64 current, int64 total) OVERRIDE;
75 
76  private:
77   // Response status codes from the speech recognition webservice.
78   static const int kWebserviceStatusNoError;
79   static const int kWebserviceStatusErrorNoMatch;
80 
81   // Data types for the internal Finite State Machine (FSM).
82   enum FSMState {
83     STATE_IDLE = 0,
84     STATE_BOTH_STREAMS_CONNECTED,
85     STATE_WAITING_DOWNSTREAM_RESULTS,
86     STATE_MAX_VALUE = STATE_WAITING_DOWNSTREAM_RESULTS
87   };
88 
89   enum FSMEvent {
90     EVENT_END_RECOGNITION = 0,
91     EVENT_START_RECOGNITION,
92     EVENT_AUDIO_CHUNK,
93     EVENT_AUDIO_CHUNKS_ENDED,
94     EVENT_UPSTREAM_ERROR,
95     EVENT_DOWNSTREAM_ERROR,
96     EVENT_DOWNSTREAM_RESPONSE,
97     EVENT_DOWNSTREAM_CLOSED,
98     EVENT_MAX_VALUE = EVENT_DOWNSTREAM_CLOSED
99   };
100 
101   struct FSMEventArgs {
102     explicit FSMEventArgs(FSMEvent event_value);
103     ~FSMEventArgs();
104 
105     FSMEvent event;
106 
107     // In case of EVENT_AUDIO_CHUNK, holds the chunk pushed by |TakeAudioChunk|.
108     scoped_refptr<const AudioChunk> audio_data;
109 
110     // In case of EVENT_DOWNSTREAM_RESPONSE, hold the current chunk bytes.
111     scoped_ptr<std::vector<uint8> > response;
112 
113    private:
114     DISALLOW_COPY_AND_ASSIGN(FSMEventArgs);
115   };
116 
117   // Invoked by both upstream and downstream URLFetcher callbacks to handle
118   // new chunk data, connection closed or errors notifications.
119   void DispatchHTTPResponse(const net::URLFetcher* source,
120                             bool end_of_response);
121 
122   // Entry point for pushing any new external event into the recognizer FSM.
123   void DispatchEvent(const FSMEventArgs& event_args);
124 
125   // Defines the behavior of the recognizer FSM, selecting the appropriate
126   // transition according to the current state and event.
127   FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& event_args);
128 
129   // The methods below handle transitions of the recognizer FSM.
130   FSMState ConnectBothStreams(const FSMEventArgs& event_args);
131   FSMState TransmitAudioUpstream(const FSMEventArgs& event_args);
132   FSMState ProcessDownstreamResponse(const FSMEventArgs& event_args);
133   FSMState RaiseNoMatchErrorIfGotNoResults(const FSMEventArgs& event_args);
134   FSMState CloseUpstreamAndWaitForResults(const FSMEventArgs& event_args);
135   FSMState CloseDownstream(const FSMEventArgs& event_args);
136   FSMState AbortSilently(const FSMEventArgs& event_args);
137   FSMState AbortWithError(const FSMEventArgs& event_args);
138   FSMState Abort(SpeechRecognitionErrorCode error);
139   FSMState DoNothing(const FSMEventArgs& event_args);
140   FSMState NotFeasible(const FSMEventArgs& event_args);
141 
142   std::string GetAcceptedLanguages() const;
143   std::string GenerateRequestKey() const;
144 
145   SpeechRecognitionEngineConfig config_;
146   scoped_ptr<net::URLFetcher> upstream_fetcher_;
147   scoped_ptr<net::URLFetcher> downstream_fetcher_;
148   scoped_refptr<net::URLRequestContextGetter> url_context_;
149   scoped_ptr<AudioEncoder> encoder_;
150   ChunkedByteBuffer chunked_byte_buffer_;
151   size_t previous_response_length_;
152   bool got_last_definitive_result_;
153   bool is_dispatching_event_;
154   FSMState state_;
155 
156   DISALLOW_COPY_AND_ASSIGN(GoogleStreamingRemoteEngine);
157 };
158 
159 }  // namespace content
160 
161 #endif  // CONTENT_BROWSER_SPEECH_GOOGLE_STREAMING_REMOTE_ENGINE_H_
162