• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "content/browser/speech/google_one_shot_remote_engine.h"
6 
7 #include <vector>
8 
9 #include "base/json/json_reader.h"
10 #include "base/strings/string_number_conversions.h"
11 #include "base/strings/string_util.h"
12 #include "base/values.h"
13 #include "content/browser/speech/audio_buffer.h"
14 #include "content/public/common/speech_recognition_error.h"
15 #include "content/public/common/speech_recognition_result.h"
16 #include "google_apis/google_api_keys.h"
17 #include "net/base/escape.h"
18 #include "net/base/load_flags.h"
19 #include "net/url_request/http_user_agent_settings.h"
20 #include "net/url_request/url_fetcher.h"
21 #include "net/url_request/url_request_context.h"
22 #include "net/url_request/url_request_context_getter.h"
23 #include "net/url_request/url_request_status.h"
24 
25 namespace content {
26 namespace {
27 
// Endpoint of the Google one-shot speech recognition web service; query
// parameters are appended to this prefix in StartRecognition().
const char* const kDefaultSpeechRecognitionUrl =
    "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&";
// JSON attribute names expected in the server's response body.
const char* const kStatusString = "status";
const char* const kHypothesesString = "hypotheses";
const char* const kUtteranceString = "utterance";
const char* const kConfidenceString = "confidence";
// Values of the JSON "status" field handled by ParseServerResponse().
const int kWebServiceStatusNoError = 0;
const int kWebServiceStatusNoSpeech = 4;
const int kWebServiceStatusNoMatch = 5;
// Captured audio is FLAC-encoded before upload.
const AudioEncoder::Codec kDefaultAudioCodec = AudioEncoder::CODEC_FLAC;
38 
ParseServerResponse(const std::string & response_body,SpeechRecognitionResult * result,SpeechRecognitionError * error)39 bool ParseServerResponse(const std::string& response_body,
40                          SpeechRecognitionResult* result,
41                          SpeechRecognitionError* error) {
42   if (response_body.empty()) {
43     LOG(WARNING) << "ParseServerResponse: Response was empty.";
44     return false;
45   }
46   DVLOG(1) << "ParseServerResponse: Parsing response " << response_body;
47 
48   // Parse the response, ignoring comments.
49   std::string error_msg;
50   scoped_ptr<base::Value> response_value(base::JSONReader::ReadAndReturnError(
51       response_body, base::JSON_PARSE_RFC, NULL, &error_msg));
52   if (response_value == NULL) {
53     LOG(WARNING) << "ParseServerResponse: JSONReader failed : " << error_msg;
54     return false;
55   }
56 
57   if (!response_value->IsType(base::Value::TYPE_DICTIONARY)) {
58     VLOG(1) << "ParseServerResponse: Unexpected response type "
59             << response_value->GetType();
60     return false;
61   }
62   const base::DictionaryValue* response_object =
63       static_cast<const base::DictionaryValue*>(response_value.get());
64 
65   // Get the status.
66   int status;
67   if (!response_object->GetInteger(kStatusString, &status)) {
68     VLOG(1) << "ParseServerResponse: " << kStatusString
69             << " is not a valid integer value.";
70     return false;
71   }
72 
73   // Process the status.
74   switch (status) {
75     case kWebServiceStatusNoError:
76       break;
77     case kWebServiceStatusNoSpeech:
78       error->code = SPEECH_RECOGNITION_ERROR_NO_SPEECH;
79       return false;
80     case kWebServiceStatusNoMatch:
81       error->code = SPEECH_RECOGNITION_ERROR_NO_MATCH;
82       return false;
83     default:
84       error->code = SPEECH_RECOGNITION_ERROR_NETWORK;
85       // Other status codes should not be returned by the server.
86       VLOG(1) << "ParseServerResponse: unexpected status code " << status;
87       return false;
88   }
89 
90   // Get the hypotheses.
91   const base::Value* hypotheses_value = NULL;
92   if (!response_object->Get(kHypothesesString, &hypotheses_value)) {
93     VLOG(1) << "ParseServerResponse: Missing hypotheses attribute.";
94     return false;
95   }
96 
97   DCHECK(hypotheses_value);
98   if (!hypotheses_value->IsType(base::Value::TYPE_LIST)) {
99     VLOG(1) << "ParseServerResponse: Unexpected hypotheses type "
100             << hypotheses_value->GetType();
101     return false;
102   }
103 
104   const base::ListValue* hypotheses_list =
105       static_cast<const base::ListValue*>(hypotheses_value);
106 
107   // For now we support only single shot recognition, so we are giving only a
108   // final result, consisting of one fragment (with one or more hypotheses).
109   size_t index = 0;
110   for (; index < hypotheses_list->GetSize(); ++index) {
111     const base::Value* hypothesis = NULL;
112     if (!hypotheses_list->Get(index, &hypothesis)) {
113       LOG(WARNING) << "ParseServerResponse: Unable to read hypothesis value.";
114       break;
115     }
116     DCHECK(hypothesis);
117     if (!hypothesis->IsType(base::Value::TYPE_DICTIONARY)) {
118       LOG(WARNING) << "ParseServerResponse: Unexpected value type "
119                    << hypothesis->GetType();
120       break;
121     }
122 
123     const base::DictionaryValue* hypothesis_value =
124         static_cast<const base::DictionaryValue*>(hypothesis);
125     base::string16 utterance;
126 
127     if (!hypothesis_value->GetString(kUtteranceString, &utterance)) {
128       LOG(WARNING) << "ParseServerResponse: Missing utterance value.";
129       break;
130     }
131 
132     // It is not an error if the 'confidence' field is missing.
133     double confidence = 0.0;
134     hypothesis_value->GetDouble(kConfidenceString, &confidence);
135     result->hypotheses.push_back(SpeechRecognitionHypothesis(utterance,
136                                                              confidence));
137   }
138 
139   if (index < hypotheses_list->GetSize()) {
140     result->hypotheses.clear();
141     return false;
142   }
143   return true;
144 }
145 
146 }  // namespace
147 
// Duration of each audio chunk requested from the recognizer; also used to
// size the trailing silence packet in AudioChunksEnded().
const int GoogleOneShotRemoteEngine::kAudioPacketIntervalMs = 100;
// Fetcher id passed to net::URLFetcher::Create() so tests can intercept the
// request; 0 in production.
int GoogleOneShotRemoteEngine::url_fetcher_id_for_tests = 0;
150 
// |context| supplies the URL request context for the recognition request.
// StartRecognition() tolerates a NULL context (it then skips the
// accept-language lookup).
GoogleOneShotRemoteEngine::GoogleOneShotRemoteEngine(
    net::URLRequestContextGetter* context)
    : url_context_(context) {
}
155 
~GoogleOneShotRemoteEngine()156 GoogleOneShotRemoteEngine::~GoogleOneShotRemoteEngine() {}
157 
// Stores the recognition parameters (language, grammars, sample rate, etc.)
// that StartRecognition() will use to build the request.
void GoogleOneShotRemoteEngine::SetConfig(
    const SpeechRecognitionEngineConfig& config) {
  config_ = config;
}
162 
// Builds the recognition request URL from config_ (language, grammar,
// hardware info, max hypotheses, profanity filter, API key), creates the
// audio encoder, and starts a chunked POST to the speech service. Audio is
// streamed in later via TakeAudioChunk()/AudioChunksEnded().
void GoogleOneShotRemoteEngine::StartRecognition() {
  DCHECK(delegate());
  // A previous recognition must have ended (url_fetcher_ reset) first.
  DCHECK(!url_fetcher_.get());
  std::string lang_param = config_.language;

  if (lang_param.empty() && url_context_.get()) {
    // If no language is provided then we use the first from the accepted
    // language list. If this list is empty then it defaults to "en-US".
    // Example of the contents of this list: "es,en-GB;q=0.8", ""
    net::URLRequestContext* request_context =
        url_context_->GetURLRequestContext();
    DCHECK(request_context);
    // TODO(pauljensen): GoogleOneShotRemoteEngine should be constructed with
    // a reference to the HttpUserAgentSettings rather than accessing the
    // accept language through the URLRequestContext.
    if (request_context->http_user_agent_settings()) {
      std::string accepted_language_list =
          request_context->http_user_agent_settings()->GetAcceptLanguage();
      // Keep only the first entry, dropping any ";q=" weight suffix.
      size_t separator = accepted_language_list.find_first_of(",;");
      lang_param = accepted_language_list.substr(0, separator);
    }
  }

  if (lang_param.empty())
    lang_param = "en-US";

  // Assemble the query parameters; each value is escaped for use in a URL.
  std::vector<std::string> parts;
  parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true));

  if (!config_.grammars.empty()) {
    // Only a single grammar is supported by this engine.
    DCHECK_EQ(config_.grammars.size(), 1U);
    parts.push_back("lm=" + net::EscapeQueryParamValue(config_.grammars[0].url,
                                                       true));
  }

  if (!config_.hardware_info.empty())
    parts.push_back("xhw=" + net::EscapeQueryParamValue(config_.hardware_info,
                                                        true));
  parts.push_back("maxresults=" + base::UintToString(config_.max_hypotheses));
  parts.push_back(config_.filter_profanities ? "pfilter=2" : "pfilter=0");

  std::string api_key = google_apis::GetAPIKey();
  parts.push_back("key=" + net::EscapeQueryParamValue(api_key, true));

  GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&'));

  // Encoder converts raw PCM chunks into the upload codec (FLAC).
  encoder_.reset(AudioEncoder::Create(kDefaultAudioCodec,
                                      config_.audio_sample_rate,
                                      config_.audio_num_bits_per_sample));
  DCHECK(encoder_.get());
  url_fetcher_.reset(net::URLFetcher::Create(url_fetcher_id_for_tests,
                                             url,
                                             net::URLFetcher::POST,
                                             this));
  // Chunked upload: audio is appended incrementally as it is captured.
  url_fetcher_->SetChunkedUpload(encoder_->mime_type());
  url_fetcher_->SetRequestContext(url_context_.get());
  url_fetcher_->SetReferrer(config_.origin_url);

  // The speech recognition API does not require user identification as part
  // of requests, so we don't send cookies or auth data for these requests to
  // prevent any accidental connection between users who are logged into the
  // domain for other services (e.g. bookmark sync) with the speech requests.
  url_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
                             net::LOAD_DO_NOT_SEND_COOKIES |
                             net::LOAD_DO_NOT_SEND_AUTH_DATA);
  url_fetcher_->Start();
}
230 
// Aborts any in-flight request; destroying the fetcher cancels it and no
// OnURLFetchComplete() callback will be delivered afterwards.
void GoogleOneShotRemoteEngine::EndRecognition() {
  url_fetcher_.reset();
}
234 
TakeAudioChunk(const AudioChunk & data)235 void GoogleOneShotRemoteEngine::TakeAudioChunk(const AudioChunk& data) {
236   DCHECK(url_fetcher_.get());
237   DCHECK(encoder_.get());
238   DCHECK_EQ(data.bytes_per_sample(), config_.audio_num_bits_per_sample / 8);
239   encoder_->Encode(data);
240   scoped_refptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());
241   url_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false);
242 }
243 
// Finalizes the upload: flushes the encoder (padding with one packet of
// silence so the final chunk is never empty) and sends the closing chunk.
void GoogleOneShotRemoteEngine::AudioChunksEnded() {
  DCHECK(url_fetcher_.get());
  DCHECK(encoder_.get());

  // UploadAudioChunk requires a non-empty final buffer. So we encode a packet
  // of silence in case encoder had no data already.
  // NOTE(review): &samples[0] assumes the vector is non-empty, i.e.
  // audio_sample_rate * kAudioPacketIntervalMs >= 1000 — confirm upstream
  // validation of the sample rate.
  std::vector<int16> samples(
      config_.audio_sample_rate * kAudioPacketIntervalMs / 1000);
  scoped_refptr<AudioChunk> dummy_chunk(
      new AudioChunk(reinterpret_cast<uint8*>(&samples[0]),
                     samples.size() * sizeof(int16),
                     encoder_->bits_per_sample() / 8));
  encoder_->Encode(*dummy_chunk.get());
  encoder_->Flush();
  scoped_refptr<AudioChunk> encoded_dummy_data(
      encoder_->GetEncodedDataAndClear());
  DCHECK(!encoded_dummy_data->IsEmpty());
  // Encoder is no longer needed once the stream is flushed.
  encoder_.reset();

  // |true| marks this as the last chunk of the chunked upload.
  url_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true);
}
265 
// URLFetcher callback: parses the server reply and reports either results or
// an error to the delegate. Exactly one delegate callback is invoked.
void GoogleOneShotRemoteEngine::OnURLFetchComplete(
    const net::URLFetcher* source) {
  DCHECK_EQ(url_fetcher_.get(), source);
  SpeechRecognitionResults results;
  results.push_back(SpeechRecognitionResult());
  SpeechRecognitionResult& result = results.back();
  SpeechRecognitionError error(SPEECH_RECOGNITION_ERROR_NETWORK);
  std::string data;

  // The default error code in case of parse errors is NETWORK_FAILURE, however
  // ParseServerResponse can change the error to a more appropriate one.
  // Note: short-circuit order matters — GetResponseAsString() fills |data|
  // which ParseServerResponse() then consumes.
  bool error_occurred = (!source->GetStatus().is_success() ||
                        source->GetResponseCode() != 200 ||
                        !source->GetResponseAsString(&data) ||
                        !ParseServerResponse(data, &result, &error));
  // Release the fetcher before invoking the delegate so that
  // IsRecognitionPending() is false from within the callback.
  url_fetcher_.reset();
  if (error_occurred) {
    DVLOG(1) << "GoogleOneShotRemoteEngine: Network Error " << error.code;
    delegate()->OnSpeechRecognitionEngineError(error);
  } else {
    DVLOG(1) << "GoogleOneShotRemoteEngine: Invoking delegate with result.";
    delegate()->OnSpeechRecognitionEngineResults(results);
  }
}
290 
// True while a request is in flight (between StartRecognition() and either
// OnURLFetchComplete() or EndRecognition()).
bool GoogleOneShotRemoteEngine::IsRecognitionPending() const {
  return url_fetcher_ != NULL;
}
294 
// Tells the recognizer how long each chunk passed to TakeAudioChunk()
// should be, in milliseconds.
int GoogleOneShotRemoteEngine::GetDesiredAudioChunkDurationMs() const {
  return kAudioPacketIntervalMs;
}
298 
299 }  // namespace content
300