1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/browser/search_engines/template_url_parser.h"
6
7 #include <algorithm>
8 #include <map>
9 #include <vector>
10
11 #include "base/logging.h"
12 #include "base/memory/scoped_ptr.h"
13 #include "base/strings/string_number_conversions.h"
14 #include "base/strings/string_util.h"
15 #include "base/strings/utf_string_conversions.h"
16 #include "chrome/browser/search_engines/template_url.h"
17 #include "chrome/browser/search_engines/ui_thread_search_terms_data.h"
18 #include "libxml/parser.h"
19 #include "libxml/xmlwriter.h"
20 #include "ui/gfx/favicon_size.h"
21 #include "url/gurl.h"
22 #include "url/url_constants.h"
23
24 namespace {
25
// NOTE: libxml uses the UTF-8 encoding. Since code points 0-127 are encoded in
// UTF-8 as the identical single char, the following names are all plain char
// strings. This avoids having to convert to wide strings before comparing.
29
// Element names that may appear in an OSD document.
const char kOpenSearchDescriptionElement[] = "OpenSearchDescription";
const char kFirefoxSearchDescriptionElement[] = "SearchPlugin";
const char kShortNameElement[] = "ShortName";
const char kInputEncodingElement[] = "InputEncoding";
const char kImageElement[] = "Image";
const char kURLElement[] = "Url";
const char kParamElement[] = "Param";

// Attribute names read from the Url element. (The "method" attribute keeps its
// historical kParam* name, but it is looked up on Url elements.)
const char kURLTypeAttribute[] = "type";
const char kURLTemplateAttribute[] = "template";
const char kParamMethodAttribute[] = "method";

// Attribute names read from the Image element.
const char kImageTypeAttribute[] = "type";
const char kImageWidthAttribute[] = "width";
const char kImageHeightAttribute[] = "height";

// Attribute names read from the Param element.
const char kParamNameAttribute[] = "name";
const char kParamValueAttribute[] = "value";

// Mime type of a Url element that describes the search results page.
const char kHTMLType[] = "text/html";

// Mime type of a Url element that describes as-you-type suggestions.
const char kSuggestionType[] = "application/x-suggestions+json";
54
// Copies a NUL-terminated libxml string into a std::string. The bytes are
// reinterpreted as char as-is; no transcoding is performed. A NULL |value|
// yields an empty string, because constructing a std::string from a NULL
// pointer is undefined behavior.
std::string XMLCharToString(const xmlChar* value) {
  if (!value)
    return std::string();
  return std::string(reinterpret_cast<const char*>(value));
}
58
59 // Returns true if input_encoding contains a valid input encoding string. This
60 // doesn't verify that we have a valid encoding for the string, just that the
61 // string contains characters that constitute a valid input encoding.
IsValidEncodingString(const std::string & input_encoding)62 bool IsValidEncodingString(const std::string& input_encoding) {
63 if (input_encoding.empty())
64 return false;
65
66 if (!IsAsciiAlpha(input_encoding[0]))
67 return false;
68
69 for (size_t i = 1, max = input_encoding.size(); i < max; ++i) {
70 char c = input_encoding[i];
71 if (!IsAsciiAlpha(c) && !IsAsciiDigit(c) && c != '.' && c != '_' &&
72 c != '-') {
73 return false;
74 }
75 }
76 return true;
77 }
78
// Appends one parameter to the query string in |*query|. A '&' separator is
// inserted first when |*query| already has content. A non-empty |key| is
// written as "key=" ahead of |value|; an empty |key| contributes |value| alone.
// Neither |key| nor |value| is escaped here.
void AppendParamToQuery(const std::string& key,
                        const std::string& value,
                        std::string* query) {
  if (!query->empty())
    *query += '&';
  if (!key.empty())
    *query += key + '=';
  *query += value;
}
90
91 // Returns true if |url| is empty or is a valid URL with a scheme of HTTP[S].
IsHTTPRef(const std::string & url)92 bool IsHTTPRef(const std::string& url) {
93 if (url.empty())
94 return true;
95 GURL gurl(url);
96 return gurl.is_valid() && (gurl.SchemeIs(url::kHttpScheme) ||
97 gurl.SchemeIs(url::kHttpsScheme));
98 }
99
100 } // namespace
101
102
103 // TemplateURLParsingContext --------------------------------------------------
104
105 // To minimize memory overhead while parsing, a SAX style parser is used.
106 // TemplateURLParsingContext is used to maintain the state we're in the document
107 // while parsing.
108 class TemplateURLParsingContext {
109 public:
110 // Enum of the known element types.
111 enum ElementType {
112 UNKNOWN,
113 OPEN_SEARCH_DESCRIPTION,
114 URL,
115 PARAM,
116 SHORT_NAME,
117 IMAGE,
118 INPUT_ENCODING,
119 };
120
121 enum Method {
122 GET,
123 POST
124 };
125
126 // Key/value of a Param node.
127 typedef std::pair<std::string, std::string> Param;
128
129 explicit TemplateURLParsingContext(
130 TemplateURLParser::ParameterFilter* parameter_filter);
131
132 static void StartElementImpl(void* ctx,
133 const xmlChar* name,
134 const xmlChar** atts);
135 static void EndElementImpl(void* ctx, const xmlChar* name);
136 static void CharactersImpl(void* ctx, const xmlChar* ch, int len);
137
138 // Returns a heap-allocated TemplateURL representing the result of parsing.
139 // This will be NULL if parsing failed or if the results were invalid for some
140 // reason (e.g. the resulting URL was not HTTP[S], a name wasn't supplied,
141 // a resulting TemplateURLRef was invalid, etc.).
142 TemplateURL* GetTemplateURL(Profile* profile, bool show_in_default_list);
143
144 private:
145 // Key is UTF8 encoded.
146 typedef std::map<std::string, ElementType> ElementNameToElementTypeMap;
147
148 static void InitMapping();
149
150 void ParseURL(const xmlChar** atts);
151 void ParseImage(const xmlChar** atts);
152 void ParseParam(const xmlChar** atts);
153 void ProcessURLParams();
154
155 // Returns the current ElementType.
156 ElementType GetKnownType();
157
158 static ElementNameToElementTypeMap* kElementNameToElementTypeMap;
159
160 // Data that gets updated as we parse, and is converted to a TemplateURL by
161 // GetTemplateURL().
162 TemplateURLData data_;
163
164 std::vector<ElementType> elements_;
165 bool image_is_valid_for_favicon_;
166
167 // Character content for the current element.
168 base::string16 string_;
169
170 TemplateURLParser::ParameterFilter* parameter_filter_;
171
172 // The list of parameters parsed in the Param nodes of a Url node.
173 std::vector<Param> extra_params_;
174
175 // The HTTP methods used.
176 Method method_;
177 Method suggestion_method_;
178
179 // If true, we are currently parsing a suggest URL, otherwise it is an HTML
180 // search. Note that we don't need a stack as URL nodes cannot be nested.
181 bool is_suggest_url_;
182
183 // Whether we should derive the image from the URL (when images are data
184 // URLs).
185 bool derive_image_from_url_;
186
187 DISALLOW_COPY_AND_ASSIGN(TemplateURLParsingContext);
188 };
189
190 // static
191 TemplateURLParsingContext::ElementNameToElementTypeMap*
192 TemplateURLParsingContext::kElementNameToElementTypeMap = NULL;
193
TemplateURLParsingContext(TemplateURLParser::ParameterFilter * parameter_filter)194 TemplateURLParsingContext::TemplateURLParsingContext(
195 TemplateURLParser::ParameterFilter* parameter_filter)
196 : image_is_valid_for_favicon_(false),
197 parameter_filter_(parameter_filter),
198 method_(GET),
199 suggestion_method_(GET),
200 is_suggest_url_(false),
201 derive_image_from_url_(false) {
202 if (kElementNameToElementTypeMap == NULL)
203 InitMapping();
204 }
205
206 // static
StartElementImpl(void * ctx,const xmlChar * name,const xmlChar ** atts)207 void TemplateURLParsingContext::StartElementImpl(void* ctx,
208 const xmlChar* name,
209 const xmlChar** atts) {
210 // Remove the namespace from |name|, ex: os:Url -> Url.
211 std::string node_name(XMLCharToString(name));
212 size_t index = node_name.find_first_of(":");
213 if (index != std::string::npos)
214 node_name.erase(0, index + 1);
215
216 TemplateURLParsingContext* context =
217 reinterpret_cast<TemplateURLParsingContext*>(ctx);
218 context->elements_.push_back(
219 context->kElementNameToElementTypeMap->count(node_name) ?
220 (*context->kElementNameToElementTypeMap)[node_name] : UNKNOWN);
221 switch (context->GetKnownType()) {
222 case TemplateURLParsingContext::URL:
223 context->extra_params_.clear();
224 context->ParseURL(atts);
225 break;
226 case TemplateURLParsingContext::IMAGE:
227 context->ParseImage(atts);
228 break;
229 case TemplateURLParsingContext::PARAM:
230 context->ParseParam(atts);
231 break;
232 default:
233 break;
234 }
235 context->string_.clear();
236 }
237
238 // static
EndElementImpl(void * ctx,const xmlChar * name)239 void TemplateURLParsingContext::EndElementImpl(void* ctx, const xmlChar* name) {
240 TemplateURLParsingContext* context =
241 reinterpret_cast<TemplateURLParsingContext*>(ctx);
242 switch (context->GetKnownType()) {
243 case TemplateURLParsingContext::SHORT_NAME:
244 context->data_.short_name = context->string_;
245 break;
246 case TemplateURLParsingContext::IMAGE: {
247 GURL image_url(base::UTF16ToUTF8(context->string_));
248 if (image_url.SchemeIs(url::kDataScheme)) {
249 // TODO (jcampan): bug 1169256: when dealing with data URL, we need to
250 // decode the data URL in the renderer. For now, we'll just point to the
251 // favicon from the URL.
252 context->derive_image_from_url_ = true;
253 } else if (context->image_is_valid_for_favicon_ && image_url.is_valid() &&
254 (image_url.SchemeIs(url::kHttpScheme) ||
255 image_url.SchemeIs(url::kHttpsScheme))) {
256 context->data_.favicon_url = image_url;
257 }
258 context->image_is_valid_for_favicon_ = false;
259 break;
260 }
261 case TemplateURLParsingContext::INPUT_ENCODING: {
262 std::string input_encoding = base::UTF16ToASCII(context->string_);
263 if (IsValidEncodingString(input_encoding))
264 context->data_.input_encodings.push_back(input_encoding);
265 break;
266 }
267 case TemplateURLParsingContext::URL:
268 context->ProcessURLParams();
269 break;
270 default:
271 break;
272 }
273 context->string_.clear();
274 context->elements_.pop_back();
275 }
276
277 // static
CharactersImpl(void * ctx,const xmlChar * ch,int len)278 void TemplateURLParsingContext::CharactersImpl(void* ctx,
279 const xmlChar* ch,
280 int len) {
281 reinterpret_cast<TemplateURLParsingContext*>(ctx)->string_ +=
282 base::UTF8ToUTF16(std::string(reinterpret_cast<const char*>(ch), len));
283 }
284
GetTemplateURL(Profile * profile,bool show_in_default_list)285 TemplateURL* TemplateURLParsingContext::GetTemplateURL(
286 Profile* profile,
287 bool show_in_default_list) {
288 // TODO(jcampan): Support engines that use POST; see http://crbug.com/18107
289 if (method_ == TemplateURLParsingContext::POST || data_.short_name.empty() ||
290 !IsHTTPRef(data_.url()) || !IsHTTPRef(data_.suggestions_url))
291 return NULL;
292 if (suggestion_method_ == TemplateURLParsingContext::POST)
293 data_.suggestions_url.clear();
294
295 // If the image was a data URL, use the favicon from the search URL instead.
296 // (see the TODO in EndElementImpl()).
297 GURL search_url(data_.url());
298 if (derive_image_from_url_ && data_.favicon_url.is_empty())
299 data_.favicon_url = TemplateURL::GenerateFaviconURL(search_url);
300
301 data_.SetKeyword(TemplateURL::GenerateKeyword(search_url));
302 data_.show_in_default_list = show_in_default_list;
303
304 // Bail if the search URL is empty or if either TemplateURLRef is invalid.
305 scoped_ptr<TemplateURL> template_url(new TemplateURL(data_));
306 scoped_ptr<SearchTermsData> search_terms_data(profile ?
307 new UIThreadSearchTermsData(profile) : new SearchTermsData());
308 if (template_url->url().empty() ||
309 !template_url->url_ref().IsValid(*search_terms_data) ||
310 (!template_url->suggestions_url().empty() &&
311 !template_url->suggestions_url_ref().IsValid(*search_terms_data))) {
312 return NULL;
313 }
314
315 return template_url.release();
316 }
317
318 // static
InitMapping()319 void TemplateURLParsingContext::InitMapping() {
320 kElementNameToElementTypeMap = new std::map<std::string, ElementType>;
321 (*kElementNameToElementTypeMap)[kURLElement] = URL;
322 (*kElementNameToElementTypeMap)[kParamElement] = PARAM;
323 (*kElementNameToElementTypeMap)[kShortNameElement] = SHORT_NAME;
324 (*kElementNameToElementTypeMap)[kImageElement] = IMAGE;
325 (*kElementNameToElementTypeMap)[kOpenSearchDescriptionElement] =
326 OPEN_SEARCH_DESCRIPTION;
327 (*kElementNameToElementTypeMap)[kFirefoxSearchDescriptionElement] =
328 OPEN_SEARCH_DESCRIPTION;
329 (*kElementNameToElementTypeMap)[kInputEncodingElement] = INPUT_ENCODING;
330 }
331
ParseURL(const xmlChar ** atts)332 void TemplateURLParsingContext::ParseURL(const xmlChar** atts) {
333 if (!atts)
334 return;
335
336 std::string template_url;
337 bool is_post = false;
338 bool is_html_url = false;
339 bool is_suggest_url = false;
340 for (; *atts; atts += 2) {
341 std::string name(XMLCharToString(*atts));
342 const xmlChar* value = atts[1];
343 if (name == kURLTypeAttribute) {
344 std::string type = XMLCharToString(value);
345 is_html_url = (type == kHTMLType);
346 is_suggest_url = (type == kSuggestionType);
347 } else if (name == kURLTemplateAttribute) {
348 template_url = XMLCharToString(value);
349 } else if (name == kParamMethodAttribute) {
350 is_post = LowerCaseEqualsASCII(XMLCharToString(value), "post");
351 }
352 }
353
354 if (is_html_url && !template_url.empty()) {
355 data_.SetURL(template_url);
356 is_suggest_url_ = false;
357 if (is_post)
358 method_ = POST;
359 } else if (is_suggest_url) {
360 data_.suggestions_url = template_url;
361 is_suggest_url_ = true;
362 if (is_post)
363 suggestion_method_ = POST;
364 }
365 }
366
ParseImage(const xmlChar ** atts)367 void TemplateURLParsingContext::ParseImage(const xmlChar** atts) {
368 if (!atts)
369 return;
370
371 int width = 0;
372 int height = 0;
373 std::string type;
374 for (; *atts; atts += 2) {
375 std::string name(XMLCharToString(*atts));
376 const xmlChar* value = atts[1];
377 if (name == kImageTypeAttribute) {
378 type = XMLCharToString(value);
379 } else if (name == kImageWidthAttribute) {
380 base::StringToInt(XMLCharToString(value), &width);
381 } else if (name == kImageHeightAttribute) {
382 base::StringToInt(XMLCharToString(value), &height);
383 }
384 }
385
386 image_is_valid_for_favicon_ = (width == gfx::kFaviconSize) &&
387 (height == gfx::kFaviconSize) &&
388 ((type == "image/x-icon") || (type == "image/vnd.microsoft.icon"));
389 }
390
ParseParam(const xmlChar ** atts)391 void TemplateURLParsingContext::ParseParam(const xmlChar** atts) {
392 if (!atts)
393 return;
394
395 std::string key, value;
396 for (; *atts; atts += 2) {
397 std::string name(XMLCharToString(*atts));
398 const xmlChar* val = atts[1];
399 if (name == kParamNameAttribute) {
400 key = XMLCharToString(val);
401 } else if (name == kParamValueAttribute) {
402 value = XMLCharToString(val);
403 }
404 }
405
406 if (!key.empty() &&
407 (!parameter_filter_ || parameter_filter_->KeepParameter(key, value)))
408 extra_params_.push_back(Param(key, value));
409 }
410
ProcessURLParams()411 void TemplateURLParsingContext::ProcessURLParams() {
412 if (!parameter_filter_ && extra_params_.empty())
413 return;
414
415 GURL url(is_suggest_url_ ? data_.suggestions_url : data_.url());
416 if (url.is_empty())
417 return;
418
419 // If there is a parameter filter, parse the existing URL and remove any
420 // unwanted parameter.
421 std::string new_query;
422 bool modified = false;
423 if (parameter_filter_) {
424 url::Component query = url.parsed_for_possibly_invalid_spec().query;
425 url::Component key, value;
426 const char* url_spec = url.spec().c_str();
427 while (url::ExtractQueryKeyValue(url_spec, &query, &key, &value)) {
428 std::string key_str(url_spec, key.begin, key.len);
429 std::string value_str(url_spec, value.begin, value.len);
430 if (parameter_filter_->KeepParameter(key_str, value_str)) {
431 AppendParamToQuery(key_str, value_str, &new_query);
432 } else {
433 modified = true;
434 }
435 }
436 }
437 if (!modified)
438 new_query = url.query();
439
440 // Add the extra parameters if any.
441 if (!extra_params_.empty()) {
442 modified = true;
443 for (std::vector<Param>::const_iterator iter(extra_params_.begin());
444 iter != extra_params_.end(); ++iter)
445 AppendParamToQuery(iter->first, iter->second, &new_query);
446 }
447
448 if (modified) {
449 GURL::Replacements repl;
450 repl.SetQueryStr(new_query);
451 url = url.ReplaceComponents(repl);
452 if (is_suggest_url_)
453 data_.suggestions_url = url.spec();
454 else if (url.is_valid())
455 data_.SetURL(url.spec());
456 }
457 }
458
459 TemplateURLParsingContext::ElementType
GetKnownType()460 TemplateURLParsingContext::GetKnownType() {
461 if (elements_.size() == 2 && elements_[0] == OPEN_SEARCH_DESCRIPTION)
462 return elements_[1];
463 // We only expect PARAM nodes under the URL node.
464 return (elements_.size() == 3 && elements_[0] == OPEN_SEARCH_DESCRIPTION &&
465 elements_[1] == URL && elements_[2] == PARAM) ? PARAM : UNKNOWN;
466 }
467
468
469 // TemplateURLParser ----------------------------------------------------------
470
471 // static
Parse(Profile * profile,bool show_in_default_list,const char * data,size_t length,TemplateURLParser::ParameterFilter * param_filter)472 TemplateURL* TemplateURLParser::Parse(
473 Profile* profile,
474 bool show_in_default_list,
475 const char* data,
476 size_t length,
477 TemplateURLParser::ParameterFilter* param_filter) {
478 // xmlSubstituteEntitiesDefault(1) makes it so that & isn't mapped to
479 // & . Unfortunately xmlSubstituteEntitiesDefault affects global state.
480 // If this becomes problematic we'll need to provide our own entity
481 // type for &, or strip out & by hand after parsing.
482 int last_sub_entities_value = xmlSubstituteEntitiesDefault(1);
483 TemplateURLParsingContext context(param_filter);
484 xmlSAXHandler sax_handler;
485 memset(&sax_handler, 0, sizeof(sax_handler));
486 sax_handler.startElement = &TemplateURLParsingContext::StartElementImpl;
487 sax_handler.endElement = &TemplateURLParsingContext::EndElementImpl;
488 sax_handler.characters = &TemplateURLParsingContext::CharactersImpl;
489 int error = xmlSAXUserParseMemory(&sax_handler, &context, data,
490 static_cast<int>(length));
491 xmlSubstituteEntitiesDefault(last_sub_entities_value);
492
493 return error ? NULL : context.GetTemplateURL(profile, show_in_default_list);
494 }
495