1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/browser/search_engines/template_url_parser.h"
6
7 #include <algorithm>
8 #include <map>
9 #include <vector>
10
11 #include "base/logging.h"
12 #include "base/memory/scoped_ptr.h"
13 #include "base/string_number_conversions.h"
14 #include "base/string_util.h"
15 #include "base/utf_string_conversions.h"
16 #include "chrome/browser/search_engines/template_url.h"
17 #include "chrome/common/url_constants.h"
18 #include "googleurl/src/gurl.h"
19 #include "libxml/parser.h"
20 #include "libxml/xmlwriter.h"
21
22 namespace {
23
// NOTE: libxml hands us UTF-8. Code points 0-127 of UTF-8 are identical to
// plain char, so every name below is a char string; this avoids converting to
// wide strings just to compare them.

// Element names of the OSD document.
const char kURLElement[] = "Url";
const char kParamElement[] = "Param";
const char kShortNameElement[] = "ShortName";
const char kDescriptionElement[] = "Description";
const char kImageElement[] = "Image";
const char kOpenSearchDescriptionElement[] = "OpenSearchDescription";
const char kFirefoxSearchDescriptionElement[] = "SearchPlugin";
const char kLanguageElement[] = "Language";
const char kInputEncodingElement[] = "InputEncoding";

// XML attribute names.
const char kURLTypeAttribute[] = "type";
const char kURLTemplateAttribute[] = "template";
const char kImageTypeAttribute[] = "type";
const char kImageWidthAttribute[] = "width";
const char kImageHeightAttribute[] = "height";
const char kURLIndexOffsetAttribute[] = "indexOffset";
const char kURLPageOffsetAttribute[] = "pageOffset";
const char kParamNameAttribute[] = "name";
const char kParamValueAttribute[] = "value";
const char kParamMethodAttribute[] = "method";

// Mime type of a search results URL.
const char kHTMLType[] = "text/html";

// Mime type of an as-you-type suggestions URL.
const char kSuggestionType[] = "application/x-suggestions+json";

// Namespace identifier.
const char kOSDNS[] = "xmlns";

// The namespace for documents we understand.
const char kNameSpace[] = "http://a9.com/-/spec/opensearch/1.1/";
63
// Strips a namespace prefix from |name| in place, e.g. "os:Url" becomes "Url".
// Everything up to and including the first ':' is dropped; a name without a
// ':' is left untouched.
static void PruneNamespace(std::string* name) {
  const size_t colon = name->find(':');
  if (colon == std::string::npos)
    return;
  name->erase(0, colon + 1);
}
70
71 //
72 // To minimize memory overhead while parsing, a SAX style parser is used.
73 // ParsingContext is used to maintain the state we're in the document
74 // while parsing.
75 class ParsingContext {
76 public:
77 // Enum of the known element types.
78 enum ElementType {
79 UNKNOWN,
80 OPEN_SEARCH_DESCRIPTION,
81 URL,
82 PARAM,
83 SHORT_NAME,
84 DESCRIPTION,
85 IMAGE,
86 LANGUAGE,
87 INPUT_ENCODING,
88 };
89
90 enum Method {
91 GET,
92 POST
93 };
94
95 // Key/value of a Param node.
96 typedef std::pair<std::string, std::string> Param;
97
ParsingContext(TemplateURLParser::ParameterFilter * parameter_filter,TemplateURL * url)98 ParsingContext(TemplateURLParser::ParameterFilter* parameter_filter,
99 TemplateURL* url)
100 : url_(url),
101 parameter_filter_(parameter_filter),
102 method_(GET),
103 suggestion_method_(GET),
104 is_suggest_url_(false),
105 derive_image_from_url_(false) {
106 if (kElementNameToElementTypeMap == NULL)
107 InitMapping();
108 }
109
110 // Invoked when an element starts.
PushElement(const std::string & element)111 void PushElement(const std::string& element) {
112 ElementType type;
113 if (kElementNameToElementTypeMap->find(element) ==
114 kElementNameToElementTypeMap->end()) {
115 type = UNKNOWN;
116 } else {
117 type = (*kElementNameToElementTypeMap)[element];
118 }
119 elements_.push_back(type);
120 }
121
PopElement()122 void PopElement() {
123 elements_.pop_back();
124 }
125
126 // Returns the current ElementType.
GetKnownType()127 ElementType GetKnownType() {
128 if (elements_.size() == 2 && elements_[0] == OPEN_SEARCH_DESCRIPTION)
129 return elements_[1];
130
131 // We only expect PARAM nodes under the Url node
132 if (elements_.size() == 3 && elements_[0] == OPEN_SEARCH_DESCRIPTION &&
133 elements_[1] == URL && elements_[2] == PARAM)
134 return PARAM;
135
136 return UNKNOWN;
137 }
138
template_url()139 TemplateURL* template_url() { return url_; }
140
AddImageRef(const std::string & type,int width,int height)141 void AddImageRef(const std::string& type, int width, int height) {
142 if (width > 0 && height > 0)
143 current_image_.reset(new TemplateURL::ImageRef(type, width, height));
144 }
145
EndImage()146 void EndImage() {
147 current_image_.reset();
148 }
149
SetImageURL(const GURL & url)150 void SetImageURL(const GURL& url) {
151 if (current_image_.get()) {
152 current_image_->url = url;
153 url_->add_image_ref(*current_image_);
154 current_image_.reset();
155 }
156 }
157
ResetString()158 void ResetString() {
159 string_.clear();
160 }
161
AppendString(const string16 & string)162 void AppendString(const string16& string) {
163 string_ += string;
164 }
165
GetString()166 const string16& GetString() {
167 return string_;
168 }
169
ResetExtraParams()170 void ResetExtraParams() {
171 extra_params_.clear();
172 }
173
AddExtraParams(const std::string & key,const std::string & value)174 void AddExtraParams(const std::string& key, const std::string& value) {
175 if (parameter_filter_ && !parameter_filter_->KeepParameter(key, value))
176 return;
177 extra_params_.push_back(Param(key, value));
178 }
179
extra_params() const180 const std::vector<Param>& extra_params() const { return extra_params_; }
181
set_is_suggestion(bool value)182 void set_is_suggestion(bool value) { is_suggest_url_ = value; }
is_suggestion() const183 bool is_suggestion() const { return is_suggest_url_; }
184
parameter_filter() const185 TemplateURLParser::ParameterFilter* parameter_filter() const {
186 return parameter_filter_;
187 }
188
set_derive_image_from_url(bool derive_image_from_url)189 void set_derive_image_from_url(bool derive_image_from_url) {
190 derive_image_from_url_ = derive_image_from_url;
191 }
192
set_method(Method method)193 void set_method(Method method) { method_ = method; }
method()194 Method method() { return method_; }
195
set_suggestion_method(Method method)196 void set_suggestion_method(Method method) { suggestion_method_ = method; }
suggestion_method()197 Method suggestion_method() { return suggestion_method_; }
198
199 // Builds the image URL from the Template search URL if no image URL has been
200 // set.
DeriveImageFromURL()201 void DeriveImageFromURL() {
202 if (derive_image_from_url_ &&
203 url_->GetFaviconURL().is_empty() && url_->url()) {
204 GURL url(url_->url()->url()); // More url's please...
205 url_->SetFaviconURL(TemplateURL::GenerateFaviconURL(url));
206 }
207 }
208
209 private:
InitMapping()210 static void InitMapping() {
211 kElementNameToElementTypeMap = new std::map<std::string, ElementType>;
212 (*kElementNameToElementTypeMap)[kURLElement] = URL;
213 (*kElementNameToElementTypeMap)[kParamElement] = PARAM;
214 (*kElementNameToElementTypeMap)[kShortNameElement] = SHORT_NAME;
215 (*kElementNameToElementTypeMap)[kDescriptionElement] = DESCRIPTION;
216 (*kElementNameToElementTypeMap)[kImageElement] = IMAGE;
217 (*kElementNameToElementTypeMap)[kOpenSearchDescriptionElement] =
218 OPEN_SEARCH_DESCRIPTION;
219 (*kElementNameToElementTypeMap)[kFirefoxSearchDescriptionElement] =
220 OPEN_SEARCH_DESCRIPTION;
221 (*kElementNameToElementTypeMap)[kLanguageElement] =
222 LANGUAGE;
223 (*kElementNameToElementTypeMap)[kInputEncodingElement] =
224 INPUT_ENCODING;
225 }
226
227 // Key is UTF8 encoded.
228 static std::map<std::string, ElementType>* kElementNameToElementTypeMap;
229 // TemplateURL supplied to Read method. It's owned by the caller, so we
230 // don't need to free it.
231 TemplateURL* url_;
232 std::vector<ElementType> elements_;
233 scoped_ptr<TemplateURL::ImageRef> current_image_;
234
235 // Character content for the current element.
236 string16 string_;
237
238 TemplateURLParser::ParameterFilter* parameter_filter_;
239
240 // The list of parameters parsed in the Param nodes of a Url node.
241 std::vector<Param> extra_params_;
242
243 // The HTTP methods used.
244 Method method_;
245 Method suggestion_method_;
246
247 // If true, we are currently parsing a suggest URL, otherwise it is an HTML
248 // search. Note that we don't need a stack as Url nodes cannot be nested.
249 bool is_suggest_url_;
250
251 // Whether we should derive the image from the URL (when images are data
252 // URLs).
253 bool derive_image_from_url_;
254
255 DISALLOW_COPY_AND_ASSIGN(ParsingContext);
256 };
257
258 // static
259 std::map<std::string, ParsingContext::ElementType>*
260 ParsingContext::kElementNameToElementTypeMap = NULL;
261
XMLCharToUTF16(const xmlChar * value,int length)262 string16 XMLCharToUTF16(const xmlChar* value, int length) {
263 return UTF8ToUTF16(std::string((const char*)value, length));
264 }
265
// Copies the NUL-terminated libxml string |value| into a std::string without
// any re-encoding.
std::string XMLCharToString(const xmlChar* value) {
  const char* bytes = reinterpret_cast<const char*>(value);
  return std::string(bytes);
}
269
270 // Returns true if input_encoding contains a valid input encoding string. This
271 // doesn't verify that we have a valid encoding for the string, just that the
272 // string contains characters that constitute a valid input encoding.
IsValidEncodingString(const std::string & input_encoding)273 bool IsValidEncodingString(const std::string& input_encoding) {
274 if (input_encoding.empty())
275 return false;
276
277 if (!IsAsciiAlpha(input_encoding[0]))
278 return false;
279
280 for (size_t i = 1, max = input_encoding.size(); i < max; ++i) {
281 char c = input_encoding[i];
282 if (!IsAsciiAlpha(c) && !IsAsciiDigit(c) && c != '.' && c != '_' &&
283 c != '-') {
284 return false;
285 }
286 }
287 return true;
288 }
289
ParseURL(const xmlChar ** atts,ParsingContext * context)290 void ParseURL(const xmlChar** atts, ParsingContext* context) {
291 if (!atts)
292 return;
293
294 TemplateURL* turl = context->template_url();
295 const xmlChar** attributes = atts;
296 std::string template_url;
297 bool is_post = false;
298 bool is_html_url = false;
299 bool is_suggest_url = false;
300 int index_offset = 1;
301 int page_offset = 1;
302
303 while (*attributes) {
304 std::string name(XMLCharToString(*attributes));
305 const xmlChar* value = attributes[1];
306 if (name == kURLTypeAttribute) {
307 std::string type = XMLCharToString(value);
308 is_html_url = (type == kHTMLType);
309 is_suggest_url = (type == kSuggestionType);
310 } else if (name == kURLTemplateAttribute) {
311 template_url = XMLCharToString(value);
312 } else if (name == kURLIndexOffsetAttribute) {
313 base::StringToInt(XMLCharToString(value), &index_offset);
314 index_offset = std::max(1, index_offset);
315 } else if (name == kURLPageOffsetAttribute) {
316 base::StringToInt(XMLCharToString(value), &page_offset);
317 page_offset = std::max(1, page_offset);
318 } else if (name == kParamMethodAttribute) {
319 is_post = LowerCaseEqualsASCII(XMLCharToString(value), "post");
320 }
321 attributes += 2;
322 }
323 if (is_html_url) {
324 turl->SetURL(template_url, index_offset, page_offset);
325 context->set_is_suggestion(false);
326 if (is_post)
327 context->set_method(ParsingContext::POST);
328 } else if (is_suggest_url) {
329 turl->SetSuggestionsURL(template_url, index_offset, page_offset);
330 context->set_is_suggestion(true);
331 if (is_post)
332 context->set_suggestion_method(ParsingContext::POST);
333 }
334 }
335
ParseImage(const xmlChar ** atts,ParsingContext * context)336 void ParseImage(const xmlChar** atts, ParsingContext* context) {
337 if (!atts)
338 return;
339
340 const xmlChar** attributes = atts;
341 int width = 0;
342 int height = 0;
343 std::string type;
344 while (*attributes) {
345 std::string name(XMLCharToString(*attributes));
346 const xmlChar* value = attributes[1];
347 if (name == kImageTypeAttribute) {
348 type = XMLCharToString(value);
349 } else if (name == kImageWidthAttribute) {
350 base::StringToInt(XMLCharToString(value), &width);
351 } else if (name == kImageHeightAttribute) {
352 base::StringToInt(XMLCharToString(value), &height);
353 }
354 attributes += 2;
355 }
356 if (width > 0 && height > 0 && !type.empty()) {
357 // Valid Image URL.
358 context->AddImageRef(type, width, height);
359 }
360 }
361
ParseParam(const xmlChar ** atts,ParsingContext * context)362 void ParseParam(const xmlChar** atts, ParsingContext* context) {
363 if (!atts)
364 return;
365
366 const xmlChar** attributes = atts;
367 std::string key, value;
368 while (*attributes) {
369 std::string name(XMLCharToString(*attributes));
370 const xmlChar* val = attributes[1];
371 if (name == kParamNameAttribute) {
372 key = XMLCharToString(val);
373 } else if (name == kParamValueAttribute) {
374 value = XMLCharToString(val);
375 }
376 attributes += 2;
377 }
378 if (!key.empty())
379 context->AddExtraParams(key, value);
380 }
381
// Appends one parameter to |query|. A '&' separator is inserted when |query|
// already has content, and "key=" is emitted only for a non-empty |key|, so an
// empty key yields the bare |value|.
static void AppendParamToQuery(const std::string& key,
                               const std::string& value,
                               std::string* query) {
  std::string& out = *query;
  if (!out.empty())
    out += "&";
  if (!key.empty()) {
    out += key;
    out += "=";
  }
  out += value;
}
393
ProcessURLParams(ParsingContext * context)394 void ProcessURLParams(ParsingContext* context) {
395 TemplateURL* t_url = context->template_url();
396 const TemplateURLRef* t_url_ref =
397 context->is_suggestion() ? t_url->suggestions_url() :
398 t_url->url();
399 if (!t_url_ref)
400 return;
401
402 if (!context->parameter_filter() && context->extra_params().empty())
403 return;
404
405 GURL url(t_url_ref->url());
406 // If there is a parameter filter, parse the existing URL and remove any
407 // unwanted parameter.
408 TemplateURLParser::ParameterFilter* filter = context->parameter_filter();
409 std::string new_query;
410 bool modified = false;
411 if (filter) {
412 url_parse::Component query = url.parsed_for_possibly_invalid_spec().query;
413 url_parse::Component key, value;
414 const char* url_spec = url.spec().c_str();
415 while (url_parse::ExtractQueryKeyValue(url_spec, &query, &key, &value)) {
416 std::string key_str(url_spec, key.begin, key.len);
417 std::string value_str(url_spec, value.begin, value.len);
418 if (filter->KeepParameter(key_str, value_str)) {
419 AppendParamToQuery(key_str, value_str, &new_query);
420 } else {
421 modified = true;
422 }
423 }
424 }
425 if (!modified)
426 new_query = url.query();
427
428 // Add the extra parameters if any.
429 const std::vector<ParsingContext::Param>& params = context->extra_params();
430 if (!params.empty()) {
431 modified = true;
432 std::vector<ParsingContext::Param>::const_iterator iter;
433 for (iter = params.begin(); iter != params.end(); ++iter)
434 AppendParamToQuery(iter->first, iter->second, &new_query);
435 }
436
437 if (modified) {
438 GURL::Replacements repl;
439 repl.SetQueryStr(new_query);
440 url = url.ReplaceComponents(repl);
441 if (context->is_suggestion()) {
442 t_url->SetSuggestionsURL(url.spec(),
443 t_url_ref->index_offset(),
444 t_url_ref->page_offset());
445 } else {
446 t_url->SetURL(url.spec(),
447 t_url_ref->index_offset(),
448 t_url_ref->page_offset());
449 }
450 }
451 }
452
StartElementImpl(void * ctx,const xmlChar * name,const xmlChar ** atts)453 void StartElementImpl(void *ctx, const xmlChar *name, const xmlChar **atts) {
454 ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx);
455 std::string node_name((const char*)name);
456 PruneNamespace(&node_name);
457 context->PushElement(node_name);
458 switch (context->GetKnownType()) {
459 case ParsingContext::URL:
460 context->ResetExtraParams();
461 ParseURL(atts, context);
462 break;
463 case ParsingContext::IMAGE:
464 ParseImage(atts, context);
465 break;
466 case ParsingContext::PARAM:
467 ParseParam(atts, context);
468 break;
469 default:
470 break;
471 }
472 context->ResetString();
473 }
474
EndElementImpl(void * ctx,const xmlChar * name)475 void EndElementImpl(void *ctx, const xmlChar *name) {
476 ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx);
477 switch (context->GetKnownType()) {
478 case ParsingContext::SHORT_NAME:
479 context->template_url()->set_short_name(context->GetString());
480 break;
481 case ParsingContext::DESCRIPTION:
482 context->template_url()->set_description(context->GetString());
483 break;
484 case ParsingContext::IMAGE: {
485 GURL image_url(UTF16ToUTF8(context->GetString()));
486 if (image_url.SchemeIs(chrome::kDataScheme)) {
487 // TODO (jcampan): bug 1169256: when dealing with data URL, we need to
488 // decode the data URL in the renderer. For now, we'll just point to the
489 // favicon from the URL.
490 context->set_derive_image_from_url(true);
491 } else {
492 context->SetImageURL(image_url);
493 }
494 context->EndImage();
495 break;
496 }
497 case ParsingContext::LANGUAGE:
498 context->template_url()->add_language(context->GetString());
499 break;
500 case ParsingContext::INPUT_ENCODING: {
501 std::string input_encoding = UTF16ToASCII(context->GetString());
502 if (IsValidEncodingString(input_encoding))
503 context->template_url()->add_input_encoding(input_encoding);
504 break;
505 }
506 case ParsingContext::URL:
507 ProcessURLParams(context);
508 break;
509 default:
510 break;
511 }
512 context->ResetString();
513 context->PopElement();
514 }
515
CharactersImpl(void * ctx,const xmlChar * ch,int len)516 void CharactersImpl(void *ctx, const xmlChar *ch, int len) {
517 ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx);
518 context->AppendString(XMLCharToUTF16(ch, len));
519 }
520
521 // Returns true if the ref is null, or the url wrapped by ref is
522 // valid with a spec of http/https.
IsHTTPRef(const TemplateURLRef * ref)523 bool IsHTTPRef(const TemplateURLRef* ref) {
524 if (ref == NULL)
525 return true;
526 GURL url(ref->url());
527 return (url.is_valid() && (url.SchemeIs(chrome::kHttpScheme) ||
528 url.SchemeIs(chrome::kHttpsScheme)));
529 }
530
531 // Returns true if the TemplateURL is legal. A legal TemplateURL is one
532 // where all URLs have a spec of http/https.
IsLegal(TemplateURL * url)533 bool IsLegal(TemplateURL* url) {
534 if (!IsHTTPRef(url->url()) || !IsHTTPRef(url->suggestions_url()))
535 return false;
536 // Make sure all the image refs are legal.
537 const std::vector<TemplateURL::ImageRef>& image_refs = url->image_refs();
538 for (size_t i = 0; i < image_refs.size(); i++) {
539 GURL image_url(image_refs[i].url);
540 if (!image_url.is_valid() ||
541 !(image_url.SchemeIs(chrome::kHttpScheme) ||
542 image_url.SchemeIs(chrome::kHttpsScheme))) {
543 return false;
544 }
545 }
546 return true;
547 }
548
549 } // namespace
550
551 // static
Parse(const unsigned char * data,size_t length,TemplateURLParser::ParameterFilter * param_filter,TemplateURL * url)552 bool TemplateURLParser::Parse(const unsigned char* data, size_t length,
553 TemplateURLParser::ParameterFilter* param_filter,
554 TemplateURL* url) {
555 DCHECK(url);
556 // xmlSubstituteEntitiesDefault(1) makes it so that & isn't mapped to
557 // & . Unfortunately xmlSubstituteEntitiesDefault effects global state.
558 // If this becomes problematic we'll need to provide our own entity
559 // type for &, or strip out " by hand after parsing.
560 int last_sub_entities_value = xmlSubstituteEntitiesDefault(1);
561 ParsingContext context(param_filter, url);
562 xmlSAXHandler sax_handler;
563 memset(&sax_handler, 0, sizeof(sax_handler));
564 sax_handler.startElement = &StartElementImpl;
565 sax_handler.endElement = &EndElementImpl;
566 sax_handler.characters = &CharactersImpl;
567 xmlSAXUserParseMemory(&sax_handler, &context,
568 reinterpret_cast<const char*>(data),
569 static_cast<int>(length));
570 xmlSubstituteEntitiesDefault(last_sub_entities_value);
571 // If the image was a data URL, use the favicon from the search URL instead.
572 // (see TODO inEndElementImpl()).
573 context.DeriveImageFromURL();
574
575 // TODO(jcampan): http://b/issue?id=1196285 we do not support search engines
576 // that use POST yet.
577 if (context.method() == ParsingContext::POST)
578 return false;
579 if (context.suggestion_method() == ParsingContext::POST)
580 url->SetSuggestionsURL("", 0, 0);
581
582 if (!url->short_name().empty() && !url->description().empty()) {
583 // So far so good, make sure the urls are http.
584 return IsLegal(url);
585 }
586 return false;
587 }
588