• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "chrome/browser/search_engines/template_url_parser.h"
6 
7 #include <algorithm>
8 #include <map>
9 #include <vector>
10 
11 #include "base/logging.h"
12 #include "base/memory/scoped_ptr.h"
13 #include "base/string_number_conversions.h"
14 #include "base/string_util.h"
15 #include "base/utf_string_conversions.h"
16 #include "chrome/browser/search_engines/template_url.h"
17 #include "chrome/common/url_constants.h"
18 #include "googleurl/src/gurl.h"
19 #include "libxml/parser.h"
20 #include "libxml/xmlwriter.h"
21 
22 namespace {
23 
// NOTE: libxml hands us UTF-8. Values 0-127 of UTF-8 coincide with plain
// char, so every name below is an ordinary char string. That way nothing has
// to be converted to wide before it can be compared.

// Element names of the OSD document.
const char kURLElement[] = "Url";
const char kParamElement[] = "Param";
const char kShortNameElement[] = "ShortName";
const char kDescriptionElement[] = "Description";
const char kImageElement[] = "Image";
const char kOpenSearchDescriptionElement[] = "OpenSearchDescription";
const char kFirefoxSearchDescriptionElement[] = "SearchPlugin";
const char kLanguageElement[] = "Language";
const char kInputEncodingElement[] = "InputEncoding";

// XML attribute names.
const char kURLTypeAttribute[] = "type";
const char kURLTemplateAttribute[] = "template";
const char kImageTypeAttribute[] = "type";
const char kImageWidthAttribute[] = "width";
const char kImageHeightAttribute[] = "height";
const char kURLIndexOffsetAttribute[] = "indexOffset";
const char kURLPageOffsetAttribute[] = "pageOffset";
const char kParamNameAttribute[] = "name";
const char kParamValueAttribute[] = "value";
const char kParamMethodAttribute[] = "method";

// Mime type of a Url node that yields search results.
const char kHTMLType[] = "text/html";

// Mime type of a Url node that yields as-you-type suggestions.
const char kSuggestionType[] = "application/x-suggestions+json";

// Namespace identifier.
const char kOSDNS[] = "xmlns";

// The namespace for documents we understand.
const char kNameSpace[] = "http://a9.com/-/spec/opensearch/1.1/";
63 
// Strips the namespace prefix from |name| in place: everything up to and
// including the first ':' is erased (ex: os:Url -> Url). A name without a
// ':' is left untouched.
static void PruneNamespace(std::string* name) {
  const size_t colon = name->find(':');
  if (colon == std::string::npos)
    return;
  name->erase(0, colon + 1);
}
70 
71 //
72 // To minimize memory overhead while parsing, a SAX style parser is used.
73 // ParsingContext is used to maintain the state we're in the document
74 // while parsing.
75 class ParsingContext {
76  public:
77   // Enum of the known element types.
78   enum ElementType {
79     UNKNOWN,
80     OPEN_SEARCH_DESCRIPTION,
81     URL,
82     PARAM,
83     SHORT_NAME,
84     DESCRIPTION,
85     IMAGE,
86     LANGUAGE,
87     INPUT_ENCODING,
88   };
89 
90   enum Method {
91     GET,
92     POST
93   };
94 
95   // Key/value of a Param node.
96   typedef std::pair<std::string, std::string> Param;
97 
ParsingContext(TemplateURLParser::ParameterFilter * parameter_filter,TemplateURL * url)98   ParsingContext(TemplateURLParser::ParameterFilter* parameter_filter,
99                  TemplateURL* url)
100       : url_(url),
101         parameter_filter_(parameter_filter),
102         method_(GET),
103         suggestion_method_(GET),
104         is_suggest_url_(false),
105         derive_image_from_url_(false) {
106     if (kElementNameToElementTypeMap == NULL)
107       InitMapping();
108   }
109 
110   // Invoked when an element starts.
PushElement(const std::string & element)111   void PushElement(const std::string& element) {
112     ElementType type;
113     if (kElementNameToElementTypeMap->find(element) ==
114         kElementNameToElementTypeMap->end()) {
115       type = UNKNOWN;
116     } else {
117       type = (*kElementNameToElementTypeMap)[element];
118     }
119     elements_.push_back(type);
120   }
121 
PopElement()122   void PopElement() {
123     elements_.pop_back();
124   }
125 
126   // Returns the current ElementType.
GetKnownType()127   ElementType GetKnownType() {
128     if (elements_.size() == 2 && elements_[0] == OPEN_SEARCH_DESCRIPTION)
129       return elements_[1];
130 
131     // We only expect PARAM nodes under the Url node
132     if (elements_.size() == 3 && elements_[0] == OPEN_SEARCH_DESCRIPTION &&
133         elements_[1] == URL && elements_[2] == PARAM)
134       return PARAM;
135 
136     return UNKNOWN;
137   }
138 
template_url()139   TemplateURL* template_url() { return url_; }
140 
AddImageRef(const std::string & type,int width,int height)141   void AddImageRef(const std::string& type, int width, int height) {
142     if (width > 0 && height > 0)
143       current_image_.reset(new TemplateURL::ImageRef(type, width, height));
144   }
145 
EndImage()146   void EndImage() {
147     current_image_.reset();
148   }
149 
SetImageURL(const GURL & url)150   void SetImageURL(const GURL& url) {
151     if (current_image_.get()) {
152       current_image_->url = url;
153       url_->add_image_ref(*current_image_);
154       current_image_.reset();
155     }
156   }
157 
ResetString()158   void ResetString() {
159     string_.clear();
160   }
161 
AppendString(const string16 & string)162   void AppendString(const string16& string) {
163     string_ += string;
164   }
165 
GetString()166   const string16& GetString() {
167     return string_;
168   }
169 
ResetExtraParams()170   void ResetExtraParams() {
171     extra_params_.clear();
172   }
173 
AddExtraParams(const std::string & key,const std::string & value)174   void AddExtraParams(const std::string& key, const std::string& value) {
175     if (parameter_filter_ && !parameter_filter_->KeepParameter(key, value))
176       return;
177     extra_params_.push_back(Param(key, value));
178   }
179 
extra_params() const180   const std::vector<Param>& extra_params() const { return extra_params_; }
181 
set_is_suggestion(bool value)182   void set_is_suggestion(bool value) { is_suggest_url_ = value; }
is_suggestion() const183   bool is_suggestion() const { return is_suggest_url_; }
184 
parameter_filter() const185   TemplateURLParser::ParameterFilter* parameter_filter() const {
186     return parameter_filter_;
187   }
188 
set_derive_image_from_url(bool derive_image_from_url)189   void set_derive_image_from_url(bool derive_image_from_url) {
190     derive_image_from_url_ = derive_image_from_url;
191   }
192 
set_method(Method method)193   void set_method(Method method) { method_ = method; }
method()194   Method method() { return method_; }
195 
set_suggestion_method(Method method)196   void set_suggestion_method(Method method) { suggestion_method_ = method; }
suggestion_method()197   Method suggestion_method() { return suggestion_method_; }
198 
199   // Builds the image URL from the Template search URL if no image URL has been
200   // set.
DeriveImageFromURL()201   void DeriveImageFromURL() {
202     if (derive_image_from_url_ &&
203         url_->GetFaviconURL().is_empty() && url_->url()) {
204       GURL url(url_->url()->url());  // More url's please...
205       url_->SetFaviconURL(TemplateURL::GenerateFaviconURL(url));
206     }
207   }
208 
209  private:
InitMapping()210   static void InitMapping() {
211     kElementNameToElementTypeMap = new std::map<std::string, ElementType>;
212     (*kElementNameToElementTypeMap)[kURLElement] = URL;
213     (*kElementNameToElementTypeMap)[kParamElement] = PARAM;
214     (*kElementNameToElementTypeMap)[kShortNameElement] = SHORT_NAME;
215     (*kElementNameToElementTypeMap)[kDescriptionElement] = DESCRIPTION;
216     (*kElementNameToElementTypeMap)[kImageElement] = IMAGE;
217     (*kElementNameToElementTypeMap)[kOpenSearchDescriptionElement] =
218         OPEN_SEARCH_DESCRIPTION;
219     (*kElementNameToElementTypeMap)[kFirefoxSearchDescriptionElement] =
220         OPEN_SEARCH_DESCRIPTION;
221     (*kElementNameToElementTypeMap)[kLanguageElement] =
222         LANGUAGE;
223     (*kElementNameToElementTypeMap)[kInputEncodingElement] =
224         INPUT_ENCODING;
225   }
226 
227   // Key is UTF8 encoded.
228   static std::map<std::string, ElementType>* kElementNameToElementTypeMap;
229   // TemplateURL supplied to Read method. It's owned by the caller, so we
230   // don't need to free it.
231   TemplateURL* url_;
232   std::vector<ElementType> elements_;
233   scoped_ptr<TemplateURL::ImageRef> current_image_;
234 
235   // Character content for the current element.
236   string16 string_;
237 
238   TemplateURLParser::ParameterFilter* parameter_filter_;
239 
240   // The list of parameters parsed in the Param nodes of a Url node.
241   std::vector<Param> extra_params_;
242 
243   // The HTTP methods used.
244   Method method_;
245   Method suggestion_method_;
246 
247   // If true, we are currently parsing a suggest URL, otherwise it is an HTML
248   // search.  Note that we don't need a stack as Url nodes cannot be nested.
249   bool is_suggest_url_;
250 
251   // Whether we should derive the image from the URL (when images are data
252   // URLs).
253   bool derive_image_from_url_;
254 
255   DISALLOW_COPY_AND_ASSIGN(ParsingContext);
256 };
257 
258 // static
259 std::map<std::string, ParsingContext::ElementType>*
260     ParsingContext::kElementNameToElementTypeMap = NULL;
261 
XMLCharToUTF16(const xmlChar * value,int length)262 string16 XMLCharToUTF16(const xmlChar* value, int length) {
263   return UTF8ToUTF16(std::string((const char*)value, length));
264 }
265 
// Copies the NUL-terminated |value| into a std::string without any
// conversion.
std::string XMLCharToString(const xmlChar* value) {
  const char* bytes = reinterpret_cast<const char*>(value);
  return std::string(bytes);
}
269 
270 // Returns true if input_encoding contains a valid input encoding string. This
271 // doesn't verify that we have a valid encoding for the string, just that the
272 // string contains characters that constitute a valid input encoding.
IsValidEncodingString(const std::string & input_encoding)273 bool IsValidEncodingString(const std::string& input_encoding) {
274   if (input_encoding.empty())
275     return false;
276 
277   if (!IsAsciiAlpha(input_encoding[0]))
278     return false;
279 
280   for (size_t i = 1, max = input_encoding.size(); i < max; ++i) {
281     char c = input_encoding[i];
282     if (!IsAsciiAlpha(c) && !IsAsciiDigit(c) && c != '.' && c != '_' &&
283         c != '-') {
284       return false;
285     }
286   }
287   return true;
288 }
289 
ParseURL(const xmlChar ** atts,ParsingContext * context)290 void ParseURL(const xmlChar** atts, ParsingContext* context) {
291   if (!atts)
292     return;
293 
294   TemplateURL* turl = context->template_url();
295   const xmlChar** attributes = atts;
296   std::string template_url;
297   bool is_post = false;
298   bool is_html_url = false;
299   bool is_suggest_url = false;
300   int index_offset = 1;
301   int page_offset = 1;
302 
303   while (*attributes) {
304     std::string name(XMLCharToString(*attributes));
305     const xmlChar* value = attributes[1];
306     if (name == kURLTypeAttribute) {
307       std::string type = XMLCharToString(value);
308       is_html_url = (type == kHTMLType);
309       is_suggest_url = (type == kSuggestionType);
310     } else if (name == kURLTemplateAttribute) {
311       template_url = XMLCharToString(value);
312     } else if (name == kURLIndexOffsetAttribute) {
313       base::StringToInt(XMLCharToString(value), &index_offset);
314       index_offset = std::max(1, index_offset);
315     } else if (name == kURLPageOffsetAttribute) {
316       base::StringToInt(XMLCharToString(value), &page_offset);
317       page_offset = std::max(1, page_offset);
318     } else if (name == kParamMethodAttribute) {
319       is_post = LowerCaseEqualsASCII(XMLCharToString(value), "post");
320     }
321     attributes += 2;
322   }
323   if (is_html_url) {
324     turl->SetURL(template_url, index_offset, page_offset);
325     context->set_is_suggestion(false);
326     if (is_post)
327       context->set_method(ParsingContext::POST);
328   } else if (is_suggest_url) {
329     turl->SetSuggestionsURL(template_url, index_offset, page_offset);
330     context->set_is_suggestion(true);
331     if (is_post)
332       context->set_suggestion_method(ParsingContext::POST);
333   }
334 }
335 
ParseImage(const xmlChar ** atts,ParsingContext * context)336 void ParseImage(const xmlChar** atts, ParsingContext* context) {
337   if (!atts)
338     return;
339 
340   const xmlChar** attributes = atts;
341   int width = 0;
342   int height = 0;
343   std::string type;
344   while (*attributes) {
345     std::string name(XMLCharToString(*attributes));
346     const xmlChar* value = attributes[1];
347     if (name == kImageTypeAttribute) {
348       type = XMLCharToString(value);
349     } else if (name == kImageWidthAttribute) {
350       base::StringToInt(XMLCharToString(value), &width);
351     } else if (name == kImageHeightAttribute) {
352       base::StringToInt(XMLCharToString(value), &height);
353     }
354     attributes += 2;
355   }
356   if (width > 0 && height > 0 && !type.empty()) {
357     // Valid Image URL.
358     context->AddImageRef(type, width, height);
359   }
360 }
361 
ParseParam(const xmlChar ** atts,ParsingContext * context)362 void ParseParam(const xmlChar** atts, ParsingContext* context) {
363   if (!atts)
364     return;
365 
366   const xmlChar** attributes = atts;
367   std::string key, value;
368   while (*attributes) {
369     std::string name(XMLCharToString(*attributes));
370     const xmlChar* val = attributes[1];
371     if (name == kParamNameAttribute) {
372       key = XMLCharToString(val);
373     } else if (name == kParamValueAttribute) {
374       value = XMLCharToString(val);
375     }
376     attributes += 2;
377   }
378   if (!key.empty())
379     context->AddExtraParams(key, value);
380 }
381 
// Appends "key=value" to |query|, preceded by '&' when |query| already has
// content. With an empty |key| only |value| is appended. Nothing is escaped.
static void AppendParamToQuery(const std::string& key,
                               const std::string& value,
                               std::string* query) {
  std::string& out = *query;
  if (!out.empty())
    out += '&';
  if (!key.empty()) {
    out += key;
    out += '=';
  }
  out += value;
}
393 
ProcessURLParams(ParsingContext * context)394 void ProcessURLParams(ParsingContext* context) {
395   TemplateURL* t_url = context->template_url();
396   const TemplateURLRef* t_url_ref =
397       context->is_suggestion() ? t_url->suggestions_url() :
398                                  t_url->url();
399   if (!t_url_ref)
400     return;
401 
402   if (!context->parameter_filter() && context->extra_params().empty())
403     return;
404 
405   GURL url(t_url_ref->url());
406   // If there is a parameter filter, parse the existing URL and remove any
407   // unwanted parameter.
408   TemplateURLParser::ParameterFilter* filter = context->parameter_filter();
409   std::string new_query;
410   bool modified = false;
411   if (filter) {
412     url_parse::Component query = url.parsed_for_possibly_invalid_spec().query;
413     url_parse::Component key, value;
414     const char* url_spec = url.spec().c_str();
415     while (url_parse::ExtractQueryKeyValue(url_spec, &query, &key, &value)) {
416       std::string key_str(url_spec, key.begin, key.len);
417       std::string value_str(url_spec, value.begin, value.len);
418       if (filter->KeepParameter(key_str, value_str)) {
419         AppendParamToQuery(key_str, value_str, &new_query);
420       } else {
421         modified = true;
422       }
423     }
424   }
425   if (!modified)
426     new_query = url.query();
427 
428   // Add the extra parameters if any.
429   const std::vector<ParsingContext::Param>& params = context->extra_params();
430   if (!params.empty()) {
431     modified = true;
432     std::vector<ParsingContext::Param>::const_iterator iter;
433     for (iter = params.begin(); iter != params.end(); ++iter)
434       AppendParamToQuery(iter->first, iter->second, &new_query);
435   }
436 
437   if (modified) {
438     GURL::Replacements repl;
439     repl.SetQueryStr(new_query);
440     url = url.ReplaceComponents(repl);
441     if (context->is_suggestion()) {
442       t_url->SetSuggestionsURL(url.spec(),
443                                t_url_ref->index_offset(),
444                                t_url_ref->page_offset());
445     } else {
446       t_url->SetURL(url.spec(),
447                     t_url_ref->index_offset(),
448                     t_url_ref->page_offset());
449     }
450   }
451 }
452 
StartElementImpl(void * ctx,const xmlChar * name,const xmlChar ** atts)453 void StartElementImpl(void *ctx, const xmlChar *name, const xmlChar **atts) {
454   ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx);
455   std::string node_name((const char*)name);
456   PruneNamespace(&node_name);
457   context->PushElement(node_name);
458   switch (context->GetKnownType()) {
459     case ParsingContext::URL:
460       context->ResetExtraParams();
461       ParseURL(atts, context);
462       break;
463     case ParsingContext::IMAGE:
464       ParseImage(atts, context);
465       break;
466     case ParsingContext::PARAM:
467       ParseParam(atts, context);
468       break;
469     default:
470       break;
471   }
472   context->ResetString();
473 }
474 
EndElementImpl(void * ctx,const xmlChar * name)475 void EndElementImpl(void *ctx, const xmlChar *name) {
476   ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx);
477   switch (context->GetKnownType()) {
478     case ParsingContext::SHORT_NAME:
479       context->template_url()->set_short_name(context->GetString());
480       break;
481     case ParsingContext::DESCRIPTION:
482       context->template_url()->set_description(context->GetString());
483       break;
484     case ParsingContext::IMAGE: {
485       GURL image_url(UTF16ToUTF8(context->GetString()));
486       if (image_url.SchemeIs(chrome::kDataScheme)) {
487         // TODO (jcampan): bug 1169256: when dealing with data URL, we need to
488         // decode the data URL in the renderer. For now, we'll just point to the
489         // favicon from the URL.
490         context->set_derive_image_from_url(true);
491       } else {
492         context->SetImageURL(image_url);
493       }
494       context->EndImage();
495       break;
496     }
497     case ParsingContext::LANGUAGE:
498       context->template_url()->add_language(context->GetString());
499       break;
500     case ParsingContext::INPUT_ENCODING: {
501       std::string input_encoding = UTF16ToASCII(context->GetString());
502       if (IsValidEncodingString(input_encoding))
503         context->template_url()->add_input_encoding(input_encoding);
504       break;
505     }
506     case ParsingContext::URL:
507       ProcessURLParams(context);
508       break;
509     default:
510       break;
511   }
512   context->ResetString();
513   context->PopElement();
514 }
515 
CharactersImpl(void * ctx,const xmlChar * ch,int len)516 void CharactersImpl(void *ctx, const xmlChar *ch, int len) {
517   ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx);
518   context->AppendString(XMLCharToUTF16(ch, len));
519 }
520 
521 // Returns true if the ref is null, or the url wrapped by ref is
522 // valid with a spec of http/https.
IsHTTPRef(const TemplateURLRef * ref)523 bool IsHTTPRef(const TemplateURLRef* ref) {
524   if (ref == NULL)
525     return true;
526   GURL url(ref->url());
527   return (url.is_valid() && (url.SchemeIs(chrome::kHttpScheme) ||
528                              url.SchemeIs(chrome::kHttpsScheme)));
529 }
530 
531 // Returns true if the TemplateURL is legal. A legal TemplateURL is one
532 // where all URLs have a spec of http/https.
IsLegal(TemplateURL * url)533 bool IsLegal(TemplateURL* url) {
534   if (!IsHTTPRef(url->url()) || !IsHTTPRef(url->suggestions_url()))
535     return false;
536   // Make sure all the image refs are legal.
537   const std::vector<TemplateURL::ImageRef>& image_refs = url->image_refs();
538   for (size_t i = 0; i < image_refs.size(); i++) {
539     GURL image_url(image_refs[i].url);
540     if (!image_url.is_valid() ||
541         !(image_url.SchemeIs(chrome::kHttpScheme) ||
542           image_url.SchemeIs(chrome::kHttpsScheme))) {
543       return false;
544     }
545   }
546   return true;
547 }
548 
549 }  // namespace
550 
551 // static
Parse(const unsigned char * data,size_t length,TemplateURLParser::ParameterFilter * param_filter,TemplateURL * url)552 bool TemplateURLParser::Parse(const unsigned char* data, size_t length,
553                               TemplateURLParser::ParameterFilter* param_filter,
554                               TemplateURL* url) {
555   DCHECK(url);
556   // xmlSubstituteEntitiesDefault(1) makes it so that &amp; isn't mapped to
557   // &#38; . Unfortunately xmlSubstituteEntitiesDefault effects global state.
558   // If this becomes problematic we'll need to provide our own entity
559   // type for &amp;, or strip out &#34; by hand after parsing.
560   int last_sub_entities_value = xmlSubstituteEntitiesDefault(1);
561   ParsingContext context(param_filter, url);
562   xmlSAXHandler sax_handler;
563   memset(&sax_handler, 0, sizeof(sax_handler));
564   sax_handler.startElement = &StartElementImpl;
565   sax_handler.endElement = &EndElementImpl;
566   sax_handler.characters = &CharactersImpl;
567   xmlSAXUserParseMemory(&sax_handler, &context,
568                         reinterpret_cast<const char*>(data),
569                         static_cast<int>(length));
570   xmlSubstituteEntitiesDefault(last_sub_entities_value);
571   // If the image was a data URL, use the favicon from the search URL instead.
572   // (see TODO inEndElementImpl()).
573   context.DeriveImageFromURL();
574 
575   // TODO(jcampan): http://b/issue?id=1196285 we do not support search engines
576   //                that use POST yet.
577   if (context.method() == ParsingContext::POST)
578     return false;
579   if (context.suggestion_method() == ParsingContext::POST)
580     url->SetSuggestionsURL("", 0, 0);
581 
582   if (!url->short_name().empty() && !url->description().empty()) {
583     // So far so good, make sure the urls are http.
584     return IsLegal(url);
585   }
586   return false;
587 }
588