• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "chrome/browser/search_engines/template_url_parser.h"
6 
7 #include <algorithm>
8 #include <map>
9 #include <vector>
10 
11 #include "base/logging.h"
12 #include "base/memory/scoped_ptr.h"
13 #include "base/strings/string_number_conversions.h"
14 #include "base/strings/string_util.h"
15 #include "base/strings/utf_string_conversions.h"
16 #include "chrome/browser/search_engines/template_url.h"
17 #include "chrome/browser/search_engines/ui_thread_search_terms_data.h"
18 #include "libxml/parser.h"
19 #include "libxml/xmlwriter.h"
20 #include "ui/gfx/favicon_size.h"
21 #include "url/gurl.h"
22 #include "url/url_constants.h"
23 
24 namespace {
25 
// NOTE: libxml uses the UTF-8 encoding. As 0-127 of UTF-8 corresponds
// to that of char, the following names are all in terms of char. This avoids
// having to convert to wide, then do comparisons.

// Defines for element names of the OSD document:
const char kURLElement[] = "Url";
const char kParamElement[] = "Param";
const char kShortNameElement[] = "ShortName";
const char kImageElement[] = "Image";
const char kOpenSearchDescriptionElement[] = "OpenSearchDescription";
// Alternate root element name; InitMapping() maps it to the same element type
// as kOpenSearchDescriptionElement.
const char kFirefoxSearchDescriptionElement[] = "SearchPlugin";
const char kInputEncodingElement[] = "InputEncoding";

// Various XML attributes used.
// "type" and "template" are read from Url elements.
const char kURLTypeAttribute[] = "type";
const char kURLTemplateAttribute[] = "template";
// "type", "width" and "height" are read from Image elements.
const char kImageTypeAttribute[] = "type";
const char kImageWidthAttribute[] = "width";
const char kImageHeightAttribute[] = "height";
// "name" and "value" are read from Param elements.
const char kParamNameAttribute[] = "name";
const char kParamValueAttribute[] = "value";
// Despite the "Param" in its name, ParseURL() reads this attribute from the
// Url element.
const char kParamMethodAttribute[] = "method";

// Mime type for search results. Compared against a Url element's "type".
const char kHTMLType[] = "text/html";

// Mime type for as you type suggestions. Compared against a Url element's
// "type".
const char kSuggestionType[] = "application/x-suggestions+json";
54 
// Converts a NUL-terminated libxml string into a std::string by
// reinterpreting its bytes as chars (no transcoding is performed).
// A NULL |value| yields an empty string; constructing a std::string directly
// from a NULL pointer would be undefined behavior.
std::string XMLCharToString(const xmlChar* value) {
  if (!value)
    return std::string();
  return std::string(reinterpret_cast<const char*>(value));
}
58 
59 // Returns true if input_encoding contains a valid input encoding string. This
60 // doesn't verify that we have a valid encoding for the string, just that the
61 // string contains characters that constitute a valid input encoding.
IsValidEncodingString(const std::string & input_encoding)62 bool IsValidEncodingString(const std::string& input_encoding) {
63   if (input_encoding.empty())
64     return false;
65 
66   if (!IsAsciiAlpha(input_encoding[0]))
67     return false;
68 
69   for (size_t i = 1, max = input_encoding.size(); i < max; ++i) {
70     char c = input_encoding[i];
71     if (!IsAsciiAlpha(c) && !IsAsciiDigit(c) && c != '.' && c != '_' &&
72         c != '-') {
73       return false;
74     }
75   }
76   return true;
77 }
78 
// Appends one parameter to |query|. A '&' separator is written first when
// |query| already has content. The parameter is written as "key=value", or as
// just "value" when |key| is empty. No escaping is applied.
void AppendParamToQuery(const std::string& key,
                        const std::string& value,
                        std::string* query) {
  std::string& out = *query;
  if (!out.empty())
    out += '&';
  if (!key.empty()) {
    out += key;
    out += '=';
  }
  out += value;
}
90 
91 // Returns true if |url| is empty or is a valid URL with a scheme of HTTP[S].
IsHTTPRef(const std::string & url)92 bool IsHTTPRef(const std::string& url) {
93   if (url.empty())
94     return true;
95   GURL gurl(url);
96   return gurl.is_valid() && (gurl.SchemeIs(url::kHttpScheme) ||
97                              gurl.SchemeIs(url::kHttpsScheme));
98 }
99 
100 }  // namespace
101 
102 
103 // TemplateURLParsingContext --------------------------------------------------
104 
// To minimize memory overhead while parsing, a SAX style parser is used.
// TemplateURLParsingContext keeps track of where we are in the document while
// parsing, and accumulates the data that GetTemplateURL() turns into a
// TemplateURL.
class TemplateURLParsingContext {
 public:
  // Enum of the known element types.
  enum ElementType {
    UNKNOWN,
    OPEN_SEARCH_DESCRIPTION,
    URL,
    PARAM,
    SHORT_NAME,
    IMAGE,
    INPUT_ENCODING,
  };

  // HTTP method declared by a Url element's "method" attribute.
  enum Method {
    GET,
    POST
  };

  // Key/value of a Param node.
  typedef std::pair<std::string, std::string> Param;

  // |parameter_filter| may be NULL, in which case no parameter is filtered
  // out. The pointer is stored as-is; it is not copied.
  explicit TemplateURLParsingContext(
      TemplateURLParser::ParameterFilter* parameter_filter);

  // SAX callbacks. |ctx| is cast back to the TemplateURLParsingContext that
  // was handed to the parser as user data.
  // Called when an element opens; |atts| holds alternating attribute names
  // and values and may be NULL.
  static void StartElementImpl(void* ctx,
                               const xmlChar* name,
                               const xmlChar** atts);
  // Called when an element closes; commits the text gathered for it.
  static void EndElementImpl(void* ctx, const xmlChar* name);
  // Called with a run of |len| bytes of character data, which is appended to
  // the text of the current element.
  static void CharactersImpl(void* ctx, const xmlChar* ch, int len);

  // Returns a heap-allocated TemplateURL representing the result of parsing.
  // This will be NULL if parsing failed or if the results were invalid for some
  // reason (e.g. the resulting URL was not HTTP[S], a name wasn't supplied,
  // a resulting TemplateURLRef was invalid, etc.).
  TemplateURL* GetTemplateURL(Profile* profile, bool show_in_default_list);

 private:
  // Key is UTF8 encoded.
  typedef std::map<std::string, ElementType> ElementNameToElementTypeMap;

  // Allocates and fills kElementNameToElementTypeMap.
  static void InitMapping();

  // Attribute handlers for the Url, Image and Param elements. Each is a no-op
  // when |atts| is NULL.
  void ParseURL(const xmlChar** atts);
  void ParseImage(const xmlChar** atts);
  void ParseParam(const xmlChar** atts);
  // Applies |parameter_filter_| and |extra_params_| to the query of the URL
  // selected by |is_suggest_url_|. Called when a Url element closes.
  void ProcessURLParams();

  // Returns the current ElementType.
  ElementType GetKnownType();

  // Shared lookup table from element name to ElementType; NULL until the
  // first context is constructed.
  static ElementNameToElementTypeMap* kElementNameToElementTypeMap;

  // Data that gets updated as we parse, and is converted to a TemplateURL by
  // GetTemplateURL().
  TemplateURLData data_;

  // Types of the elements that are currently open, outermost first. Pushed by
  // StartElementImpl() and popped by EndElementImpl().
  std::vector<ElementType> elements_;
  // Set by ParseImage() from the Image attributes; consumed and reset when the
  // Image element closes.
  bool image_is_valid_for_favicon_;

  // Character content for the current element.
  base::string16 string_;

  // Optional filter for URL parameters; may be NULL.
  TemplateURLParser::ParameterFilter* parameter_filter_;

  // The list of parameters parsed in the Param nodes of a Url node.
  std::vector<Param> extra_params_;

  // The HTTP methods used.
  Method method_;
  Method suggestion_method_;

  // If true, we are currently parsing a suggest URL, otherwise it is an HTML
  // search.  Note that we don't need a stack as URL nodes cannot be nested.
  bool is_suggest_url_;

  // Whether we should derive the image from the URL (when images are data
  // URLs).
  bool derive_image_from_url_;

  DISALLOW_COPY_AND_ASSIGN(TemplateURLParsingContext);
};
189 
// static
// Lookup table from element name to ElementType. Starts out NULL; the
// constructor calls InitMapping() to allocate it when it is still NULL.
TemplateURLParsingContext::ElementNameToElementTypeMap*
    TemplateURLParsingContext::kElementNameToElementTypeMap = NULL;
193 
TemplateURLParsingContext(TemplateURLParser::ParameterFilter * parameter_filter)194 TemplateURLParsingContext::TemplateURLParsingContext(
195     TemplateURLParser::ParameterFilter* parameter_filter)
196     : image_is_valid_for_favicon_(false),
197       parameter_filter_(parameter_filter),
198       method_(GET),
199       suggestion_method_(GET),
200       is_suggest_url_(false),
201       derive_image_from_url_(false) {
202   if (kElementNameToElementTypeMap == NULL)
203     InitMapping();
204 }
205 
206 // static
StartElementImpl(void * ctx,const xmlChar * name,const xmlChar ** atts)207 void TemplateURLParsingContext::StartElementImpl(void* ctx,
208                                                  const xmlChar* name,
209                                                  const xmlChar** atts) {
210   // Remove the namespace from |name|, ex: os:Url -> Url.
211   std::string node_name(XMLCharToString(name));
212   size_t index = node_name.find_first_of(":");
213   if (index != std::string::npos)
214     node_name.erase(0, index + 1);
215 
216   TemplateURLParsingContext* context =
217       reinterpret_cast<TemplateURLParsingContext*>(ctx);
218   context->elements_.push_back(
219     context->kElementNameToElementTypeMap->count(node_name) ?
220         (*context->kElementNameToElementTypeMap)[node_name] : UNKNOWN);
221   switch (context->GetKnownType()) {
222     case TemplateURLParsingContext::URL:
223       context->extra_params_.clear();
224       context->ParseURL(atts);
225       break;
226     case TemplateURLParsingContext::IMAGE:
227       context->ParseImage(atts);
228       break;
229     case TemplateURLParsingContext::PARAM:
230       context->ParseParam(atts);
231       break;
232     default:
233       break;
234   }
235   context->string_.clear();
236 }
237 
238 // static
EndElementImpl(void * ctx,const xmlChar * name)239 void TemplateURLParsingContext::EndElementImpl(void* ctx, const xmlChar* name) {
240   TemplateURLParsingContext* context =
241       reinterpret_cast<TemplateURLParsingContext*>(ctx);
242   switch (context->GetKnownType()) {
243     case TemplateURLParsingContext::SHORT_NAME:
244       context->data_.short_name = context->string_;
245       break;
246     case TemplateURLParsingContext::IMAGE: {
247       GURL image_url(base::UTF16ToUTF8(context->string_));
248       if (image_url.SchemeIs(url::kDataScheme)) {
249         // TODO (jcampan): bug 1169256: when dealing with data URL, we need to
250         // decode the data URL in the renderer. For now, we'll just point to the
251         // favicon from the URL.
252         context->derive_image_from_url_ = true;
253       } else if (context->image_is_valid_for_favicon_ && image_url.is_valid() &&
254                  (image_url.SchemeIs(url::kHttpScheme) ||
255                   image_url.SchemeIs(url::kHttpsScheme))) {
256         context->data_.favicon_url = image_url;
257       }
258       context->image_is_valid_for_favicon_ = false;
259       break;
260     }
261     case TemplateURLParsingContext::INPUT_ENCODING: {
262       std::string input_encoding = base::UTF16ToASCII(context->string_);
263       if (IsValidEncodingString(input_encoding))
264         context->data_.input_encodings.push_back(input_encoding);
265       break;
266     }
267     case TemplateURLParsingContext::URL:
268       context->ProcessURLParams();
269       break;
270     default:
271       break;
272   }
273   context->string_.clear();
274   context->elements_.pop_back();
275 }
276 
277 // static
CharactersImpl(void * ctx,const xmlChar * ch,int len)278 void TemplateURLParsingContext::CharactersImpl(void* ctx,
279                                                const xmlChar* ch,
280                                                int len) {
281   reinterpret_cast<TemplateURLParsingContext*>(ctx)->string_ +=
282       base::UTF8ToUTF16(std::string(reinterpret_cast<const char*>(ch), len));
283 }
284 
GetTemplateURL(Profile * profile,bool show_in_default_list)285 TemplateURL* TemplateURLParsingContext::GetTemplateURL(
286     Profile* profile,
287     bool show_in_default_list) {
288   // TODO(jcampan): Support engines that use POST; see http://crbug.com/18107
289   if (method_ == TemplateURLParsingContext::POST || data_.short_name.empty() ||
290       !IsHTTPRef(data_.url()) || !IsHTTPRef(data_.suggestions_url))
291     return NULL;
292   if (suggestion_method_ == TemplateURLParsingContext::POST)
293     data_.suggestions_url.clear();
294 
295   // If the image was a data URL, use the favicon from the search URL instead.
296   // (see the TODO in EndElementImpl()).
297   GURL search_url(data_.url());
298   if (derive_image_from_url_ && data_.favicon_url.is_empty())
299     data_.favicon_url = TemplateURL::GenerateFaviconURL(search_url);
300 
301   data_.SetKeyword(TemplateURL::GenerateKeyword(search_url));
302   data_.show_in_default_list = show_in_default_list;
303 
304   // Bail if the search URL is empty or if either TemplateURLRef is invalid.
305   scoped_ptr<TemplateURL> template_url(new TemplateURL(data_));
306   scoped_ptr<SearchTermsData> search_terms_data(profile ?
307       new UIThreadSearchTermsData(profile) : new SearchTermsData());
308   if (template_url->url().empty() ||
309       !template_url->url_ref().IsValid(*search_terms_data) ||
310       (!template_url->suggestions_url().empty() &&
311        !template_url->suggestions_url_ref().IsValid(*search_terms_data))) {
312     return NULL;
313   }
314 
315   return template_url.release();
316 }
317 
318 // static
InitMapping()319 void TemplateURLParsingContext::InitMapping() {
320   kElementNameToElementTypeMap = new std::map<std::string, ElementType>;
321   (*kElementNameToElementTypeMap)[kURLElement] = URL;
322   (*kElementNameToElementTypeMap)[kParamElement] = PARAM;
323   (*kElementNameToElementTypeMap)[kShortNameElement] = SHORT_NAME;
324   (*kElementNameToElementTypeMap)[kImageElement] = IMAGE;
325   (*kElementNameToElementTypeMap)[kOpenSearchDescriptionElement] =
326       OPEN_SEARCH_DESCRIPTION;
327   (*kElementNameToElementTypeMap)[kFirefoxSearchDescriptionElement] =
328       OPEN_SEARCH_DESCRIPTION;
329   (*kElementNameToElementTypeMap)[kInputEncodingElement] = INPUT_ENCODING;
330 }
331 
ParseURL(const xmlChar ** atts)332 void TemplateURLParsingContext::ParseURL(const xmlChar** atts) {
333   if (!atts)
334     return;
335 
336   std::string template_url;
337   bool is_post = false;
338   bool is_html_url = false;
339   bool is_suggest_url = false;
340   for (; *atts; atts += 2) {
341     std::string name(XMLCharToString(*atts));
342     const xmlChar* value = atts[1];
343     if (name == kURLTypeAttribute) {
344       std::string type = XMLCharToString(value);
345       is_html_url = (type == kHTMLType);
346       is_suggest_url = (type == kSuggestionType);
347     } else if (name == kURLTemplateAttribute) {
348       template_url = XMLCharToString(value);
349     } else if (name == kParamMethodAttribute) {
350       is_post = LowerCaseEqualsASCII(XMLCharToString(value), "post");
351     }
352   }
353 
354   if (is_html_url && !template_url.empty()) {
355     data_.SetURL(template_url);
356     is_suggest_url_ = false;
357     if (is_post)
358       method_ = POST;
359   } else if (is_suggest_url) {
360     data_.suggestions_url = template_url;
361     is_suggest_url_ = true;
362     if (is_post)
363       suggestion_method_ = POST;
364   }
365 }
366 
ParseImage(const xmlChar ** atts)367 void TemplateURLParsingContext::ParseImage(const xmlChar** atts) {
368   if (!atts)
369     return;
370 
371   int width = 0;
372   int height = 0;
373   std::string type;
374   for (; *atts; atts += 2) {
375     std::string name(XMLCharToString(*atts));
376     const xmlChar* value = atts[1];
377     if (name == kImageTypeAttribute) {
378       type = XMLCharToString(value);
379     } else if (name == kImageWidthAttribute) {
380       base::StringToInt(XMLCharToString(value), &width);
381     } else if (name == kImageHeightAttribute) {
382       base::StringToInt(XMLCharToString(value), &height);
383     }
384   }
385 
386   image_is_valid_for_favicon_ = (width == gfx::kFaviconSize) &&
387       (height == gfx::kFaviconSize) &&
388       ((type == "image/x-icon") || (type == "image/vnd.microsoft.icon"));
389 }
390 
ParseParam(const xmlChar ** atts)391 void TemplateURLParsingContext::ParseParam(const xmlChar** atts) {
392   if (!atts)
393     return;
394 
395   std::string key, value;
396   for (; *atts; atts += 2) {
397     std::string name(XMLCharToString(*atts));
398     const xmlChar* val = atts[1];
399     if (name == kParamNameAttribute) {
400       key = XMLCharToString(val);
401     } else if (name == kParamValueAttribute) {
402       value = XMLCharToString(val);
403     }
404   }
405 
406   if (!key.empty() &&
407       (!parameter_filter_ || parameter_filter_->KeepParameter(key, value)))
408     extra_params_.push_back(Param(key, value));
409 }
410 
ProcessURLParams()411 void TemplateURLParsingContext::ProcessURLParams() {
412   if (!parameter_filter_ && extra_params_.empty())
413     return;
414 
415   GURL url(is_suggest_url_ ? data_.suggestions_url : data_.url());
416   if (url.is_empty())
417     return;
418 
419   // If there is a parameter filter, parse the existing URL and remove any
420   // unwanted parameter.
421   std::string new_query;
422   bool modified = false;
423   if (parameter_filter_) {
424     url::Component query = url.parsed_for_possibly_invalid_spec().query;
425     url::Component key, value;
426     const char* url_spec = url.spec().c_str();
427     while (url::ExtractQueryKeyValue(url_spec, &query, &key, &value)) {
428       std::string key_str(url_spec, key.begin, key.len);
429       std::string value_str(url_spec, value.begin, value.len);
430       if (parameter_filter_->KeepParameter(key_str, value_str)) {
431         AppendParamToQuery(key_str, value_str, &new_query);
432       } else {
433         modified = true;
434       }
435     }
436   }
437   if (!modified)
438     new_query = url.query();
439 
440   // Add the extra parameters if any.
441   if (!extra_params_.empty()) {
442     modified = true;
443     for (std::vector<Param>::const_iterator iter(extra_params_.begin());
444          iter != extra_params_.end(); ++iter)
445       AppendParamToQuery(iter->first, iter->second, &new_query);
446   }
447 
448   if (modified) {
449     GURL::Replacements repl;
450     repl.SetQueryStr(new_query);
451     url = url.ReplaceComponents(repl);
452     if (is_suggest_url_)
453       data_.suggestions_url = url.spec();
454     else if (url.is_valid())
455       data_.SetURL(url.spec());
456   }
457 }
458 
459 TemplateURLParsingContext::ElementType
GetKnownType()460     TemplateURLParsingContext::GetKnownType() {
461   if (elements_.size() == 2 && elements_[0] == OPEN_SEARCH_DESCRIPTION)
462     return elements_[1];
463   // We only expect PARAM nodes under the URL node.
464   return (elements_.size() == 3 && elements_[0] == OPEN_SEARCH_DESCRIPTION &&
465       elements_[1] == URL && elements_[2] == PARAM) ? PARAM : UNKNOWN;
466 }
467 
468 
469 // TemplateURLParser ----------------------------------------------------------
470 
471 // static
Parse(Profile * profile,bool show_in_default_list,const char * data,size_t length,TemplateURLParser::ParameterFilter * param_filter)472 TemplateURL* TemplateURLParser::Parse(
473     Profile* profile,
474     bool show_in_default_list,
475     const char* data,
476     size_t length,
477     TemplateURLParser::ParameterFilter* param_filter) {
478   // xmlSubstituteEntitiesDefault(1) makes it so that &amp; isn't mapped to
479   // &#38; . Unfortunately xmlSubstituteEntitiesDefault affects global state.
480   // If this becomes problematic we'll need to provide our own entity
481   // type for &amp;, or strip out &#38; by hand after parsing.
482   int last_sub_entities_value = xmlSubstituteEntitiesDefault(1);
483   TemplateURLParsingContext context(param_filter);
484   xmlSAXHandler sax_handler;
485   memset(&sax_handler, 0, sizeof(sax_handler));
486   sax_handler.startElement = &TemplateURLParsingContext::StartElementImpl;
487   sax_handler.endElement = &TemplateURLParsingContext::EndElementImpl;
488   sax_handler.characters = &TemplateURLParsingContext::CharactersImpl;
489   int error = xmlSAXUserParseMemory(&sax_handler, &context, data,
490                                     static_cast<int>(length));
491   xmlSubstituteEntitiesDefault(last_sub_entities_value);
492 
493   return error ? NULL : context.GetTemplateURL(profile, show_in_default_list);
494 }
495