• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Canonicalizer functions for working with and resolving relative URLs.
6 
7 #include "base/logging.h"
8 #include "url/url_canon.h"
9 #include "url/url_canon_internal.h"
10 #include "url/url_constants.h"
11 #include "url/url_file.h"
12 #include "url/url_parse_internal.h"
13 #include "url/url_util_internal.h"
14 
15 namespace url {
16 
17 namespace {
18 
19 // Firefox does a case-sensitive compare (which is probably wrong--Mozilla bug
20 // 379034), whereas IE is case-insensetive.
21 //
22 // We choose to be more permissive like IE. We don't need to worry about
23 // unescaping or anything here: neither IE or Firefox allow this. We also
24 // don't have to worry about invalid scheme characters since we are comparing
25 // against the canonical scheme of the base.
26 //
27 // The base URL should always be canonical, therefore is ASCII.
28 template<typename CHAR>
AreSchemesEqual(const char * base,const Component & base_scheme,const CHAR * cmp,const Component & cmp_scheme)29 bool AreSchemesEqual(const char* base,
30                      const Component& base_scheme,
31                      const CHAR* cmp,
32                      const Component& cmp_scheme) {
33   if (base_scheme.len != cmp_scheme.len)
34     return false;
35   for (int i = 0; i < base_scheme.len; i++) {
36     // We assume the base is already canonical, so we don't have to
37     // canonicalize it.
38     if (CanonicalSchemeChar(cmp[cmp_scheme.begin + i]) !=
39         base[base_scheme.begin + i])
40       return false;
41   }
42   return true;
43 }
44 
45 #ifdef WIN32
46 
// Windows paths may also be written as "/C:/" so that URL paths consistently
// begin with a slash. This behaves like DoesBeginWindowsDriveSpec, except
// that the character at |start_offset| must be a slash and the drive spec is
// looked for immediately after it.
template<typename CHAR>
bool DoesBeginSlashWindowsDriveSpec(const CHAR* spec, int start_offset,
                                    int spec_len) {
  // There has to be at least one character available for the leading slash.
  const bool has_leading_slash =
      start_offset < spec_len && IsURLSlash(spec[start_offset]);
  if (!has_leading_slash)
    return false;
  return DoesBeginWindowsDriveSpec(spec, start_offset + 1, spec_len);
}
59 
60 #endif  // WIN32
61 
62 // See IsRelativeURL in the header file for usage.
63 template<typename CHAR>
DoIsRelativeURL(const char * base,const Parsed & base_parsed,const CHAR * url,int url_len,bool is_base_hierarchical,bool * is_relative,Component * relative_component)64 bool DoIsRelativeURL(const char* base,
65                      const Parsed& base_parsed,
66                      const CHAR* url,
67                      int url_len,
68                      bool is_base_hierarchical,
69                      bool* is_relative,
70                      Component* relative_component) {
71   *is_relative = false;  // So we can default later to not relative.
72 
73   // Trim whitespace and construct a new range for the substring.
74   int begin = 0;
75   TrimURL(url, &begin, &url_len);
76   if (begin >= url_len) {
77     // Empty URLs are relative, but do nothing.
78     *relative_component = Component(begin, 0);
79     *is_relative = true;
80     return true;
81   }
82 
83 #ifdef WIN32
84   // We special case paths like "C:\foo" so they can link directly to the
85   // file on Windows (IE compatability). The security domain stuff should
86   // prevent a link like this from actually being followed if its on a
87   // web page.
88   //
89   // We treat "C:/foo" as an absolute URL. We can go ahead and treat "/c:/"
90   // as relative, as this will just replace the path when the base scheme
91   // is a file and the answer will still be correct.
92   //
93   // We require strict backslashes when detecting UNC since two forward
94   // shashes should be treated a a relative URL with a hostname.
95   if (DoesBeginWindowsDriveSpec(url, begin, url_len) ||
96       DoesBeginUNCPath(url, begin, url_len, true))
97     return true;
98 #endif  // WIN32
99 
100   // See if we've got a scheme, if not, we know this is a relative URL.
101   // BUT: Just because we have a scheme, doesn't make it absolute.
102   // "http:foo.html" is a relative URL with path "foo.html". If the scheme is
103   // empty, we treat it as relative (":foo") like IE does.
104   Component scheme;
105   const bool scheme_is_empty =
106       !ExtractScheme(url, url_len, &scheme) || scheme.len == 0;
107   if (scheme_is_empty) {
108     if (url[begin] == '#') {
109       // |url| is a bare fragement (e.g. "#foo"). This can be resolved against
110       // any base. Fall-through.
111     } else if (!is_base_hierarchical) {
112       // Don't allow relative URLs if the base scheme doesn't support it.
113       return false;
114     }
115 
116     *relative_component = MakeRange(begin, url_len);
117     *is_relative = true;
118     return true;
119   }
120 
121   // If the scheme isn't valid, then it's relative.
122   int scheme_end = scheme.end();
123   for (int i = scheme.begin; i < scheme_end; i++) {
124     if (!CanonicalSchemeChar(url[i])) {
125       if (!is_base_hierarchical) {
126         // Don't allow relative URLs if the base scheme doesn't support it.
127         return false;
128       }
129       *relative_component = MakeRange(begin, url_len);
130       *is_relative = true;
131       return true;
132     }
133   }
134 
135   // If the scheme is not the same, then we can't count it as relative.
136   if (!AreSchemesEqual(base, base_parsed.scheme, url, scheme))
137     return true;
138 
139   // When the scheme that they both share is not hierarchical, treat the
140   // incoming scheme as absolute (this way with the base of "data:foo",
141   // "data:bar" will be reported as absolute.
142   if (!is_base_hierarchical)
143     return true;
144 
145   int colon_offset = scheme.end();
146 
147   // If it's a filesystem URL, the only valid way to make it relative is not to
148   // supply a scheme.  There's no equivalent to e.g. http:index.html.
149   if (CompareSchemeComponent(url, scheme, kFileSystemScheme))
150     return true;
151 
152   // ExtractScheme guarantees that the colon immediately follows what it
153   // considers to be the scheme. CountConsecutiveSlashes will handle the
154   // case where the begin offset is the end of the input.
155   int num_slashes = CountConsecutiveSlashes(url, colon_offset + 1, url_len);
156 
157   if (num_slashes == 0 || num_slashes == 1) {
158     // No slashes means it's a relative path like "http:foo.html". One slash
159     // is an absolute path. "http:/home/foo.html"
160     *is_relative = true;
161     *relative_component = MakeRange(colon_offset + 1, url_len);
162     return true;
163   }
164 
165   // Two or more slashes after the scheme we treat as absolute.
166   return true;
167 }
168 
169 // Copies all characters in the range [begin, end) of |spec| to the output,
170 // up until and including the last slash. There should be a slash in the
171 // range, if not, nothing will be copied.
172 //
173 // The input is assumed to be canonical, so we search only for exact slashes
174 // and not backslashes as well. We also know that it's ASCII.
CopyToLastSlash(const char * spec,int begin,int end,CanonOutput * output)175 void CopyToLastSlash(const char* spec,
176                      int begin,
177                      int end,
178                      CanonOutput* output) {
179   // Find the last slash.
180   int last_slash = -1;
181   for (int i = end - 1; i >= begin; i--) {
182     if (spec[i] == '/') {
183       last_slash = i;
184       break;
185     }
186   }
187   if (last_slash < 0)
188     return;  // No slash.
189 
190   // Copy.
191   for (int i = begin; i <= last_slash; i++)
192     output->push_back(spec[i]);
193 }
194 
195 // Copies a single component from the source to the output. This is used
196 // when resolving relative URLs and a given component is unchanged. Since the
197 // source should already be canonical, we don't have to do anything special,
198 // and the input is ASCII.
CopyOneComponent(const char * source,const Component & source_component,CanonOutput * output,Component * output_component)199 void CopyOneComponent(const char* source,
200                       const Component& source_component,
201                       CanonOutput* output,
202                       Component* output_component) {
203   if (source_component.len < 0) {
204     // This component is not present.
205     *output_component = Component();
206     return;
207   }
208 
209   output_component->begin = output->length();
210   int source_end = source_component.end();
211   for (int i = source_component.begin; i < source_end; i++)
212     output->push_back(source[i]);
213   output_component->len = output->length() - output_component->begin;
214 }
215 
216 #ifdef WIN32
217 
218 // Called on Windows when the base URL is a file URL, this will copy the "C:"
219 // to the output, if there is a drive letter and if that drive letter is not
220 // being overridden by the relative URL. Otherwise, do nothing.
221 //
222 // It will return the index of the beginning of the next character in the
223 // base to be processed: if there is a "C:", the slash after it, or if
224 // there is no drive letter, the slash at the beginning of the path, or
225 // the end of the base. This can be used as the starting offset for further
226 // path processing.
227 template<typename CHAR>
CopyBaseDriveSpecIfNecessary(const char * base_url,int base_path_begin,int base_path_end,const CHAR * relative_url,int path_start,int relative_url_len,CanonOutput * output)228 int CopyBaseDriveSpecIfNecessary(const char* base_url,
229                                  int base_path_begin,
230                                  int base_path_end,
231                                  const CHAR* relative_url,
232                                  int path_start,
233                                  int relative_url_len,
234                                  CanonOutput* output) {
235   if (base_path_begin >= base_path_end)
236     return base_path_begin;  // No path.
237 
238   // If the relative begins with a drive spec, don't do anything. The existing
239   // drive spec in the base will be replaced.
240   if (DoesBeginWindowsDriveSpec(relative_url, path_start, relative_url_len)) {
241     return base_path_begin;  // Relative URL path is "C:/foo"
242   }
243 
244   // The path should begin with a slash (as all canonical paths do). We check
245   // if it is followed by a drive letter and copy it.
246   if (DoesBeginSlashWindowsDriveSpec(base_url,
247                                      base_path_begin,
248                                      base_path_end)) {
249     // Copy the two-character drive spec to the output. It will now look like
250     // "file:///C:" so the rest of it can be treated like a standard path.
251     output->push_back('/');
252     output->push_back(base_url[base_path_begin + 1]);
253     output->push_back(base_url[base_path_begin + 2]);
254     return base_path_begin + 3;
255   }
256 
257   return base_path_begin;
258 }
259 
260 #endif  // WIN32
261 
262 // A subroutine of DoResolveRelativeURL, this resolves the URL knowning that
263 // the input is a relative path or less (qyuery or ref).
264 template<typename CHAR>
DoResolveRelativePath(const char * base_url,const Parsed & base_parsed,bool base_is_file,const CHAR * relative_url,const Component & relative_component,CharsetConverter * query_converter,CanonOutput * output,Parsed * out_parsed)265 bool DoResolveRelativePath(const char* base_url,
266                            const Parsed& base_parsed,
267                            bool base_is_file,
268                            const CHAR* relative_url,
269                            const Component& relative_component,
270                            CharsetConverter* query_converter,
271                            CanonOutput* output,
272                            Parsed* out_parsed) {
273   bool success = true;
274 
275   // We know the authority section didn't change, copy it to the output. We
276   // also know we have a path so can copy up to there.
277   Component path, query, ref;
278   ParsePathInternal(relative_url, relative_component, &path, &query, &ref);
279   // Canonical URLs always have a path, so we can use that offset.
280   output->Append(base_url, base_parsed.path.begin);
281 
282   if (path.len > 0) {
283     // The path is replaced or modified.
284     int true_path_begin = output->length();
285 
286     // For file: URLs on Windows, we don't want to treat the drive letter and
287     // colon as part of the path for relative file resolution when the
288     // incoming URL does not provide a drive spec. We save the true path
289     // beginning so we can fix it up after we are done.
290     int base_path_begin = base_parsed.path.begin;
291 #ifdef WIN32
292     if (base_is_file) {
293       base_path_begin = CopyBaseDriveSpecIfNecessary(
294           base_url, base_parsed.path.begin, base_parsed.path.end(),
295           relative_url, relative_component.begin, relative_component.end(),
296           output);
297       // Now the output looks like either "file://" or "file:///C:"
298       // and we can start appending the rest of the path. |base_path_begin|
299       // points to the character in the base that comes next.
300     }
301 #endif  // WIN32
302 
303     if (IsURLSlash(relative_url[path.begin])) {
304       // Easy case: the path is an absolute path on the server, so we can
305       // just replace everything from the path on with the new versions.
306       // Since the input should be canonical hierarchical URL, we should
307       // always have a path.
308       success &= CanonicalizePath(relative_url, path,
309                                   output, &out_parsed->path);
310     } else {
311       // Relative path, replace the query, and reference. We take the
312       // original path with the file part stripped, and append the new path.
313       // The canonicalizer will take care of resolving ".." and "."
314       int path_begin = output->length();
315       CopyToLastSlash(base_url, base_path_begin, base_parsed.path.end(),
316                       output);
317       success &= CanonicalizePartialPath(relative_url, path, path_begin,
318                                          output);
319       out_parsed->path = MakeRange(path_begin, output->length());
320 
321       // Copy the rest of the stuff after the path from the relative path.
322     }
323 
324     // Finish with the query and reference part (these can't fail).
325     CanonicalizeQuery(relative_url, query, query_converter,
326                       output, &out_parsed->query);
327     CanonicalizeRef(relative_url, ref, output, &out_parsed->ref);
328 
329     // Fix the path beginning to add back the "C:" we may have written above.
330     out_parsed->path = MakeRange(true_path_begin, out_parsed->path.end());
331     return success;
332   }
333 
334   // If we get here, the path is unchanged: copy to output.
335   CopyOneComponent(base_url, base_parsed.path, output, &out_parsed->path);
336 
337   if (query.is_valid()) {
338     // Just the query specified, replace the query and reference (ignore
339     // failures for refs)
340     CanonicalizeQuery(relative_url, query, query_converter,
341                       output, &out_parsed->query);
342     CanonicalizeRef(relative_url, ref, output, &out_parsed->ref);
343     return success;
344   }
345 
346   // If we get here, the query is unchanged: copy to output. Note that the
347   // range of the query parameter doesn't include the question mark, so we
348   // have to add it manually if there is a component.
349   if (base_parsed.query.is_valid())
350     output->push_back('?');
351   CopyOneComponent(base_url, base_parsed.query, output, &out_parsed->query);
352 
353   if (ref.is_valid()) {
354     // Just the reference specified: replace it (ignoring failures).
355     CanonicalizeRef(relative_url, ref, output, &out_parsed->ref);
356     return success;
357   }
358 
359   // We should always have something to do in this function, the caller checks
360   // that some component is being replaced.
361   DCHECK(false) << "Not reached";
362   return success;
363 }
364 
365 // Resolves a relative URL that contains a host. Typically, these will
366 // be of the form "//www.google.com/foo/bar?baz#ref" and the only thing which
367 // should be kept from the original URL is the scheme.
368 template<typename CHAR>
DoResolveRelativeHost(const char * base_url,const Parsed & base_parsed,const CHAR * relative_url,const Component & relative_component,CharsetConverter * query_converter,CanonOutput * output,Parsed * out_parsed)369 bool DoResolveRelativeHost(const char* base_url,
370                            const Parsed& base_parsed,
371                            const CHAR* relative_url,
372                            const Component& relative_component,
373                            CharsetConverter* query_converter,
374                            CanonOutput* output,
375                            Parsed* out_parsed) {
376   // Parse the relative URL, just like we would for anything following a
377   // scheme.
378   Parsed relative_parsed;  // Everything but the scheme is valid.
379   ParseAfterScheme(relative_url, relative_component.end(),
380                    relative_component.begin, &relative_parsed);
381 
382   // Now we can just use the replacement function to replace all the necessary
383   // parts of the old URL with the new one.
384   Replacements<CHAR> replacements;
385   replacements.SetUsername(relative_url, relative_parsed.username);
386   replacements.SetPassword(relative_url, relative_parsed.password);
387   replacements.SetHost(relative_url, relative_parsed.host);
388   replacements.SetPort(relative_url, relative_parsed.port);
389   replacements.SetPath(relative_url, relative_parsed.path);
390   replacements.SetQuery(relative_url, relative_parsed.query);
391   replacements.SetRef(relative_url, relative_parsed.ref);
392 
393   return ReplaceStandardURL(base_url, base_parsed, replacements,
394                             query_converter, output, out_parsed);
395 }
396 
397 // Resolves a relative URL that happens to be an absolute file path.  Examples
398 // include: "//hostname/path", "/c:/foo", and "//hostname/c:/foo".
399 template<typename CHAR>
DoResolveAbsoluteFile(const CHAR * relative_url,const Component & relative_component,CharsetConverter * query_converter,CanonOutput * output,Parsed * out_parsed)400 bool DoResolveAbsoluteFile(const CHAR* relative_url,
401                            const Component& relative_component,
402                            CharsetConverter* query_converter,
403                            CanonOutput* output,
404                            Parsed* out_parsed) {
405   // Parse the file URL. The file URl parsing function uses the same logic
406   // as we do for determining if the file is absolute, in which case it will
407   // not bother to look for a scheme.
408   Parsed relative_parsed;
409   ParseFileURL(&relative_url[relative_component.begin], relative_component.len,
410                &relative_parsed);
411 
412   return CanonicalizeFileURL(&relative_url[relative_component.begin],
413                              relative_component.len, relative_parsed,
414                              query_converter, output, out_parsed);
415 }
416 
417 // TODO(brettw) treat two slashes as root like Mozilla for FTP?
418 template<typename CHAR>
DoResolveRelativeURL(const char * base_url,const Parsed & base_parsed,bool base_is_file,const CHAR * relative_url,const Component & relative_component,CharsetConverter * query_converter,CanonOutput * output,Parsed * out_parsed)419 bool DoResolveRelativeURL(const char* base_url,
420                           const Parsed& base_parsed,
421                           bool base_is_file,
422                           const CHAR* relative_url,
423                           const Component& relative_component,
424                           CharsetConverter* query_converter,
425                           CanonOutput* output,
426                           Parsed* out_parsed) {
427   // Starting point for our output parsed. We'll fix what we change.
428   *out_parsed = base_parsed;
429 
430   // Sanity check: the input should have a host or we'll break badly below.
431   // We can only resolve relative URLs with base URLs that have hosts and
432   // paths (even the default path of "/" is OK).
433   //
434   // We allow hosts with no length so we can handle file URLs, for example.
435   if (base_parsed.path.len <= 0) {
436     // On error, return the input (resolving a relative URL on a non-relative
437     // base = the base).
438     int base_len = base_parsed.Length();
439     for (int i = 0; i < base_len; i++)
440       output->push_back(base_url[i]);
441     return false;
442   }
443 
444   if (relative_component.len <= 0) {
445     // Empty relative URL, leave unchanged, only removing the ref component.
446     int base_len = base_parsed.Length();
447     base_len -= base_parsed.ref.len + 1;
448     out_parsed->ref.reset();
449     output->Append(base_url, base_len);
450     return true;
451   }
452 
453   int num_slashes = CountConsecutiveSlashes(
454       relative_url, relative_component.begin, relative_component.end());
455 
456 #ifdef WIN32
457   // On Windows, two slashes for a file path (regardless of which direction
458   // they are) means that it's UNC. Two backslashes on any base scheme mean
459   // that it's an absolute UNC path (we use the base_is_file flag to control
460   // how strict the UNC finder is).
461   //
462   // We also allow Windows absolute drive specs on any scheme (for example
463   // "c:\foo") like IE does. There must be no preceeding slashes in this
464   // case (we reject anything like "/c:/foo") because that should be treated
465   // as a path. For file URLs, we allow any number of slashes since that would
466   // be setting the path.
467   //
468   // This assumes the absolute path resolver handles absolute URLs like this
469   // properly. DoCanonicalize does this.
470   int after_slashes = relative_component.begin + num_slashes;
471   if (DoesBeginUNCPath(relative_url, relative_component.begin,
472                        relative_component.end(), !base_is_file) ||
473       ((num_slashes == 0 || base_is_file) &&
474        DoesBeginWindowsDriveSpec(
475            relative_url, after_slashes, relative_component.end()))) {
476     return DoResolveAbsoluteFile(relative_url, relative_component,
477                                  query_converter, output, out_parsed);
478   }
479 #else
480   // Other platforms need explicit handling for file: URLs with multiple
481   // slashes because the generic scheme parsing always extracts a host, but a
482   // file: URL only has a host if it has exactly 2 slashes. Even if it does
483   // have a host, we want to use the special host detection logic for file
484   // URLs provided by DoResolveAbsoluteFile(), as opposed to the generic host
485   // detection logic, for consistency with parsing file URLs from scratch.
486   // This also handles the special case where the URL is only slashes,
487   // since that doesn't have a host part either.
488   if (base_is_file &&
489       (num_slashes >= 2 || num_slashes == relative_component.len)) {
490     return DoResolveAbsoluteFile(relative_url, relative_component,
491                                  query_converter, output, out_parsed);
492   }
493 #endif
494 
495   // Any other double-slashes mean that this is relative to the scheme.
496   if (num_slashes >= 2) {
497     return DoResolveRelativeHost(base_url, base_parsed,
498                                  relative_url, relative_component,
499                                  query_converter, output, out_parsed);
500   }
501 
502   // When we get here, we know that the relative URL is on the same host.
503   return DoResolveRelativePath(base_url, base_parsed, base_is_file,
504                                relative_url, relative_component,
505                                query_converter, output, out_parsed);
506 }
507 
508 }  // namespace
509 
IsRelativeURL(const char * base,const Parsed & base_parsed,const char * fragment,int fragment_len,bool is_base_hierarchical,bool * is_relative,Component * relative_component)510 bool IsRelativeURL(const char* base,
511                    const Parsed& base_parsed,
512                    const char* fragment,
513                    int fragment_len,
514                    bool is_base_hierarchical,
515                    bool* is_relative,
516                    Component* relative_component) {
517   return DoIsRelativeURL<char>(
518       base, base_parsed, fragment, fragment_len, is_base_hierarchical,
519       is_relative, relative_component);
520 }
521 
IsRelativeURL(const char * base,const Parsed & base_parsed,const base::char16 * fragment,int fragment_len,bool is_base_hierarchical,bool * is_relative,Component * relative_component)522 bool IsRelativeURL(const char* base,
523                    const Parsed& base_parsed,
524                    const base::char16* fragment,
525                    int fragment_len,
526                    bool is_base_hierarchical,
527                    bool* is_relative,
528                    Component* relative_component) {
529   return DoIsRelativeURL<base::char16>(
530       base, base_parsed, fragment, fragment_len, is_base_hierarchical,
531       is_relative, relative_component);
532 }
533 
ResolveRelativeURL(const char * base_url,const Parsed & base_parsed,bool base_is_file,const char * relative_url,const Component & relative_component,CharsetConverter * query_converter,CanonOutput * output,Parsed * out_parsed)534 bool ResolveRelativeURL(const char* base_url,
535                         const Parsed& base_parsed,
536                         bool base_is_file,
537                         const char* relative_url,
538                         const Component& relative_component,
539                         CharsetConverter* query_converter,
540                         CanonOutput* output,
541                         Parsed* out_parsed) {
542   return DoResolveRelativeURL<char>(
543       base_url, base_parsed, base_is_file, relative_url,
544       relative_component, query_converter, output, out_parsed);
545 }
546 
ResolveRelativeURL(const char * base_url,const Parsed & base_parsed,bool base_is_file,const base::char16 * relative_url,const Component & relative_component,CharsetConverter * query_converter,CanonOutput * output,Parsed * out_parsed)547 bool ResolveRelativeURL(const char* base_url,
548                         const Parsed& base_parsed,
549                         bool base_is_file,
550                         const base::char16* relative_url,
551                         const Component& relative_component,
552                         CharsetConverter* query_converter,
553                         CanonOutput* output,
554                         Parsed* out_parsed) {
555   return DoResolveRelativeURL<base::char16>(
556       base_url, base_parsed, base_is_file, relative_url,
557       relative_component, query_converter, output, out_parsed);
558 }
559 
560 }  // namespace url
561