• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include <string_view>
6 
7 #include "base/check.h"
8 #include "url/third_party/mozilla/url_parse.h"
9 #include "url/url_file.h"
10 #include "url/url_parse_internal.h"
11 
12 // Interesting IE file:isms...
13 //
14 //  INPUT                      OUTPUT
15 //  =========================  ==============================
16 //  file:/foo/bar              file:///foo/bar
17 //      The result here seems totally invalid!?!? This isn't UNC.
18 //
19 //  file:/
20 //  file:// or any other number of slashes
21 //      IE6 doesn't do anything at all if you click on this link. No error:
22 //      nothing. IE6's history system seems to always color this link, so I'm
23 //      guessing that it maps internally to the empty URL.
24 //
25 //  C:\                        file:///C:/
26 //      When on a file: URL source page, this link will work. When over HTTP,
27 //      the file: URL will appear in the status bar but the link will not work
28 //      (security restriction for all file URLs).
29 //
30 //  file:foo/                  file:foo/     (invalid?!?!?)
31 //  file:/foo/                 file:///foo/  (invalid?!?!?)
32 //  file://foo/                file://foo/   (UNC to server "foo")
33 //  file:///foo/               file:///foo/  (invalid, seems to be a file)
34 //  file:////foo/              file://foo/   (UNC to server "foo")
35 //      Any more than four slashes is also treated as UNC.
36 //
37 //  file:C:/                   file://C:/
38 //  file:/C:/                  file://C:/
//      The number of slashes after "file:" doesn't matter if the thing
//      following it looks like an absolute drive path. Also, slashes and
//      backslashes are equally valid here.
42 
43 namespace url {
44 
45 namespace {
46 
// Scans `spec` forward from `begin_index` (inclusive) and returns the index of
// the first forward slash or backslash encountered. If none is found the scan
// stops at `spec.size()` and that value is returned; a `begin_index` that is
// already at or past the end is returned unchanged.
template <typename CharT>
size_t FindNextSlash(std::basic_string_view<CharT> spec, size_t begin_index) {
  size_t pos = begin_index;
  for (; pos < spec.size(); ++pos) {
    if (IsSlashOrBackslash(spec[pos])) {
      break;
    }
  }
  return pos;
}
57 
58 // A subcomponent of DoParseFileURL, the input of this function should be a UNC
59 // path name, with the index of the first character after the slashes following
60 // the scheme given in `after_slashes`. This will initialize the host, path,
61 // query, and ref, and leave the other output components untouched
62 // (DoParseFileURL handles these for us).
63 template <typename CharT>
DoParseUNC(std::basic_string_view<CharT> url,size_t after_slashes,Parsed * parsed)64 void DoParseUNC(std::basic_string_view<CharT> url,
65                 size_t after_slashes,
66                 Parsed* parsed) {
67   int url_len = base::checked_cast<int>(url.size());
68   // The cast is safe because `FindNextSlash` will never return anything longer
69   // than `url_len`.
70   int next_slash = static_cast<int>(FindNextSlash(url, after_slashes));
71 
72   // Everything up until that first slash we found (or end of string) is the
73   // host name, which will end up being the UNC host. For example,
74   // "file://foo/bar.txt" will get a server name of "foo" and a path of "/bar".
75   // Later, on Windows, this should be treated as the filename "\\foo\bar.txt"
76   // in proper UNC notation.
77   if (after_slashes < static_cast<size_t>(next_slash)) {
78     parsed->host = MakeRange(after_slashes, next_slash);
79   } else {
80     parsed->host.reset();
81   }
82   if (next_slash < url_len) {
83     ParsePathInternal(url.data(), MakeRange(next_slash, url_len), &parsed->path,
84                       &parsed->query, &parsed->ref);
85   } else {
86     parsed->path.reset();
87   }
88 }
89 
90 // A subcomponent of DoParseFileURL, the input should be a local file, with the
91 // beginning of the path indicated by the index in `path_begin`. This will
92 // initialize the host, path, query, and ref, and leave the other output
93 // components untouched (DoParseFileURL handles these for us).
94 template <typename CharT>
DoParseLocalFile(std::basic_string_view<CharT> url,int path_begin,Parsed * parsed)95 void DoParseLocalFile(std::basic_string_view<CharT> url,
96                       int path_begin,
97                       Parsed* parsed) {
98   parsed->host.reset();
99   ParsePathInternal(url.data(),
100                     MakeRange(path_begin, base::checked_cast<int>(url.size())),
101                     &parsed->path, &parsed->query, &parsed->ref);
102 }
103 
104 // Backend for the external functions that operates on either char type.
105 // Handles cases where there is a scheme, but also when handed the first
106 // character following the "file:" at the beginning of the spec. If so,
107 // this is usually a slash, but needn't be; we allow paths like "file:c:\foo".
108 template <typename CharT>
DoParseFileURL(std::basic_string_view<CharT> url)109 Parsed DoParseFileURL(std::basic_string_view<CharT> url) {
110   // Strip leading & trailing spaces and control characters.
111   int begin = 0;
112   int url_len = base::checked_cast<int>(url.size());
113   TrimURL(url.data(), &begin, &url_len);
114 
115   // Find the scheme, if any.
116   int num_slashes = CountConsecutiveSlashes(url.data(), begin, url_len);
117   int after_scheme;
118   size_t after_slashes;
119   Parsed parsed;
120 #ifdef WIN32
121   // See how many slashes there are. We want to handle cases like UNC but also
122   // "/c:/foo". This is when there is no scheme, so we can allow pages to do
123   // links like "c:/foo/bar" or "//foo/bar". This is also called by the
124   // relative URL resolver when it determines there is an absolute URL, which
125   // may give us input like "/c:/foo".
126   after_slashes = begin + num_slashes;
127   if (DoesBeginWindowsDriveSpec(url.data(), after_slashes, url_len)) {
128     // Windows path, don't try to extract the scheme (for example, "c:\foo").
129     after_scheme = after_slashes;
130   } else if (DoesBeginUNCPath(url.data(), begin, url_len, false)) {
131     // Windows UNC path: don't try to extract the scheme, but keep the slashes.
132     after_scheme = begin;
133   } else
134 #endif
135   {
136     // ExtractScheme doesn't understand the possibility of filenames with
137     // colons in them, in which case it returns the entire spec up to the
138     // colon as the scheme. So handle /foo.c:5 as a file but foo.c:5 as
139     // the foo.c: scheme.
140     if (!num_slashes &&
141         ExtractScheme(&url[begin], url_len - begin, &parsed.scheme)) {
142       // Offset the results since we gave ExtractScheme a substring.
143       parsed.scheme.begin += begin;
144       after_scheme = parsed.scheme.end() + 1;
145     } else {
146       // No scheme found, remember that.
147       parsed.scheme.reset();
148       after_scheme = begin;
149     }
150   }
151 
152   // Handle empty specs ones that contain only whitespace or control chars,
153   // or that are just the scheme (for example "file:").
154   if (after_scheme == url_len) {
155     return parsed;
156   }
157 
158   num_slashes = CountConsecutiveSlashes(url.data(), after_scheme, url_len);
159   after_slashes = after_scheme + num_slashes;
160 #ifdef WIN32
161   // Check whether the input is a drive again. We checked above for windows
162   // drive specs, but that's only at the very beginning to see if we have a
163   // scheme at all. This test will be duplicated in that case, but will
164   // additionally handle all cases with a real scheme such as "file:///C:/".
165   if (!DoesBeginWindowsDriveSpec(url.data(), after_slashes, url_len) &&
166       num_slashes != 3) {
167     // Anything not beginning with a drive spec ("c:\") on Windows is treated
168     // as UNC, with the exception of three slashes which always means a file.
169     // Even IE7 treats file:///foo/bar as "/foo/bar", which then fails.
170     DoParseUNC(url.substr(0, url_len), after_slashes, &parsed);
171     return parsed;
172   }
173 #else
174   // file: URL with exactly 2 slashes is considered to have a host component.
175   if (num_slashes == 2) {
176     DoParseUNC(url.substr(0, url_len), after_slashes, &parsed);
177     return parsed;
178   }
179 #endif  // WIN32
180 
181   // Easy and common case, the full path immediately follows the scheme
182   // (modulo slashes), as in "file://c:/foo". Just treat everything from
183   // there to the end as the path. Empty hosts have 0 length instead of -1.
184   // We include the last slash as part of the path if there is one.
185   DoParseLocalFile(
186       url.substr(0, url_len),
187       num_slashes > 0 ? after_scheme + num_slashes - 1 : after_scheme, &parsed);
188   return parsed;
189 }
190 
191 }  // namespace
192 
ParseFileURL(std::string_view url)193 Parsed ParseFileURL(std::string_view url) {
194   return DoParseFileURL(url);
195 }
196 
ParseFileURL(std::u16string_view url)197 Parsed ParseFileURL(std::u16string_view url) {
198   return DoParseFileURL(url);
199 }
200 
201 }  // namespace url
202