1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include <string_view>
6
7 #include "base/check.h"
8 #include "url/third_party/mozilla/url_parse.h"
9 #include "url/url_file.h"
10 #include "url/url_parse_internal.h"
11
// Interesting IE file:isms...
//
//  INPUT                      OUTPUT
//  =========================  ==============================
//  file:/foo/bar              file:///foo/bar
//      The result here seems totally invalid!?!? This isn't UNC.
//
//  file:/
//  file:// or any other number of slashes
//      IE6 doesn't do anything at all if you click on this link. No error:
//      nothing. IE6's history system seems to always color this link, so I'm
//      guessing that it maps internally to the empty URL.
//
//  C:\                        file:///C:/
//      When on a file: URL source page, this link will work. When over HTTP,
//      the file: URL will appear in the status bar but the link will not work
//      (security restriction for all file URLs).
//
//  file:foo/                  file:foo/     (invalid?!?!?)
//  file:/foo/                 file:///foo/  (invalid?!?!?)
//  file://foo/                file://foo/   (UNC to server "foo")
//  file:///foo/               file:///foo/  (invalid, seems to be a file)
//  file:////foo/              file://foo/   (UNC to server "foo")
//      Any more than four slashes is also treated as UNC.
//
//  file:C:/                   file://C:/
//  file:/C:/                  file://C:/
//      The number of slashes after "file:" doesn't matter if the thing
//      following it looks like an absolute drive path. Also, slashes and
//      backslashes are equally valid here.
42
43 namespace url {
44
45 namespace {
46
// Scans `spec` forward, starting at (and including) `begin_index`, and returns
// the index of the first character for which IsSlashOrBackslash() is true.
// When no such character exists the scan runs off the end of the input and
// `spec.size()` is returned. No scanning happens if `begin_index` is already
// at or past the end; in that case `begin_index` is returned unchanged.
template <typename CharT>
size_t FindNextSlash(std::basic_string_view<CharT> spec, size_t begin_index) {
  size_t pos = begin_index;
  for (; pos < spec.size(); ++pos) {
    if (IsSlashOrBackslash(spec[pos])) {
      break;
    }
  }
  return pos;
}
57
58 // A subcomponent of DoParseFileURL, the input of this function should be a UNC
59 // path name, with the index of the first character after the slashes following
60 // the scheme given in `after_slashes`. This will initialize the host, path,
61 // query, and ref, and leave the other output components untouched
62 // (DoParseFileURL handles these for us).
63 template <typename CharT>
DoParseUNC(std::basic_string_view<CharT> url,size_t after_slashes,Parsed * parsed)64 void DoParseUNC(std::basic_string_view<CharT> url,
65 size_t after_slashes,
66 Parsed* parsed) {
67 int url_len = base::checked_cast<int>(url.size());
68 // The cast is safe because `FindNextSlash` will never return anything longer
69 // than `url_len`.
70 int next_slash = static_cast<int>(FindNextSlash(url, after_slashes));
71
72 // Everything up until that first slash we found (or end of string) is the
73 // host name, which will end up being the UNC host. For example,
74 // "file://foo/bar.txt" will get a server name of "foo" and a path of "/bar".
75 // Later, on Windows, this should be treated as the filename "\\foo\bar.txt"
76 // in proper UNC notation.
77 if (after_slashes < static_cast<size_t>(next_slash)) {
78 parsed->host = MakeRange(after_slashes, next_slash);
79 } else {
80 parsed->host.reset();
81 }
82 if (next_slash < url_len) {
83 ParsePathInternal(url.data(), MakeRange(next_slash, url_len), &parsed->path,
84 &parsed->query, &parsed->ref);
85 } else {
86 parsed->path.reset();
87 }
88 }
89
90 // A subcomponent of DoParseFileURL, the input should be a local file, with the
91 // beginning of the path indicated by the index in `path_begin`. This will
92 // initialize the host, path, query, and ref, and leave the other output
93 // components untouched (DoParseFileURL handles these for us).
94 template <typename CharT>
DoParseLocalFile(std::basic_string_view<CharT> url,int path_begin,Parsed * parsed)95 void DoParseLocalFile(std::basic_string_view<CharT> url,
96 int path_begin,
97 Parsed* parsed) {
98 parsed->host.reset();
99 ParsePathInternal(url.data(),
100 MakeRange(path_begin, base::checked_cast<int>(url.size())),
101 &parsed->path, &parsed->query, &parsed->ref);
102 }
103
104 // Backend for the external functions that operates on either char type.
105 // Handles cases where there is a scheme, but also when handed the first
106 // character following the "file:" at the beginning of the spec. If so,
107 // this is usually a slash, but needn't be; we allow paths like "file:c:\foo".
108 template <typename CharT>
DoParseFileURL(std::basic_string_view<CharT> url)109 Parsed DoParseFileURL(std::basic_string_view<CharT> url) {
110 // Strip leading & trailing spaces and control characters.
111 int begin = 0;
112 int url_len = base::checked_cast<int>(url.size());
113 TrimURL(url.data(), &begin, &url_len);
114
115 // Find the scheme, if any.
116 int num_slashes = CountConsecutiveSlashes(url.data(), begin, url_len);
117 int after_scheme;
118 size_t after_slashes;
119 Parsed parsed;
120 #ifdef WIN32
121 // See how many slashes there are. We want to handle cases like UNC but also
122 // "/c:/foo". This is when there is no scheme, so we can allow pages to do
123 // links like "c:/foo/bar" or "//foo/bar". This is also called by the
124 // relative URL resolver when it determines there is an absolute URL, which
125 // may give us input like "/c:/foo".
126 after_slashes = begin + num_slashes;
127 if (DoesBeginWindowsDriveSpec(url.data(), after_slashes, url_len)) {
128 // Windows path, don't try to extract the scheme (for example, "c:\foo").
129 after_scheme = after_slashes;
130 } else if (DoesBeginUNCPath(url.data(), begin, url_len, false)) {
131 // Windows UNC path: don't try to extract the scheme, but keep the slashes.
132 after_scheme = begin;
133 } else
134 #endif
135 {
136 // ExtractScheme doesn't understand the possibility of filenames with
137 // colons in them, in which case it returns the entire spec up to the
138 // colon as the scheme. So handle /foo.c:5 as a file but foo.c:5 as
139 // the foo.c: scheme.
140 if (!num_slashes &&
141 ExtractScheme(&url[begin], url_len - begin, &parsed.scheme)) {
142 // Offset the results since we gave ExtractScheme a substring.
143 parsed.scheme.begin += begin;
144 after_scheme = parsed.scheme.end() + 1;
145 } else {
146 // No scheme found, remember that.
147 parsed.scheme.reset();
148 after_scheme = begin;
149 }
150 }
151
152 // Handle empty specs ones that contain only whitespace or control chars,
153 // or that are just the scheme (for example "file:").
154 if (after_scheme == url_len) {
155 return parsed;
156 }
157
158 num_slashes = CountConsecutiveSlashes(url.data(), after_scheme, url_len);
159 after_slashes = after_scheme + num_slashes;
160 #ifdef WIN32
161 // Check whether the input is a drive again. We checked above for windows
162 // drive specs, but that's only at the very beginning to see if we have a
163 // scheme at all. This test will be duplicated in that case, but will
164 // additionally handle all cases with a real scheme such as "file:///C:/".
165 if (!DoesBeginWindowsDriveSpec(url.data(), after_slashes, url_len) &&
166 num_slashes != 3) {
167 // Anything not beginning with a drive spec ("c:\") on Windows is treated
168 // as UNC, with the exception of three slashes which always means a file.
169 // Even IE7 treats file:///foo/bar as "/foo/bar", which then fails.
170 DoParseUNC(url.substr(0, url_len), after_slashes, &parsed);
171 return parsed;
172 }
173 #else
174 // file: URL with exactly 2 slashes is considered to have a host component.
175 if (num_slashes == 2) {
176 DoParseUNC(url.substr(0, url_len), after_slashes, &parsed);
177 return parsed;
178 }
179 #endif // WIN32
180
181 // Easy and common case, the full path immediately follows the scheme
182 // (modulo slashes), as in "file://c:/foo". Just treat everything from
183 // there to the end as the path. Empty hosts have 0 length instead of -1.
184 // We include the last slash as part of the path if there is one.
185 DoParseLocalFile(
186 url.substr(0, url_len),
187 num_slashes > 0 ? after_scheme + num_slashes - 1 : after_scheme, &parsed);
188 return parsed;
189 }
190
191 } // namespace
192
ParseFileURL(std::string_view url)193 Parsed ParseFileURL(std::string_view url) {
194 return DoParseFileURL(url);
195 }
196
ParseFileURL(std::u16string_view url)197 Parsed ParseFileURL(std::u16string_view url) {
198 return DoParseFileURL(url);
199 }
200
201 } // namespace url
202