1 /* Based on nsURLParsers.cc from Mozilla
2 * -------------------------------------
3 * The contents of this file are subject to the Mozilla Public License Version
4 * 1.1 (the "License"); you may not use this file except in compliance with
5 * the License. You may obtain a copy of the License at
6 * http://www.mozilla.org/MPL/
7 *
8 * Software distributed under the License is distributed on an "AS IS" basis,
9 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
10 * for the specific language governing rights and limitations under the
11 * License.
12 *
13 * The Original Code is mozilla.org code.
14 *
15 * The Initial Developer of the Original Code is
16 * Netscape Communications Corporation.
17 * Portions created by the Initial Developer are Copyright (C) 1998
18 * the Initial Developer. All Rights Reserved.
19 *
20 * Contributor(s):
21 * Darin Fisher (original author)
22 *
23 * Alternatively, the contents of this file may be used under the terms of
24 * either the GNU General Public License Version 2 or later (the "GPL"), or
25 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 * in which case the provisions of the GPL or the LGPL are applicable instead
27 * of those above. If you wish to allow use of your version of this file only
28 * under the terms of either the GPL or the LGPL, and not to allow others to
29 * use your version of this file under the terms of the MPL, indicate your
30 * decision by deleting the provisions above and replace them with the notice
31 * and other provisions required by the GPL or the LGPL. If you do not delete
32 * the provisions above, a recipient may use your version of this file under
33 * the terms of any one of the MPL, the GPL or the LGPL.
34 *
35 * ***** END LICENSE BLOCK ***** */
36
37 #include "url/third_party/mozilla/url_parse.h"
38
39 #include <stdlib.h>
40
41 #include <ostream>
42
43 #include "base/check_op.h"
44 #include "url/url_parse_internal.h"
45 #include "url/url_util.h"
46 #include "url/url_util_internal.h"
47
48 namespace url {
49
operator <<(std::ostream & os,const Parsed & parsed)50 std::ostream& operator<<(std::ostream& os, const Parsed& parsed) {
51 return os << "{ scheme: " << parsed.scheme
52 << ", username: " << parsed.username
53 << ", password: " << parsed.password << ", host: " << parsed.host
54 << ", port: " << parsed.port << ", path: " << parsed.path
55 << ", query: " << parsed.query << ", ref: " << parsed.ref
56 << ", has_opaque_path: " << parsed.has_opaque_path << " }";
57 }
58
59 namespace {
60
// Reports whether |ch| is one of the ASCII decimal digits '0' through '9',
// which are the only characters accepted inside a port number.
inline bool IsPortDigit(char16_t ch) {
  if (ch < '0') {
    return false;
  }
  return ch <= '9';
}
65
66 // Returns the offset of the next authority terminator in the input starting
67 // from start_offset. If no terminator is found, the return value will be equal
68 // to spec_len.
69 template <typename CHAR>
FindNextAuthorityTerminator(const CHAR * spec,int start_offset,int spec_len,ParserMode parser_mode)70 int FindNextAuthorityTerminator(const CHAR* spec,
71 int start_offset,
72 int spec_len,
73 ParserMode parser_mode) {
74 for (int i = start_offset; i < spec_len; i++) {
75 if (IsAuthorityTerminator(spec[i], parser_mode)) {
76 return i;
77 }
78 }
79 return spec_len; // Not found.
80 }
81
82 template <typename CHAR>
ParseUserInfo(const CHAR * spec,const Component & user,Component * username,Component * password)83 void ParseUserInfo(const CHAR* spec,
84 const Component& user,
85 Component* username,
86 Component* password) {
87 // Find the first colon in the user section, which separates the username and
88 // password.
89 int colon_offset = 0;
90 while (colon_offset < user.len && spec[user.begin + colon_offset] != ':')
91 colon_offset++;
92
93 if (colon_offset < user.len) {
94 // Found separator: <username>:<password>
95 *username = Component(user.begin, colon_offset);
96 *password = MakeRange(user.begin + colon_offset + 1, user.begin + user.len);
97 } else {
98 // No separator, treat everything as the username
99 *username = user;
100 *password = Component();
101 }
102 }
103
104 template <typename CHAR>
ParseServerInfo(const CHAR * spec,const Component & serverinfo,Component * hostname,Component * port_num)105 void ParseServerInfo(const CHAR* spec,
106 const Component& serverinfo,
107 Component* hostname,
108 Component* port_num) {
109 if (serverinfo.len == 0) {
110 // No server info, host name is empty.
111 hostname->reset();
112 port_num->reset();
113 return;
114 }
115
116 // If the host starts with a left-bracket, assume the entire host is an
117 // IPv6 literal. Otherwise, assume none of the host is an IPv6 literal.
118 // This assumption will be overridden if we find a right-bracket.
119 //
120 // Our IPv6 address canonicalization code requires both brackets to exist,
121 // but the ability to locate an incomplete address can still be useful.
122 int ipv6_terminator = spec[serverinfo.begin] == '[' ? serverinfo.end() : -1;
123 int colon = -1;
124
125 // Find the last right-bracket, and the last colon.
126 for (int i = serverinfo.begin; i < serverinfo.end(); i++) {
127 switch (spec[i]) {
128 case ']':
129 ipv6_terminator = i;
130 break;
131 case ':':
132 colon = i;
133 break;
134 }
135 }
136
137 if (colon > ipv6_terminator) {
138 // Found a port number: <hostname>:<port>
139 *hostname = MakeRange(serverinfo.begin, colon);
140 if (hostname->len == 0)
141 hostname->reset();
142 *port_num = MakeRange(colon + 1, serverinfo.end());
143 } else {
144 // No port: <hostname>
145 *hostname = serverinfo;
146 port_num->reset();
147 }
148 }
149
150 // Given an already-identified auth section, breaks it into its consituent
151 // parts. The port number will be parsed and the resulting integer will be
152 // filled into the given *port variable, or -1 if there is no port number or it
153 // is invalid.
154 template <typename CHAR>
DoParseAuthority(const CHAR * spec,const Component & auth,ParserMode parser_mode,Component * username,Component * password,Component * hostname,Component * port_num)155 void DoParseAuthority(const CHAR* spec,
156 const Component& auth,
157 ParserMode parser_mode,
158 Component* username,
159 Component* password,
160 Component* hostname,
161 Component* port_num) {
162 DCHECK(auth.is_valid()) << "We should always get an authority";
163 if (auth.len == 0) {
164 username->reset();
165 password->reset();
166 if (parser_mode == ParserMode::kSpecialURL) {
167 hostname->reset();
168 } else {
169 // Non-special URLs can have an empty host. The difference between "host
170 // is empty" and "host does not exist" matters in the canonicalization
171 // phase.
172 //
173 // Examples:
174 // - "git:///" => host is empty (this case).
175 // - "git:/" => host does not exist.
176 *hostname = Component(auth.begin, 0);
177 }
178 port_num->reset();
179 return;
180 }
181
182 // Search backwards for @, which is the separator between the user info and
183 // the server info.
184 int i = auth.begin + auth.len - 1;
185 while (i > auth.begin && spec[i] != '@')
186 i--;
187
188 if (spec[i] == '@') {
189 // Found user info: <user-info>@<server-info>
190 ParseUserInfo(spec, Component(auth.begin, i - auth.begin), username,
191 password);
192 ParseServerInfo(spec, MakeRange(i + 1, auth.begin + auth.len), hostname,
193 port_num);
194 } else {
195 // No user info, everything is server info.
196 username->reset();
197 password->reset();
198 ParseServerInfo(spec, auth, hostname, port_num);
199 }
200 }
201
202 template <typename CHAR>
FindQueryAndRefParts(const CHAR * spec,const Component & path,int * query_separator,int * ref_separator)203 inline void FindQueryAndRefParts(const CHAR* spec,
204 const Component& path,
205 int* query_separator,
206 int* ref_separator) {
207 if constexpr (sizeof(*spec) == 1) {
208 // memchr is much faster than any scalar code we can write.
209 const CHAR* ptr = spec + path.begin;
210 const CHAR* first_hash =
211 reinterpret_cast<const CHAR*>(memchr(ptr, '#', path.len));
212 size_t len_before_fragment =
213 first_hash == nullptr ? path.len : first_hash - ptr;
214 const CHAR* first_question =
215 reinterpret_cast<const CHAR*>(memchr(ptr, '?', len_before_fragment));
216 if (first_hash != nullptr) {
217 *ref_separator = first_hash - spec;
218 }
219 if (first_question != nullptr) {
220 *query_separator = first_question - spec;
221 }
222 } else {
223 int path_end = path.begin + path.len;
224 for (int i = path.begin; i < path_end; i++) {
225 switch (spec[i]) {
226 case '?':
227 // Only match the query string if it precedes the reference fragment
228 // and when we haven't found one already.
229 if (*query_separator < 0)
230 *query_separator = i;
231 break;
232 case '#':
233 // Record the first # sign only.
234 if (*ref_separator < 0) {
235 *ref_separator = i;
236 return;
237 }
238 break;
239 }
240 }
241 }
242 }
243
244 template <typename CHAR>
ParsePath(const CHAR * spec,const Component & path,Component * filepath,Component * query,Component * ref)245 void ParsePath(const CHAR* spec,
246 const Component& path,
247 Component* filepath,
248 Component* query,
249 Component* ref) {
250 // path = [/]<segment1>/<segment2>/<...>/<segmentN>;<param>?<query>#<ref>
251 DCHECK(path.is_valid());
252
253 // Search for first occurrence of either ? or #.
254 int query_separator = -1; // Index of the '?'
255 int ref_separator = -1; // Index of the '#'
256 FindQueryAndRefParts(spec, path, &query_separator, &ref_separator);
257
258 // Markers pointing to the character after each of these corresponding
259 // components. The code below words from the end back to the beginning,
260 // and will update these indices as it finds components that exist.
261 int file_end, query_end;
262
263 // Ref fragment: from the # to the end of the path.
264 int path_end = path.begin + path.len;
265 if (ref_separator >= 0) {
266 file_end = query_end = ref_separator;
267 *ref = MakeRange(ref_separator + 1, path_end);
268 } else {
269 file_end = query_end = path_end;
270 ref->reset();
271 }
272
273 // Query fragment: everything from the ? to the next boundary (either the end
274 // of the path or the ref fragment).
275 if (query_separator >= 0) {
276 file_end = query_separator;
277 *query = MakeRange(query_separator + 1, query_end);
278 } else {
279 query->reset();
280 }
281
282 if (file_end != path.begin) {
283 *filepath = MakeRange(path.begin, file_end);
284 } else {
285 // File path: treat an empty file path as no file path.
286 //
287 // TODO(crbug.com/1416006): Consider to assign zero-length path component
288 // for non-special URLs because a path can be empty in non-special URLs.
289 // Currently, we don't have to distinguish between them. There is no visible
290 // difference.
291 filepath->reset();
292 }
293 }
294
295 template <typename CHAR>
DoExtractScheme(const CHAR * url,int url_len,Component * scheme)296 bool DoExtractScheme(const CHAR* url, int url_len, Component* scheme) {
297 // Skip leading whitespace and control characters.
298 int begin = 0;
299 while (begin < url_len && ShouldTrimFromURL(url[begin]))
300 begin++;
301 if (begin == url_len)
302 return false; // Input is empty or all whitespace.
303
304 // Find the first colon character.
305 for (int i = begin; i < url_len; i++) {
306 if (url[i] == ':') {
307 *scheme = MakeRange(begin, i);
308 return true;
309 }
310 }
311 return false; // No colon found: no scheme
312 }
313
314 // Fills in all members of the Parsed structure except for the scheme.
315 //
316 // |spec| is the full spec being parsed, of length |spec_len|.
317 // |after_scheme| is the character immediately following the scheme (after the
318 // colon) where we'll begin parsing.
319 //
320 // Compatability data points. I list "host", "path" extracted:
321 // Input IE6 Firefox Us
322 // ----- -------------- -------------- --------------
323 // http://foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/"
324 // http:foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/"
325 // http:/foo.com/ fail(*) "foo.com", "/" "foo.com", "/"
326 // http:\foo.com/ fail(*) "\foo.com", "/"(fail) "foo.com", "/"
327 // http:////foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/"
328 //
329 // (*) Interestingly, although IE fails to load these URLs, its history
330 // canonicalizer handles them, meaning if you've been to the corresponding
331 // "http://foo.com/" link, it will be colored.
332 template <typename CHAR>
DoParseAfterSpecialScheme(const CHAR * spec,int spec_len,int after_scheme,Parsed * parsed)333 void DoParseAfterSpecialScheme(const CHAR* spec,
334 int spec_len,
335 int after_scheme,
336 Parsed* parsed) {
337 int num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len);
338 int after_slashes = after_scheme + num_slashes;
339
340 // First split into two main parts, the authority (username, password, host,
341 // and port) and the full path (path, query, and reference).
342 //
343 // Treat everything from `after_slashes` to the next slash (or end of spec) to
344 // be the authority. Note that we ignore the number of slashes and treat it as
345 // the authority.
346 int end_auth = FindNextAuthorityTerminator(spec, after_slashes, spec_len,
347 ParserMode::kSpecialURL);
348
349 Component authority(after_slashes, end_auth - after_slashes);
350 // Everything starting from the slash to the end is the path.
351 Component full_path(end_auth, spec_len - end_auth);
352
353 // Now parse those two sub-parts.
354 DoParseAuthority(spec, authority, ParserMode::kSpecialURL, &parsed->username,
355 &parsed->password, &parsed->host, &parsed->port);
356 ParsePath(spec, full_path, &parsed->path, &parsed->query, &parsed->ref);
357 }
358
359 // The main parsing function for standard URLs. Standard URLs have a scheme,
360 // host, path, etc.
361 template <typename CHAR>
DoParseStandardURL(const CHAR * spec,int spec_len,Parsed * parsed)362 void DoParseStandardURL(const CHAR* spec, int spec_len, Parsed* parsed) {
363 DCHECK(spec_len >= 0);
364 parsed->has_opaque_path = false;
365
366 // Strip leading & trailing spaces and control characters.
367 int begin = 0;
368 TrimURL(spec, &begin, &spec_len);
369
370 int after_scheme;
371 if (DoExtractScheme(spec, spec_len, &parsed->scheme)) {
372 after_scheme = parsed->scheme.end() + 1; // Skip past the colon.
373 } else {
374 // Say there's no scheme when there is no colon. We could also say that
375 // everything is the scheme. Both would produce an invalid URL, but this way
376 // seems less wrong in more cases.
377 parsed->scheme.reset();
378 after_scheme = begin;
379 }
380 DoParseAfterSpecialScheme(spec, spec_len, after_scheme, parsed);
381 }
382
383 template <typename CHAR>
DoParseAfterNonSpecialScheme(const CHAR * spec,int spec_len,int after_scheme,Parsed * parsed)384 void DoParseAfterNonSpecialScheme(const CHAR* spec,
385 int spec_len,
386 int after_scheme,
387 Parsed* parsed) {
388 // The implementation is similar to `DoParseAfterSpecialScheme()`, but there
389 // are many subtle differences. So we have a different function for parsing
390 // non-special URLs.
391
392 int num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len);
393
394 if (num_slashes >= 2) {
395 // Found "//<some data>", looks like an authority section.
396 //
397 // e.g.
398 // "git://host:8000/path"
399 // ^
400 //
401 // The state machine transition in the URL Standard is:
402 //
403 // https://url.spec.whatwg.org/#scheme-state
404 // => https://url.spec.whatwg.org/#path-or-authority-state
405 // => https://url.spec.whatwg.org/#authority-state
406 //
407 parsed->has_opaque_path = false;
408
409 int after_slashes = after_scheme + 2;
410
411 // First split into two main parts, the authority (username, password, host,
412 // and port) and the full path (path, query, and reference).
413 //
414 // Treat everything from there to the next slash (or end of spec) to be the
415 // authority. Note that we ignore the number of slashes and treat it as the
416 // authority.
417 int end_auth = FindNextAuthorityTerminator(spec, after_slashes, spec_len,
418 ParserMode::kNonSpecialURL);
419 Component authority(after_slashes, end_auth - after_slashes);
420
421 // Now parse those two sub-parts.
422 DoParseAuthority(spec, authority, ParserMode::kNonSpecialURL,
423 &parsed->username, &parsed->password, &parsed->host,
424 &parsed->port);
425
426 // Everything starting from the slash to the end is the path.
427 Component full_path(end_auth, spec_len - end_auth);
428 ParsePath(spec, full_path, &parsed->path, &parsed->query, &parsed->ref);
429 return;
430 }
431
432 if (num_slashes == 1) {
433 // Examples:
434 // "git:/path"
435 // ^
436 //
437 // The state machine transition in the URL Standard is:
438 //
439 // https://url.spec.whatwg.org/#scheme-state
440 // => https://url.spec.whatwg.org/#path-or-authority-state
441 // => https://url.spec.whatwg.org/#path-state
442 parsed->has_opaque_path = false;
443 } else {
444 // We didn't found "//" nor "/", so entering into an opaque-path-state.
445 //
446 // Examples:
447 // "git:opaque path"
448 // ^
449 //
450 // The state machine transition in the URL Standard is:
451 //
452 // https://url.spec.whatwg.org/#scheme-state
453 // => https://url.spec.whatwg.org/#cannot-be-a-base-url-path-state
454 parsed->has_opaque_path = true;
455 }
456
457 parsed->username.reset();
458 parsed->password.reset();
459 // It's important to reset `parsed->host` here to distinguish between "host
460 // is empty" and "host doesn't exist".
461 parsed->host.reset();
462 parsed->port.reset();
463
464 // Everything starting after scheme to the end is the path.
465 Component full_path(after_scheme, spec_len - after_scheme);
466 ParsePath(spec, full_path, &parsed->path, &parsed->query, &parsed->ref);
467 }
468
469 // The main parsing function for non-special scheme URLs.
470 template <typename CHAR>
DoParseNonSpecialURL(const CHAR * spec,int spec_len,Parsed * parsed)471 void DoParseNonSpecialURL(const CHAR* spec, int spec_len, Parsed* parsed) {
472 DCHECK(spec_len >= 0);
473
474 // Strip leading & trailing spaces and control characters.
475 int begin = 0;
476 TrimURL(spec, &begin, &spec_len);
477
478 int after_scheme;
479 if (DoExtractScheme(spec, spec_len, &parsed->scheme)) {
480 after_scheme = parsed->scheme.end() + 1; // Skip past the colon.
481 } else {
482 // Say there's no scheme when there is no colon. We could also say that
483 // everything is the scheme. Both would produce an invalid URL, but this way
484 // seems less wrong in more cases.
485 parsed->scheme.reset();
486 after_scheme = 0;
487 }
488 DoParseAfterNonSpecialScheme(spec, spec_len, after_scheme, parsed);
489 }
490
491 template <typename CHAR>
DoParseFileSystemURL(const CHAR * spec,int spec_len,Parsed * parsed)492 void DoParseFileSystemURL(const CHAR* spec, int spec_len, Parsed* parsed) {
493 DCHECK(spec_len >= 0);
494
495 // Get the unused parts of the URL out of the way.
496 parsed->username.reset();
497 parsed->password.reset();
498 parsed->host.reset();
499 parsed->port.reset();
500 parsed->path.reset(); // May use this; reset for convenience.
501 parsed->ref.reset(); // May use this; reset for convenience.
502 parsed->query.reset(); // May use this; reset for convenience.
503 parsed->clear_inner_parsed(); // May use this; reset for convenience.
504 parsed->has_opaque_path = false;
505
506 // Strip leading & trailing spaces and control characters.
507 int begin = 0;
508 TrimURL(spec, &begin, &spec_len);
509
510 // Handle empty specs or ones that contain only whitespace or control chars.
511 if (begin == spec_len) {
512 parsed->scheme.reset();
513 return;
514 }
515
516 int inner_start = -1;
517
518 // Extract the scheme. We also handle the case where there is no scheme.
519 if (DoExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) {
520 // Offset the results since we gave ExtractScheme a substring.
521 parsed->scheme.begin += begin;
522
523 if (parsed->scheme.end() == spec_len - 1)
524 return;
525
526 inner_start = parsed->scheme.end() + 1;
527 } else {
528 // No scheme found; that's not valid for filesystem URLs.
529 parsed->scheme.reset();
530 return;
531 }
532
533 Component inner_scheme;
534 const CHAR* inner_spec = &spec[inner_start];
535 int inner_spec_len = spec_len - inner_start;
536
537 if (DoExtractScheme(inner_spec, inner_spec_len, &inner_scheme)) {
538 // Offset the results since we gave ExtractScheme a substring.
539 inner_scheme.begin += inner_start;
540
541 if (inner_scheme.end() == spec_len - 1)
542 return;
543 } else {
544 // No scheme found; that's not valid for filesystem URLs.
545 // The best we can do is return "filesystem://".
546 return;
547 }
548
549 Parsed inner_parsed;
550
551 if (CompareSchemeComponent(spec, inner_scheme, kFileScheme)) {
552 // File URLs are special.
553 ParseFileURL(inner_spec, inner_spec_len, &inner_parsed);
554 } else if (CompareSchemeComponent(spec, inner_scheme, kFileSystemScheme)) {
555 // Filesystem URLs don't nest.
556 return;
557 } else if (IsStandard(spec, inner_scheme)) {
558 // All "normal" URLs.
559 DoParseStandardURL(inner_spec, inner_spec_len, &inner_parsed);
560 } else {
561 return;
562 }
563
564 // All members of inner_parsed need to be offset by inner_start.
565 // If we had any scheme that supported nesting more than one level deep,
566 // we'd have to recurse into the inner_parsed's inner_parsed when
567 // adjusting by inner_start.
568 inner_parsed.scheme.begin += inner_start;
569 inner_parsed.username.begin += inner_start;
570 inner_parsed.password.begin += inner_start;
571 inner_parsed.host.begin += inner_start;
572 inner_parsed.port.begin += inner_start;
573 inner_parsed.query.begin += inner_start;
574 inner_parsed.ref.begin += inner_start;
575 inner_parsed.path.begin += inner_start;
576
577 // Query and ref move from inner_parsed to parsed.
578 parsed->query = inner_parsed.query;
579 inner_parsed.query.reset();
580 parsed->ref = inner_parsed.ref;
581 inner_parsed.ref.reset();
582
583 parsed->set_inner_parsed(inner_parsed);
584 if (!inner_parsed.scheme.is_valid() || !inner_parsed.path.is_valid() ||
585 inner_parsed.inner_parsed()) {
586 return;
587 }
588
589 // The path in inner_parsed should start with a slash, then have a filesystem
590 // type followed by a slash. From the first slash up to but excluding the
591 // second should be what it keeps; the rest goes to parsed. If the path ends
592 // before the second slash, it's still pretty clear what the user meant, so
593 // we'll let that through.
594 if (!IsSlashOrBackslash(spec[inner_parsed.path.begin])) {
595 return;
596 }
597 int inner_path_end = inner_parsed.path.begin + 1; // skip the leading slash
598 while (inner_path_end < spec_len &&
599 !IsSlashOrBackslash(spec[inner_path_end])) {
600 ++inner_path_end;
601 }
602 parsed->path.begin = inner_path_end;
603 int new_inner_path_length = inner_path_end - inner_parsed.path.begin;
604 parsed->path.len = inner_parsed.path.len - new_inner_path_length;
605 parsed->inner_parsed()->path.len = new_inner_path_length;
606 }
607
608 // Initializes a path URL which is merely a scheme followed by a path. Examples
609 // include "about:foo" and "javascript:alert('bar');"
610 template <typename CHAR>
DoParsePathURL(const CHAR * spec,int spec_len,bool trim_path_end,Parsed * parsed)611 void DoParsePathURL(const CHAR* spec,
612 int spec_len,
613 bool trim_path_end,
614 Parsed* parsed) {
615 // Get the non-path and non-scheme parts of the URL out of the way, we never
616 // use them.
617 parsed->username.reset();
618 parsed->password.reset();
619 parsed->host.reset();
620 parsed->port.reset();
621 parsed->path.reset();
622 parsed->query.reset();
623 parsed->ref.reset();
624 // In practice, we don't need to set `has_opaque_path` here because:
625 //
626 // 1. `has_opaque_path` will be used only when the
627 // `kStandardCompliantNonSpecialSchemeURLParsing` feature is enabled.
628 // 2. `DoParsePathURL` will not be used when the flag is enabled (planned).
629 //
630 // However, for predictable results, it is better to explicitly set it
631 // `false`.
632 parsed->has_opaque_path = false;
633
634 // Strip leading & trailing spaces and control characters.
635 int scheme_begin = 0;
636 TrimURL(spec, &scheme_begin, &spec_len, trim_path_end);
637
638 // Handle empty specs or ones that contain only whitespace or control chars.
639 if (scheme_begin == spec_len) {
640 parsed->scheme.reset();
641 parsed->path.reset();
642 return;
643 }
644
645 int path_begin;
646 // Extract the scheme, with the path being everything following. We also
647 // handle the case where there is no scheme.
648 if (ExtractScheme(&spec[scheme_begin], spec_len - scheme_begin,
649 &parsed->scheme)) {
650 // Offset the results since we gave ExtractScheme a substring.
651 parsed->scheme.begin += scheme_begin;
652 path_begin = parsed->scheme.end() + 1;
653 } else {
654 // No scheme case.
655 parsed->scheme.reset();
656 path_begin = scheme_begin;
657 }
658
659 if (path_begin == spec_len)
660 return;
661 DCHECK_LT(path_begin, spec_len);
662
663 ParsePath(spec, MakeRange(path_begin, spec_len), &parsed->path,
664 &parsed->query, &parsed->ref);
665 }
666
667 template <typename CHAR>
DoParseMailtoURL(const CHAR * spec,int spec_len,Parsed * parsed)668 void DoParseMailtoURL(const CHAR* spec, int spec_len, Parsed* parsed) {
669 DCHECK(spec_len >= 0);
670
671 // Get the non-path and non-scheme parts of the URL out of the way, we never
672 // use them.
673 parsed->username.reset();
674 parsed->password.reset();
675 parsed->host.reset();
676 parsed->port.reset();
677 parsed->ref.reset();
678 parsed->query.reset(); // May use this; reset for convenience.
679 parsed->has_opaque_path = false;
680
681 // Strip leading & trailing spaces and control characters.
682 int begin = 0;
683 TrimURL(spec, &begin, &spec_len);
684
685 // Handle empty specs or ones that contain only whitespace or control chars.
686 if (begin == spec_len) {
687 parsed->scheme.reset();
688 parsed->path.reset();
689 return;
690 }
691
692 int path_begin = -1;
693 int path_end = -1;
694
695 // Extract the scheme, with the path being everything following. We also
696 // handle the case where there is no scheme.
697 if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) {
698 // Offset the results since we gave ExtractScheme a substring.
699 parsed->scheme.begin += begin;
700
701 if (parsed->scheme.end() != spec_len - 1) {
702 path_begin = parsed->scheme.end() + 1;
703 path_end = spec_len;
704 }
705 } else {
706 // No scheme found, just path.
707 parsed->scheme.reset();
708 path_begin = begin;
709 path_end = spec_len;
710 }
711
712 // Split [path_begin, path_end) into a path + query.
713 for (int i = path_begin; i < path_end; ++i) {
714 if (spec[i] == '?') {
715 parsed->query = MakeRange(i + 1, path_end);
716 path_end = i;
717 break;
718 }
719 }
720
721 // For compatability with the standard URL parser, treat no path as
722 // -1, rather than having a length of 0
723 if (path_begin == path_end) {
724 parsed->path.reset();
725 } else {
726 parsed->path = MakeRange(path_begin, path_end);
727 }
728 }
729
730 // Converts a port number in a string to an integer. We'd like to just call
731 // sscanf but our input is not NULL-terminated, which sscanf requires. Instead,
732 // we copy the digits to a small stack buffer (since we know the maximum number
733 // of digits in a valid port number) that we can NULL terminate.
734 template <typename CHAR>
DoParsePort(const CHAR * spec,const Component & component)735 int DoParsePort(const CHAR* spec, const Component& component) {
736 // Easy success case when there is no port.
737 const int kMaxDigits = 5;
738 if (component.is_empty())
739 return PORT_UNSPECIFIED;
740
741 // Skip over any leading 0s.
742 Component digits_comp(component.end(), 0);
743 for (int i = 0; i < component.len; i++) {
744 if (spec[component.begin + i] != '0') {
745 digits_comp = MakeRange(component.begin + i, component.end());
746 break;
747 }
748 }
749 if (digits_comp.len == 0)
750 return 0; // All digits were 0.
751
752 // Verify we don't have too many digits (we'll be copying to our buffer so
753 // we need to double-check).
754 if (digits_comp.len > kMaxDigits)
755 return PORT_INVALID;
756
757 // Copy valid digits to the buffer.
758 char digits[kMaxDigits + 1]; // +1 for null terminator
759 for (int i = 0; i < digits_comp.len; i++) {
760 CHAR ch = spec[digits_comp.begin + i];
761 if (!IsPortDigit(ch)) {
762 // Invalid port digit, fail.
763 return PORT_INVALID;
764 }
765 digits[i] = static_cast<char>(ch);
766 }
767
768 // Null-terminate the string and convert to integer. Since we guarantee
769 // only digits, atoi's lack of error handling is OK.
770 digits[digits_comp.len] = 0;
771 int port = atoi(digits);
772 if (port > 65535)
773 return PORT_INVALID; // Out of range.
774 return port;
775 }
776
777 template <typename CHAR>
DoExtractFileName(const CHAR * spec,const Component & path,Component * file_name)778 void DoExtractFileName(const CHAR* spec,
779 const Component& path,
780 Component* file_name) {
781 // Handle empty paths: they have no file names.
782 if (path.is_empty()) {
783 file_name->reset();
784 return;
785 }
786
787 // Extract the filename range from the path which is between
788 // the last slash and the following semicolon.
789 int file_end = path.end();
790 for (int i = path.end() - 1; i >= path.begin; i--) {
791 if (spec[i] == ';') {
792 file_end = i;
793 } else if (IsSlashOrBackslash(spec[i])) {
794 // File name is everything following this character to the end
795 *file_name = MakeRange(i + 1, file_end);
796 return;
797 }
798 }
799
800 // No slash found, this means the input was degenerate (generally paths
801 // will start with a slash). Let's call everything the file name.
802 *file_name = MakeRange(path.begin, file_end);
803 return;
804 }
805
806 template <typename CHAR>
DoExtractQueryKeyValue(const CHAR * spec,Component * query,Component * key,Component * value)807 bool DoExtractQueryKeyValue(const CHAR* spec,
808 Component* query,
809 Component* key,
810 Component* value) {
811 if (!query->is_nonempty())
812 return false;
813
814 int start = query->begin;
815 int cur = start;
816 int end = query->end();
817
818 // We assume the beginning of the input is the beginning of the "key" and we
819 // skip to the end of it.
820 key->begin = cur;
821 while (cur < end && spec[cur] != '&' && spec[cur] != '=')
822 cur++;
823 key->len = cur - key->begin;
824
825 // Skip the separator after the key (if any).
826 if (cur < end && spec[cur] == '=')
827 cur++;
828
829 // Find the value part.
830 value->begin = cur;
831 while (cur < end && spec[cur] != '&')
832 cur++;
833 value->len = cur - value->begin;
834
835 // Finally skip the next separator if any
836 if (cur < end && spec[cur] == '&')
837 cur++;
838
839 // Save the new query
840 *query = MakeRange(cur, end);
841 return true;
842 }
843
844 } // namespace
845
846 COMPONENT_EXPORT(URL)
847 std::ostream& operator<<(std::ostream& os, const Component& component) {
848 return os << '{' << component.begin << ", " << component.len << "}";
849 }
850
851 Parsed::Parsed() = default;
852
Parsed(const Parsed & other)853 Parsed::Parsed(const Parsed& other)
854 : scheme(other.scheme),
855 username(other.username),
856 password(other.password),
857 host(other.host),
858 port(other.port),
859 path(other.path),
860 query(other.query),
861 ref(other.ref),
862 potentially_dangling_markup(other.potentially_dangling_markup),
863 has_opaque_path(other.has_opaque_path) {
864 if (other.inner_parsed_)
865 set_inner_parsed(*other.inner_parsed_);
866 }
867
// Copy assignment. Copies every component range and both flags from |other|,
// then mirrors its inner Parsed: set_inner_parsed() when |other| has one,
// clear_inner_parsed() otherwise. Self-assignment is a no-op.
Parsed& Parsed::operator=(const Parsed& other) {
  if (this == &other) {
    return *this;
  }

  scheme = other.scheme;
  username = other.username;
  password = other.password;
  host = other.host;
  port = other.port;
  path = other.path;
  query = other.query;
  ref = other.ref;
  potentially_dangling_markup = other.potentially_dangling_markup;
  has_opaque_path = other.has_opaque_path;

  if (!other.inner_parsed_) {
    clear_inner_parsed();
  } else {
    set_inner_parsed(*other.inner_parsed_);
  }
  return *this;
}
887
~Parsed()888 Parsed::~Parsed() {
889 delete inner_parsed_;
890 }
891
Length() const892 int Parsed::Length() const {
893 if (ref.is_valid())
894 return ref.end();
895 return CountCharactersBefore(REF, false);
896 }
897
CountCharactersBefore(ComponentType type,bool include_delimiter) const898 int Parsed::CountCharactersBefore(ComponentType type,
899 bool include_delimiter) const {
900 if (type == SCHEME)
901 return scheme.begin;
902
903 // There will be some characters after the scheme like "://" and we don't
904 // know how many. Search forwards for the next thing until we find one.
905 int cur = 0;
906 if (scheme.is_valid())
907 cur = scheme.end() + 1; // Advance over the ':' at the end of the scheme.
908
909 if (username.is_valid()) {
910 if (type <= USERNAME)
911 return username.begin;
912 cur = username.end() + 1; // Advance over the '@' or ':' at the end.
913 }
914
915 if (password.is_valid()) {
916 if (type <= PASSWORD)
917 return password.begin;
918 cur = password.end() + 1; // Advance over the '@' at the end.
919 }
920
921 if (host.is_valid()) {
922 if (type <= HOST)
923 return host.begin;
924 cur = host.end();
925 }
926
927 if (port.is_valid()) {
928 if (type < PORT || (type == PORT && include_delimiter))
929 return port.begin - 1; // Back over delimiter.
930 if (type == PORT)
931 return port.begin; // Don't want delimiter counted.
932 cur = port.end();
933 }
934
935 if (path.is_valid()) {
936 if (type <= PATH)
937 return path.begin;
938 cur = path.end();
939 }
940
941 if (query.is_valid()) {
942 if (type < QUERY || (type == QUERY && include_delimiter))
943 return query.begin - 1; // Back over delimiter.
944 if (type == QUERY)
945 return query.begin; // Don't want delimiter counted.
946 cur = query.end();
947 }
948
949 if (ref.is_valid()) {
950 if (type == REF && !include_delimiter)
951 return ref.begin; // Back over delimiter.
952
953 // When there is a ref and we get here, the component we wanted was before
954 // this and not found, so we always know the beginning of the ref is right.
955 return ref.begin - 1; // Don't want delimiter counted.
956 }
957
958 return cur;
959 }
960
GetContent() const961 Component Parsed::GetContent() const {
962 const int begin = CountCharactersBefore(USERNAME, false);
963 const int len = Length() - begin;
964 // For compatability with the standard URL parser, we treat no content as
965 // -1, rather than having a length of 0 (we normally wouldn't care so
966 // much for these non-standard URLs).
967 return len ? Component(begin, len) : Component();
968 }
969
ExtractScheme(const char * url,int url_len,Component * scheme)970 bool ExtractScheme(const char* url, int url_len, Component* scheme) {
971 return DoExtractScheme(url, url_len, scheme);
972 }
973
ExtractScheme(const char16_t * url,int url_len,Component * scheme)974 bool ExtractScheme(const char16_t* url, int url_len, Component* scheme) {
975 return DoExtractScheme(url, url_len, scheme);
976 }
977
978 // This handles everything that may be an authority terminator.
979 //
980 // URL Standard:
981 // https://url.spec.whatwg.org/#authority-state
982 // >> 2. Otherwise, if one of the following is true:
983 // >> - c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
984 // >> - url is special and c is U+005C (\)
IsAuthorityTerminator(char16_t ch,ParserMode parser_mode)985 bool IsAuthorityTerminator(char16_t ch, ParserMode parser_mode) {
986 if (parser_mode == ParserMode::kSpecialURL) {
987 return IsSlashOrBackslash(ch) || ch == '?' || ch == '#';
988 }
989 return ch == '/' || ch == '?' || ch == '#';
990 }
991
ExtractFileName(const char * url,const Component & path,Component * file_name)992 void ExtractFileName(const char* url,
993 const Component& path,
994 Component* file_name) {
995 DoExtractFileName(url, path, file_name);
996 }
997
ExtractFileName(const char16_t * url,const Component & path,Component * file_name)998 void ExtractFileName(const char16_t* url,
999 const Component& path,
1000 Component* file_name) {
1001 DoExtractFileName(url, path, file_name);
1002 }
1003
ExtractQueryKeyValue(const char * url,Component * query,Component * key,Component * value)1004 bool ExtractQueryKeyValue(const char* url,
1005 Component* query,
1006 Component* key,
1007 Component* value) {
1008 return DoExtractQueryKeyValue(url, query, key, value);
1009 }
1010
ExtractQueryKeyValue(const char16_t * url,Component * query,Component * key,Component * value)1011 bool ExtractQueryKeyValue(const char16_t* url,
1012 Component* query,
1013 Component* key,
1014 Component* value) {
1015 return DoExtractQueryKeyValue(url, query, key, value);
1016 }
1017
ParseAuthority(const char * spec,const Component & auth,Component * username,Component * password,Component * hostname,Component * port_num)1018 void ParseAuthority(const char* spec,
1019 const Component& auth,
1020 Component* username,
1021 Component* password,
1022 Component* hostname,
1023 Component* port_num) {
1024 DoParseAuthority(spec, auth, ParserMode::kSpecialURL, username, password,
1025 hostname, port_num);
1026 }
1027
ParseAuthority(const char16_t * spec,const Component & auth,Component * username,Component * password,Component * hostname,Component * port_num)1028 void ParseAuthority(const char16_t* spec,
1029 const Component& auth,
1030 Component* username,
1031 Component* password,
1032 Component* hostname,
1033 Component* port_num) {
1034 DoParseAuthority(spec, auth, ParserMode::kSpecialURL, username, password,
1035 hostname, port_num);
1036 }
1037
ParseAuthority(const char * spec,const Component & auth,ParserMode parser_mode,Component * username,Component * password,Component * hostname,Component * port_num)1038 void ParseAuthority(const char* spec,
1039 const Component& auth,
1040 ParserMode parser_mode,
1041 Component* username,
1042 Component* password,
1043 Component* hostname,
1044 Component* port_num) {
1045 DoParseAuthority(spec, auth, parser_mode, username, password, hostname,
1046 port_num);
1047 }
1048
ParseAuthority(const char16_t * spec,const Component & auth,ParserMode parser_mode,Component * username,Component * password,Component * hostname,Component * port_num)1049 void ParseAuthority(const char16_t* spec,
1050 const Component& auth,
1051 ParserMode parser_mode,
1052 Component* username,
1053 Component* password,
1054 Component* hostname,
1055 Component* port_num) {
1056 DoParseAuthority(spec, auth, parser_mode, username, password, hostname,
1057 port_num);
1058 }
1059
ParsePort(const char * url,const Component & port)1060 int ParsePort(const char* url, const Component& port) {
1061 return DoParsePort(url, port);
1062 }
1063
ParsePort(const char16_t * url,const Component & port)1064 int ParsePort(const char16_t* url, const Component& port) {
1065 return DoParsePort(url, port);
1066 }
1067
ParseStandardURL(const char * url,int url_len,Parsed * parsed)1068 void ParseStandardURL(const char* url, int url_len, Parsed* parsed) {
1069 DoParseStandardURL(url, url_len, parsed);
1070 }
1071
ParseStandardURL(const char16_t * url,int url_len,Parsed * parsed)1072 void ParseStandardURL(const char16_t* url, int url_len, Parsed* parsed) {
1073 DoParseStandardURL(url, url_len, parsed);
1074 }
1075
ParseNonSpecialURL(const char * url,int url_len,Parsed * parsed)1076 void ParseNonSpecialURL(const char* url, int url_len, Parsed* parsed) {
1077 DoParseNonSpecialURL(url, url_len, parsed);
1078 }
1079
ParseNonSpecialURL(const char16_t * url,int url_len,Parsed * parsed)1080 void ParseNonSpecialURL(const char16_t* url, int url_len, Parsed* parsed) {
1081 DoParseNonSpecialURL(url, url_len, parsed);
1082 }
1083
ParsePathURL(const char * url,int url_len,bool trim_path_end,Parsed * parsed)1084 void ParsePathURL(const char* url,
1085 int url_len,
1086 bool trim_path_end,
1087 Parsed* parsed) {
1088 DoParsePathURL(url, url_len, trim_path_end, parsed);
1089 }
1090
ParsePathURL(const char16_t * url,int url_len,bool trim_path_end,Parsed * parsed)1091 void ParsePathURL(const char16_t* url,
1092 int url_len,
1093 bool trim_path_end,
1094 Parsed* parsed) {
1095 DoParsePathURL(url, url_len, trim_path_end, parsed);
1096 }
1097
ParseFileSystemURL(const char * url,int url_len,Parsed * parsed)1098 void ParseFileSystemURL(const char* url, int url_len, Parsed* parsed) {
1099 DoParseFileSystemURL(url, url_len, parsed);
1100 }
1101
ParseFileSystemURL(const char16_t * url,int url_len,Parsed * parsed)1102 void ParseFileSystemURL(const char16_t* url, int url_len, Parsed* parsed) {
1103 DoParseFileSystemURL(url, url_len, parsed);
1104 }
1105
ParseMailtoURL(const char * url,int url_len,Parsed * parsed)1106 void ParseMailtoURL(const char* url, int url_len, Parsed* parsed) {
1107 DoParseMailtoURL(url, url_len, parsed);
1108 }
1109
ParseMailtoURL(const char16_t * url,int url_len,Parsed * parsed)1110 void ParseMailtoURL(const char16_t* url, int url_len, Parsed* parsed) {
1111 DoParseMailtoURL(url, url_len, parsed);
1112 }
1113
ParsePathInternal(const char * spec,const Component & path,Component * filepath,Component * query,Component * ref)1114 void ParsePathInternal(const char* spec,
1115 const Component& path,
1116 Component* filepath,
1117 Component* query,
1118 Component* ref) {
1119 ParsePath(spec, path, filepath, query, ref);
1120 }
1121
ParsePathInternal(const char16_t * spec,const Component & path,Component * filepath,Component * query,Component * ref)1122 void ParsePathInternal(const char16_t* spec,
1123 const Component& path,
1124 Component* filepath,
1125 Component* query,
1126 Component* ref) {
1127 ParsePath(spec, path, filepath, query, ref);
1128 }
1129
ParseAfterSpecialScheme(const char * spec,int spec_len,int after_scheme,Parsed * parsed)1130 void ParseAfterSpecialScheme(const char* spec,
1131 int spec_len,
1132 int after_scheme,
1133 Parsed* parsed) {
1134 DoParseAfterSpecialScheme(spec, spec_len, after_scheme, parsed);
1135 }
1136
ParseAfterSpecialScheme(const char16_t * spec,int spec_len,int after_scheme,Parsed * parsed)1137 void ParseAfterSpecialScheme(const char16_t* spec,
1138 int spec_len,
1139 int after_scheme,
1140 Parsed* parsed) {
1141 DoParseAfterSpecialScheme(spec, spec_len, after_scheme, parsed);
1142 }
1143
1144 } // namespace url
1145