• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Based on nsURLParsers.cc from Mozilla
2  * -------------------------------------
3  * The contents of this file are subject to the Mozilla Public License Version
4  * 1.1 (the "License"); you may not use this file except in compliance with
5  * the License. You may obtain a copy of the License at
6  * http://www.mozilla.org/MPL/
7  *
8  * Software distributed under the License is distributed on an "AS IS" basis,
9  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
10  * for the specific language governing rights and limitations under the
11  * License.
12  *
13  * The Original Code is mozilla.org code.
14  *
15  * The Initial Developer of the Original Code is
16  * Netscape Communications Corporation.
17  * Portions created by the Initial Developer are Copyright (C) 1998
18  * the Initial Developer. All Rights Reserved.
19  *
20  * Contributor(s):
21  *   Darin Fisher (original author)
22  *
23  * Alternatively, the contents of this file may be used under the terms of
24  * either the GNU General Public License Version 2 or later (the "GPL"), or
25  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26  * in which case the provisions of the GPL or the LGPL are applicable instead
27  * of those above. If you wish to allow use of your version of this file only
28  * under the terms of either the GPL or the LGPL, and not to allow others to
29  * use your version of this file under the terms of the MPL, indicate your
30  * decision by deleting the provisions above and replace them with the notice
31  * and other provisions required by the GPL or the LGPL. If you do not delete
32  * the provisions above, a recipient may use your version of this file under
33  * the terms of any one of the MPL, the GPL or the LGPL.
34  *
35  * ***** END LICENSE BLOCK ***** */
36 
37 #include "url/third_party/mozilla/url_parse.h"
38 
39 #include <stdlib.h>
40 
41 #include <ostream>
42 
43 #include "base/check_op.h"
44 #include "url/url_parse_internal.h"
45 #include "url/url_util.h"
46 #include "url/url_util_internal.h"
47 
48 namespace url {
49 
operator <<(std::ostream & os,const Parsed & parsed)50 std::ostream& operator<<(std::ostream& os, const Parsed& parsed) {
51   return os << "{ scheme: " << parsed.scheme
52             << ", username: " << parsed.username
53             << ", password: " << parsed.password << ", host: " << parsed.host
54             << ", port: " << parsed.port << ", path: " << parsed.path
55             << ", query: " << parsed.query << ", ref: " << parsed.ref
56             << ", has_opaque_path: " << parsed.has_opaque_path << " }";
57 }
58 
59 namespace {
60 
// Reports whether |ch| is one of the ASCII decimal digits '0'..'9', the only
// characters accepted inside a port number.
inline bool IsPortDigit(char16_t ch) {
  return !(ch < '0' || ch > '9');
}
65 
66 // Returns the offset of the next authority terminator in the input starting
67 // from start_offset. If no terminator is found, the return value will be equal
68 // to spec_len.
69 template <typename CHAR>
FindNextAuthorityTerminator(const CHAR * spec,int start_offset,int spec_len,ParserMode parser_mode)70 int FindNextAuthorityTerminator(const CHAR* spec,
71                                 int start_offset,
72                                 int spec_len,
73                                 ParserMode parser_mode) {
74   for (int i = start_offset; i < spec_len; i++) {
75     if (IsAuthorityTerminator(spec[i], parser_mode)) {
76       return i;
77     }
78   }
79   return spec_len;  // Not found.
80 }
81 
82 template <typename CHAR>
ParseUserInfo(const CHAR * spec,const Component & user,Component * username,Component * password)83 void ParseUserInfo(const CHAR* spec,
84                    const Component& user,
85                    Component* username,
86                    Component* password) {
87   // Find the first colon in the user section, which separates the username and
88   // password.
89   int colon_offset = 0;
90   while (colon_offset < user.len && spec[user.begin + colon_offset] != ':')
91     colon_offset++;
92 
93   if (colon_offset < user.len) {
94     // Found separator: <username>:<password>
95     *username = Component(user.begin, colon_offset);
96     *password = MakeRange(user.begin + colon_offset + 1, user.begin + user.len);
97   } else {
98     // No separator, treat everything as the username
99     *username = user;
100     *password = Component();
101   }
102 }
103 
104 template <typename CHAR>
ParseServerInfo(const CHAR * spec,const Component & serverinfo,Component * hostname,Component * port_num)105 void ParseServerInfo(const CHAR* spec,
106                      const Component& serverinfo,
107                      Component* hostname,
108                      Component* port_num) {
109   if (serverinfo.len == 0) {
110     // No server info, host name is empty.
111     hostname->reset();
112     port_num->reset();
113     return;
114   }
115 
116   // If the host starts with a left-bracket, assume the entire host is an
117   // IPv6 literal.  Otherwise, assume none of the host is an IPv6 literal.
118   // This assumption will be overridden if we find a right-bracket.
119   //
120   // Our IPv6 address canonicalization code requires both brackets to exist,
121   // but the ability to locate an incomplete address can still be useful.
122   int ipv6_terminator = spec[serverinfo.begin] == '[' ? serverinfo.end() : -1;
123   int colon = -1;
124 
125   // Find the last right-bracket, and the last colon.
126   for (int i = serverinfo.begin; i < serverinfo.end(); i++) {
127     switch (spec[i]) {
128       case ']':
129         ipv6_terminator = i;
130         break;
131       case ':':
132         colon = i;
133         break;
134     }
135   }
136 
137   if (colon > ipv6_terminator) {
138     // Found a port number: <hostname>:<port>
139     *hostname = MakeRange(serverinfo.begin, colon);
140     if (hostname->len == 0)
141       hostname->reset();
142     *port_num = MakeRange(colon + 1, serverinfo.end());
143   } else {
144     // No port: <hostname>
145     *hostname = serverinfo;
146     port_num->reset();
147   }
148 }
149 
150 // Given an already-identified auth section, breaks it into its consituent
151 // parts. The port number will be parsed and the resulting integer will be
152 // filled into the given *port variable, or -1 if there is no port number or it
153 // is invalid.
154 template <typename CHAR>
DoParseAuthority(const CHAR * spec,const Component & auth,ParserMode parser_mode,Component * username,Component * password,Component * hostname,Component * port_num)155 void DoParseAuthority(const CHAR* spec,
156                       const Component& auth,
157                       ParserMode parser_mode,
158                       Component* username,
159                       Component* password,
160                       Component* hostname,
161                       Component* port_num) {
162   DCHECK(auth.is_valid()) << "We should always get an authority";
163   if (auth.len == 0) {
164     username->reset();
165     password->reset();
166     if (parser_mode == ParserMode::kSpecialURL) {
167       hostname->reset();
168     } else {
169       // Non-special URLs can have an empty host. The difference between "host
170       // is empty" and "host does not exist" matters in the canonicalization
171       // phase.
172       //
173       // Examples:
174       // - "git:///" => host is empty (this case).
175       // - "git:/" => host does not exist.
176       *hostname = Component(auth.begin, 0);
177     }
178     port_num->reset();
179     return;
180   }
181 
182   // Search backwards for @, which is the separator between the user info and
183   // the server info.
184   int i = auth.begin + auth.len - 1;
185   while (i > auth.begin && spec[i] != '@')
186     i--;
187 
188   if (spec[i] == '@') {
189     // Found user info: <user-info>@<server-info>
190     ParseUserInfo(spec, Component(auth.begin, i - auth.begin), username,
191                   password);
192     ParseServerInfo(spec, MakeRange(i + 1, auth.begin + auth.len), hostname,
193                     port_num);
194   } else {
195     // No user info, everything is server info.
196     username->reset();
197     password->reset();
198     ParseServerInfo(spec, auth, hostname, port_num);
199   }
200 }
201 
202 template <typename CHAR>
FindQueryAndRefParts(const CHAR * spec,const Component & path,int * query_separator,int * ref_separator)203 inline void FindQueryAndRefParts(const CHAR* spec,
204                                  const Component& path,
205                                  int* query_separator,
206                                  int* ref_separator) {
207   if constexpr (sizeof(*spec) == 1) {
208     // memchr is much faster than any scalar code we can write.
209     const CHAR* ptr = spec + path.begin;
210     const CHAR* first_hash =
211         reinterpret_cast<const CHAR*>(memchr(ptr, '#', path.len));
212     size_t len_before_fragment =
213         first_hash == nullptr ? path.len : first_hash - ptr;
214     const CHAR* first_question =
215         reinterpret_cast<const CHAR*>(memchr(ptr, '?', len_before_fragment));
216     if (first_hash != nullptr) {
217       *ref_separator = first_hash - spec;
218     }
219     if (first_question != nullptr) {
220       *query_separator = first_question - spec;
221     }
222   } else {
223     int path_end = path.begin + path.len;
224     for (int i = path.begin; i < path_end; i++) {
225       switch (spec[i]) {
226         case '?':
227           // Only match the query string if it precedes the reference fragment
228           // and when we haven't found one already.
229           if (*query_separator < 0)
230             *query_separator = i;
231           break;
232         case '#':
233           // Record the first # sign only.
234           if (*ref_separator < 0) {
235             *ref_separator = i;
236             return;
237           }
238           break;
239       }
240     }
241   }
242 }
243 
244 template <typename CHAR>
ParsePath(const CHAR * spec,const Component & path,Component * filepath,Component * query,Component * ref)245 void ParsePath(const CHAR* spec,
246                const Component& path,
247                Component* filepath,
248                Component* query,
249                Component* ref) {
250   // path = [/]<segment1>/<segment2>/<...>/<segmentN>;<param>?<query>#<ref>
251   DCHECK(path.is_valid());
252 
253   // Search for first occurrence of either ? or #.
254   int query_separator = -1;  // Index of the '?'
255   int ref_separator = -1;    // Index of the '#'
256   FindQueryAndRefParts(spec, path, &query_separator, &ref_separator);
257 
258   // Markers pointing to the character after each of these corresponding
259   // components. The code below words from the end back to the beginning,
260   // and will update these indices as it finds components that exist.
261   int file_end, query_end;
262 
263   // Ref fragment: from the # to the end of the path.
264   int path_end = path.begin + path.len;
265   if (ref_separator >= 0) {
266     file_end = query_end = ref_separator;
267     *ref = MakeRange(ref_separator + 1, path_end);
268   } else {
269     file_end = query_end = path_end;
270     ref->reset();
271   }
272 
273   // Query fragment: everything from the ? to the next boundary (either the end
274   // of the path or the ref fragment).
275   if (query_separator >= 0) {
276     file_end = query_separator;
277     *query = MakeRange(query_separator + 1, query_end);
278   } else {
279     query->reset();
280   }
281 
282   if (file_end != path.begin) {
283     *filepath = MakeRange(path.begin, file_end);
284   } else {
285     // File path: treat an empty file path as no file path.
286     //
287     // TODO(crbug.com/1416006): Consider to assign zero-length path component
288     // for non-special URLs because a path can be empty in non-special URLs.
289     // Currently, we don't have to distinguish between them. There is no visible
290     // difference.
291     filepath->reset();
292   }
293 }
294 
295 template <typename CHAR>
DoExtractScheme(const CHAR * url,int url_len,Component * scheme)296 bool DoExtractScheme(const CHAR* url, int url_len, Component* scheme) {
297   // Skip leading whitespace and control characters.
298   int begin = 0;
299   while (begin < url_len && ShouldTrimFromURL(url[begin]))
300     begin++;
301   if (begin == url_len)
302     return false;  // Input is empty or all whitespace.
303 
304   // Find the first colon character.
305   for (int i = begin; i < url_len; i++) {
306     if (url[i] == ':') {
307       *scheme = MakeRange(begin, i);
308       return true;
309     }
310   }
311   return false;  // No colon found: no scheme
312 }
313 
314 // Fills in all members of the Parsed structure except for the scheme.
315 //
316 // |spec| is the full spec being parsed, of length |spec_len|.
317 // |after_scheme| is the character immediately following the scheme (after the
318 //   colon) where we'll begin parsing.
319 //
320 // Compatability data points. I list "host", "path" extracted:
321 // Input                IE6             Firefox                Us
322 // -----                --------------  --------------         --------------
323 // http://foo.com/      "foo.com", "/"  "foo.com", "/"         "foo.com", "/"
324 // http:foo.com/        "foo.com", "/"  "foo.com", "/"         "foo.com", "/"
325 // http:/foo.com/       fail(*)         "foo.com", "/"         "foo.com", "/"
326 // http:\foo.com/       fail(*)         "\foo.com", "/"(fail)  "foo.com", "/"
327 // http:////foo.com/    "foo.com", "/"  "foo.com", "/"         "foo.com", "/"
328 //
329 // (*) Interestingly, although IE fails to load these URLs, its history
330 // canonicalizer handles them, meaning if you've been to the corresponding
331 // "http://foo.com/" link, it will be colored.
332 template <typename CHAR>
DoParseAfterSpecialScheme(const CHAR * spec,int spec_len,int after_scheme,Parsed * parsed)333 void DoParseAfterSpecialScheme(const CHAR* spec,
334                                int spec_len,
335                                int after_scheme,
336                                Parsed* parsed) {
337   int num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len);
338   int after_slashes = after_scheme + num_slashes;
339 
340   // First split into two main parts, the authority (username, password, host,
341   // and port) and the full path (path, query, and reference).
342   //
343   // Treat everything from `after_slashes` to the next slash (or end of spec) to
344   // be the authority. Note that we ignore the number of slashes and treat it as
345   // the authority.
346   int end_auth = FindNextAuthorityTerminator(spec, after_slashes, spec_len,
347                                              ParserMode::kSpecialURL);
348 
349   Component authority(after_slashes, end_auth - after_slashes);
350   // Everything starting from the slash to the end is the path.
351   Component full_path(end_auth, spec_len - end_auth);
352 
353   // Now parse those two sub-parts.
354   DoParseAuthority(spec, authority, ParserMode::kSpecialURL, &parsed->username,
355                    &parsed->password, &parsed->host, &parsed->port);
356   ParsePath(spec, full_path, &parsed->path, &parsed->query, &parsed->ref);
357 }
358 
359 // The main parsing function for standard URLs. Standard URLs have a scheme,
360 // host, path, etc.
361 template <typename CHAR>
DoParseStandardURL(const CHAR * spec,int spec_len,Parsed * parsed)362 void DoParseStandardURL(const CHAR* spec, int spec_len, Parsed* parsed) {
363   DCHECK(spec_len >= 0);
364   parsed->has_opaque_path = false;
365 
366   // Strip leading & trailing spaces and control characters.
367   int begin = 0;
368   TrimURL(spec, &begin, &spec_len);
369 
370   int after_scheme;
371   if (DoExtractScheme(spec, spec_len, &parsed->scheme)) {
372     after_scheme = parsed->scheme.end() + 1;  // Skip past the colon.
373   } else {
374     // Say there's no scheme when there is no colon. We could also say that
375     // everything is the scheme. Both would produce an invalid URL, but this way
376     // seems less wrong in more cases.
377     parsed->scheme.reset();
378     after_scheme = begin;
379   }
380   DoParseAfterSpecialScheme(spec, spec_len, after_scheme, parsed);
381 }
382 
383 template <typename CHAR>
DoParseAfterNonSpecialScheme(const CHAR * spec,int spec_len,int after_scheme,Parsed * parsed)384 void DoParseAfterNonSpecialScheme(const CHAR* spec,
385                                   int spec_len,
386                                   int after_scheme,
387                                   Parsed* parsed) {
388   // The implementation is similar to `DoParseAfterSpecialScheme()`, but there
389   // are many subtle differences. So we have a different function for parsing
390   // non-special URLs.
391 
392   int num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len);
393 
394   if (num_slashes >= 2) {
395     // Found "//<some data>", looks like an authority section.
396     //
397     // e.g.
398     //   "git://host:8000/path"
399     //          ^
400     //
401     // The state machine transition in the URL Standard is:
402     //
403     // https://url.spec.whatwg.org/#scheme-state
404     // => https://url.spec.whatwg.org/#path-or-authority-state
405     // => https://url.spec.whatwg.org/#authority-state
406     //
407     parsed->has_opaque_path = false;
408 
409     int after_slashes = after_scheme + 2;
410 
411     // First split into two main parts, the authority (username, password, host,
412     // and port) and the full path (path, query, and reference).
413     //
414     // Treat everything from there to the next slash (or end of spec) to be the
415     // authority. Note that we ignore the number of slashes and treat it as the
416     // authority.
417     int end_auth = FindNextAuthorityTerminator(spec, after_slashes, spec_len,
418                                                ParserMode::kNonSpecialURL);
419     Component authority(after_slashes, end_auth - after_slashes);
420 
421     // Now parse those two sub-parts.
422     DoParseAuthority(spec, authority, ParserMode::kNonSpecialURL,
423                      &parsed->username, &parsed->password, &parsed->host,
424                      &parsed->port);
425 
426     // Everything starting from the slash to the end is the path.
427     Component full_path(end_auth, spec_len - end_auth);
428     ParsePath(spec, full_path, &parsed->path, &parsed->query, &parsed->ref);
429     return;
430   }
431 
432   if (num_slashes == 1) {
433     // Examples:
434     //   "git:/path"
435     //        ^
436     //
437     // The state machine transition in the URL Standard is:
438     //
439     // https://url.spec.whatwg.org/#scheme-state
440     // => https://url.spec.whatwg.org/#path-or-authority-state
441     // => https://url.spec.whatwg.org/#path-state
442     parsed->has_opaque_path = false;
443   } else {
444     // We didn't found "//" nor "/", so entering into an opaque-path-state.
445     //
446     // Examples:
447     //   "git:opaque path"
448     //        ^
449     //
450     // The state machine transition in the URL Standard is:
451     //
452     // https://url.spec.whatwg.org/#scheme-state
453     // => https://url.spec.whatwg.org/#cannot-be-a-base-url-path-state
454     parsed->has_opaque_path = true;
455   }
456 
457   parsed->username.reset();
458   parsed->password.reset();
459   // It's important to reset `parsed->host` here to distinguish between "host
460   // is empty" and "host doesn't exist".
461   parsed->host.reset();
462   parsed->port.reset();
463 
464   // Everything starting after scheme to the end is the path.
465   Component full_path(after_scheme, spec_len - after_scheme);
466   ParsePath(spec, full_path, &parsed->path, &parsed->query, &parsed->ref);
467 }
468 
469 // The main parsing function for non-special scheme URLs.
470 template <typename CHAR>
DoParseNonSpecialURL(const CHAR * spec,int spec_len,Parsed * parsed)471 void DoParseNonSpecialURL(const CHAR* spec, int spec_len, Parsed* parsed) {
472   DCHECK(spec_len >= 0);
473 
474   // Strip leading & trailing spaces and control characters.
475   int begin = 0;
476   TrimURL(spec, &begin, &spec_len);
477 
478   int after_scheme;
479   if (DoExtractScheme(spec, spec_len, &parsed->scheme)) {
480     after_scheme = parsed->scheme.end() + 1;  // Skip past the colon.
481   } else {
482     // Say there's no scheme when there is no colon. We could also say that
483     // everything is the scheme. Both would produce an invalid URL, but this way
484     // seems less wrong in more cases.
485     parsed->scheme.reset();
486     after_scheme = 0;
487   }
488   DoParseAfterNonSpecialScheme(spec, spec_len, after_scheme, parsed);
489 }
490 
491 template <typename CHAR>
DoParseFileSystemURL(const CHAR * spec,int spec_len,Parsed * parsed)492 void DoParseFileSystemURL(const CHAR* spec, int spec_len, Parsed* parsed) {
493   DCHECK(spec_len >= 0);
494 
495   // Get the unused parts of the URL out of the way.
496   parsed->username.reset();
497   parsed->password.reset();
498   parsed->host.reset();
499   parsed->port.reset();
500   parsed->path.reset();          // May use this; reset for convenience.
501   parsed->ref.reset();           // May use this; reset for convenience.
502   parsed->query.reset();         // May use this; reset for convenience.
503   parsed->clear_inner_parsed();  // May use this; reset for convenience.
504   parsed->has_opaque_path = false;
505 
506   // Strip leading & trailing spaces and control characters.
507   int begin = 0;
508   TrimURL(spec, &begin, &spec_len);
509 
510   // Handle empty specs or ones that contain only whitespace or control chars.
511   if (begin == spec_len) {
512     parsed->scheme.reset();
513     return;
514   }
515 
516   int inner_start = -1;
517 
518   // Extract the scheme.  We also handle the case where there is no scheme.
519   if (DoExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) {
520     // Offset the results since we gave ExtractScheme a substring.
521     parsed->scheme.begin += begin;
522 
523     if (parsed->scheme.end() == spec_len - 1)
524       return;
525 
526     inner_start = parsed->scheme.end() + 1;
527   } else {
528     // No scheme found; that's not valid for filesystem URLs.
529     parsed->scheme.reset();
530     return;
531   }
532 
533   Component inner_scheme;
534   const CHAR* inner_spec = &spec[inner_start];
535   int inner_spec_len = spec_len - inner_start;
536 
537   if (DoExtractScheme(inner_spec, inner_spec_len, &inner_scheme)) {
538     // Offset the results since we gave ExtractScheme a substring.
539     inner_scheme.begin += inner_start;
540 
541     if (inner_scheme.end() == spec_len - 1)
542       return;
543   } else {
544     // No scheme found; that's not valid for filesystem URLs.
545     // The best we can do is return "filesystem://".
546     return;
547   }
548 
549   Parsed inner_parsed;
550 
551   if (CompareSchemeComponent(spec, inner_scheme, kFileScheme)) {
552     // File URLs are special.
553     ParseFileURL(inner_spec, inner_spec_len, &inner_parsed);
554   } else if (CompareSchemeComponent(spec, inner_scheme, kFileSystemScheme)) {
555     // Filesystem URLs don't nest.
556     return;
557   } else if (IsStandard(spec, inner_scheme)) {
558     // All "normal" URLs.
559     DoParseStandardURL(inner_spec, inner_spec_len, &inner_parsed);
560   } else {
561     return;
562   }
563 
564   // All members of inner_parsed need to be offset by inner_start.
565   // If we had any scheme that supported nesting more than one level deep,
566   // we'd have to recurse into the inner_parsed's inner_parsed when
567   // adjusting by inner_start.
568   inner_parsed.scheme.begin += inner_start;
569   inner_parsed.username.begin += inner_start;
570   inner_parsed.password.begin += inner_start;
571   inner_parsed.host.begin += inner_start;
572   inner_parsed.port.begin += inner_start;
573   inner_parsed.query.begin += inner_start;
574   inner_parsed.ref.begin += inner_start;
575   inner_parsed.path.begin += inner_start;
576 
577   // Query and ref move from inner_parsed to parsed.
578   parsed->query = inner_parsed.query;
579   inner_parsed.query.reset();
580   parsed->ref = inner_parsed.ref;
581   inner_parsed.ref.reset();
582 
583   parsed->set_inner_parsed(inner_parsed);
584   if (!inner_parsed.scheme.is_valid() || !inner_parsed.path.is_valid() ||
585       inner_parsed.inner_parsed()) {
586     return;
587   }
588 
589   // The path in inner_parsed should start with a slash, then have a filesystem
590   // type followed by a slash.  From the first slash up to but excluding the
591   // second should be what it keeps; the rest goes to parsed.  If the path ends
592   // before the second slash, it's still pretty clear what the user meant, so
593   // we'll let that through.
594   if (!IsSlashOrBackslash(spec[inner_parsed.path.begin])) {
595     return;
596   }
597   int inner_path_end = inner_parsed.path.begin + 1;  // skip the leading slash
598   while (inner_path_end < spec_len &&
599          !IsSlashOrBackslash(spec[inner_path_end])) {
600     ++inner_path_end;
601   }
602   parsed->path.begin = inner_path_end;
603   int new_inner_path_length = inner_path_end - inner_parsed.path.begin;
604   parsed->path.len = inner_parsed.path.len - new_inner_path_length;
605   parsed->inner_parsed()->path.len = new_inner_path_length;
606 }
607 
608 // Initializes a path URL which is merely a scheme followed by a path. Examples
609 // include "about:foo" and "javascript:alert('bar');"
610 template <typename CHAR>
DoParsePathURL(const CHAR * spec,int spec_len,bool trim_path_end,Parsed * parsed)611 void DoParsePathURL(const CHAR* spec,
612                     int spec_len,
613                     bool trim_path_end,
614                     Parsed* parsed) {
615   // Get the non-path and non-scheme parts of the URL out of the way, we never
616   // use them.
617   parsed->username.reset();
618   parsed->password.reset();
619   parsed->host.reset();
620   parsed->port.reset();
621   parsed->path.reset();
622   parsed->query.reset();
623   parsed->ref.reset();
624   // In practice, we don't need to set `has_opaque_path` here because:
625   //
626   // 1. `has_opaque_path` will be used only when the
627   //     `kStandardCompliantNonSpecialSchemeURLParsing` feature is enabled.
628   // 2. `DoParsePathURL` will not be used when the flag is enabled (planned).
629   //
630   // However, for predictable results, it is better to explicitly set it
631   // `false`.
632   parsed->has_opaque_path = false;
633 
634   // Strip leading & trailing spaces and control characters.
635   int scheme_begin = 0;
636   TrimURL(spec, &scheme_begin, &spec_len, trim_path_end);
637 
638   // Handle empty specs or ones that contain only whitespace or control chars.
639   if (scheme_begin == spec_len) {
640     parsed->scheme.reset();
641     parsed->path.reset();
642     return;
643   }
644 
645   int path_begin;
646   // Extract the scheme, with the path being everything following. We also
647   // handle the case where there is no scheme.
648   if (ExtractScheme(&spec[scheme_begin], spec_len - scheme_begin,
649                     &parsed->scheme)) {
650     // Offset the results since we gave ExtractScheme a substring.
651     parsed->scheme.begin += scheme_begin;
652     path_begin = parsed->scheme.end() + 1;
653   } else {
654     // No scheme case.
655     parsed->scheme.reset();
656     path_begin = scheme_begin;
657   }
658 
659   if (path_begin == spec_len)
660     return;
661   DCHECK_LT(path_begin, spec_len);
662 
663   ParsePath(spec, MakeRange(path_begin, spec_len), &parsed->path,
664             &parsed->query, &parsed->ref);
665 }
666 
667 template <typename CHAR>
DoParseMailtoURL(const CHAR * spec,int spec_len,Parsed * parsed)668 void DoParseMailtoURL(const CHAR* spec, int spec_len, Parsed* parsed) {
669   DCHECK(spec_len >= 0);
670 
671   // Get the non-path and non-scheme parts of the URL out of the way, we never
672   // use them.
673   parsed->username.reset();
674   parsed->password.reset();
675   parsed->host.reset();
676   parsed->port.reset();
677   parsed->ref.reset();
678   parsed->query.reset();  // May use this; reset for convenience.
679   parsed->has_opaque_path = false;
680 
681   // Strip leading & trailing spaces and control characters.
682   int begin = 0;
683   TrimURL(spec, &begin, &spec_len);
684 
685   // Handle empty specs or ones that contain only whitespace or control chars.
686   if (begin == spec_len) {
687     parsed->scheme.reset();
688     parsed->path.reset();
689     return;
690   }
691 
692   int path_begin = -1;
693   int path_end = -1;
694 
695   // Extract the scheme, with the path being everything following. We also
696   // handle the case where there is no scheme.
697   if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) {
698     // Offset the results since we gave ExtractScheme a substring.
699     parsed->scheme.begin += begin;
700 
701     if (parsed->scheme.end() != spec_len - 1) {
702       path_begin = parsed->scheme.end() + 1;
703       path_end = spec_len;
704     }
705   } else {
706     // No scheme found, just path.
707     parsed->scheme.reset();
708     path_begin = begin;
709     path_end = spec_len;
710   }
711 
712   // Split [path_begin, path_end) into a path + query.
713   for (int i = path_begin; i < path_end; ++i) {
714     if (spec[i] == '?') {
715       parsed->query = MakeRange(i + 1, path_end);
716       path_end = i;
717       break;
718     }
719   }
720 
721   // For compatability with the standard URL parser, treat no path as
722   // -1, rather than having a length of 0
723   if (path_begin == path_end) {
724     parsed->path.reset();
725   } else {
726     parsed->path = MakeRange(path_begin, path_end);
727   }
728 }
729 
730 // Converts a port number in a string to an integer. We'd like to just call
731 // sscanf but our input is not NULL-terminated, which sscanf requires. Instead,
732 // we copy the digits to a small stack buffer (since we know the maximum number
733 // of digits in a valid port number) that we can NULL terminate.
734 template <typename CHAR>
DoParsePort(const CHAR * spec,const Component & component)735 int DoParsePort(const CHAR* spec, const Component& component) {
736   // Easy success case when there is no port.
737   const int kMaxDigits = 5;
738   if (component.is_empty())
739     return PORT_UNSPECIFIED;
740 
741   // Skip over any leading 0s.
742   Component digits_comp(component.end(), 0);
743   for (int i = 0; i < component.len; i++) {
744     if (spec[component.begin + i] != '0') {
745       digits_comp = MakeRange(component.begin + i, component.end());
746       break;
747     }
748   }
749   if (digits_comp.len == 0)
750     return 0;  // All digits were 0.
751 
752   // Verify we don't have too many digits (we'll be copying to our buffer so
753   // we need to double-check).
754   if (digits_comp.len > kMaxDigits)
755     return PORT_INVALID;
756 
757   // Copy valid digits to the buffer.
758   char digits[kMaxDigits + 1];  // +1 for null terminator
759   for (int i = 0; i < digits_comp.len; i++) {
760     CHAR ch = spec[digits_comp.begin + i];
761     if (!IsPortDigit(ch)) {
762       // Invalid port digit, fail.
763       return PORT_INVALID;
764     }
765     digits[i] = static_cast<char>(ch);
766   }
767 
768   // Null-terminate the string and convert to integer. Since we guarantee
769   // only digits, atoi's lack of error handling is OK.
770   digits[digits_comp.len] = 0;
771   int port = atoi(digits);
772   if (port > 65535)
773     return PORT_INVALID;  // Out of range.
774   return port;
775 }
776 
777 template <typename CHAR>
DoExtractFileName(const CHAR * spec,const Component & path,Component * file_name)778 void DoExtractFileName(const CHAR* spec,
779                        const Component& path,
780                        Component* file_name) {
781   // Handle empty paths: they have no file names.
782   if (path.is_empty()) {
783     file_name->reset();
784     return;
785   }
786 
787   // Extract the filename range from the path which is between
788   // the last slash and the following semicolon.
789   int file_end = path.end();
790   for (int i = path.end() - 1; i >= path.begin; i--) {
791     if (spec[i] == ';') {
792       file_end = i;
793     } else if (IsSlashOrBackslash(spec[i])) {
794       // File name is everything following this character to the end
795       *file_name = MakeRange(i + 1, file_end);
796       return;
797     }
798   }
799 
800   // No slash found, this means the input was degenerate (generally paths
801   // will start with a slash). Let's call everything the file name.
802   *file_name = MakeRange(path.begin, file_end);
803   return;
804 }
805 
806 template <typename CHAR>
DoExtractQueryKeyValue(const CHAR * spec,Component * query,Component * key,Component * value)807 bool DoExtractQueryKeyValue(const CHAR* spec,
808                             Component* query,
809                             Component* key,
810                             Component* value) {
811   if (!query->is_nonempty())
812     return false;
813 
814   int start = query->begin;
815   int cur = start;
816   int end = query->end();
817 
818   // We assume the beginning of the input is the beginning of the "key" and we
819   // skip to the end of it.
820   key->begin = cur;
821   while (cur < end && spec[cur] != '&' && spec[cur] != '=')
822     cur++;
823   key->len = cur - key->begin;
824 
825   // Skip the separator after the key (if any).
826   if (cur < end && spec[cur] == '=')
827     cur++;
828 
829   // Find the value part.
830   value->begin = cur;
831   while (cur < end && spec[cur] != '&')
832     cur++;
833   value->len = cur - value->begin;
834 
835   // Finally skip the next separator if any
836   if (cur < end && spec[cur] == '&')
837     cur++;
838 
839   // Save the new query
840   *query = MakeRange(cur, end);
841   return true;
842 }
843 
844 }  // namespace
845 
846 COMPONENT_EXPORT(URL)
847 std::ostream& operator<<(std::ostream& os, const Component& component) {
848   return os << '{' << component.begin << ", " << component.len << "}";
849 }
850 
// Default construction is compiler-generated; members take whatever initial
// values their declarations give them.
Parsed::Parsed() = default;
852 
// Copy constructor. Copies every component range and both flags from |other|
// member by member. The inner Parsed is not copied as a pointer; instead,
// when |other| has one, the pointed-to object is handed to set_inner_parsed()
// (defined elsewhere -- presumably it stores a separate copy; confirm there).
Parsed::Parsed(const Parsed& other)
    : scheme(other.scheme),
      username(other.username),
      password(other.password),
      host(other.host),
      port(other.port),
      path(other.path),
      query(other.query),
      ref(other.ref),
      potentially_dangling_markup(other.potentially_dangling_markup),
      has_opaque_path(other.has_opaque_path) {
  if (other.inner_parsed_)
    set_inner_parsed(*other.inner_parsed_);
}
867 
// Copy assignment. Self-assignment is a no-op. Otherwise every component
// range and flag is taken from |other|, and the inner Parsed is either
// replaced via set_inner_parsed() or cleared, mirroring whether |other| has
// one.
Parsed& Parsed::operator=(const Parsed& other) {
  if (this == &other)
    return *this;

  scheme = other.scheme;
  username = other.username;
  password = other.password;
  host = other.host;
  port = other.port;
  path = other.path;
  query = other.query;
  ref = other.ref;
  potentially_dangling_markup = other.potentially_dangling_markup;
  has_opaque_path = other.has_opaque_path;

  if (!other.inner_parsed_) {
    clear_inner_parsed();
  } else {
    set_inner_parsed(*other.inner_parsed_);
  }
  return *this;
}
887 
// Releases the inner Parsed, if any. Deleting a null pointer is a no-op, so
// no explicit check is needed.
Parsed::~Parsed() {
  delete inner_parsed_;
}
891 
Length() const892 int Parsed::Length() const {
893   if (ref.is_valid())
894     return ref.end();
895   return CountCharactersBefore(REF, false);
896 }
897 
// Returns the offset at which the component |type| begins, or, when that
// component is not valid, the offset of the first valid component that comes
// after it. If nothing at or after |type| is valid, the running position just
// past the last valid earlier component is returned.
//
// |include_delimiter| only matters for PORT, QUERY and REF when |type| names
// that exact component and it is valid: if true, the returned offset is that
// of the delimiter character preceding the component (begin - 1); if false,
// it is the component's own begin.
//
// The checks below rely on the ComponentType values being ordered the same
// way the components are tested here (note the `<=` / `<` comparisons).
int Parsed::CountCharactersBefore(ComponentType type,
                                  bool include_delimiter) const {
  if (type == SCHEME)
    return scheme.begin;

  // There will be some characters after the scheme like "://" and we don't
  // know how many. Search forwards for the next thing until we find one.
  // |cur| tracks the position just past the most recent valid component.
  int cur = 0;
  if (scheme.is_valid())
    cur = scheme.end() + 1;  // Advance over the ':' at the end of the scheme.

  if (username.is_valid()) {
    if (type <= USERNAME)
      return username.begin;
    cur = username.end() + 1;  // Advance over the '@' or ':' at the end.
  }

  if (password.is_valid()) {
    if (type <= PASSWORD)
      return password.begin;
    cur = password.end() + 1;  // Advance over the '@' at the end.
  }

  if (host.is_valid()) {
    if (type <= HOST)
      return host.begin;
    cur = host.end();
  }

  if (port.is_valid()) {
    // An earlier, missing component always sits before the port's delimiter.
    if (type < PORT || (type == PORT && include_delimiter))
      return port.begin - 1;  // Back over delimiter.
    if (type == PORT)
      return port.begin;  // Don't want delimiter counted.
    cur = port.end();
  }

  if (path.is_valid()) {
    if (type <= PATH)
      return path.begin;
    cur = path.end();
  }

  if (query.is_valid()) {
    // An earlier, missing component always sits before the query's delimiter.
    if (type < QUERY || (type == QUERY && include_delimiter))
      return query.begin - 1;  // Back over delimiter.
    if (type == QUERY)
      return query.begin;  // Don't want delimiter counted.
    cur = query.end();
  }

  if (ref.is_valid()) {
    if (type == REF && !include_delimiter)
      return ref.begin;  // Start of the ref itself, just past its delimiter.

    // Otherwise either the ref was requested together with its delimiter, or
    // the component we wanted was before this and not found. In both cases
    // the answer is the position of the ref's delimiter.
    return ref.begin - 1;  // Back over delimiter.
  }

  return cur;
}
960 
GetContent() const961 Component Parsed::GetContent() const {
962   const int begin = CountCharactersBefore(USERNAME, false);
963   const int len = Length() - begin;
964   // For compatability with the standard URL parser, we treat no content as
965   // -1, rather than having a length of 0 (we normally wouldn't care so
966   // much for these non-standard URLs).
967   return len ? Component(begin, len) : Component();
968 }
969 
// `char` overload: forwards to the templated DoExtractScheme() and returns its
// result unchanged.
bool ExtractScheme(const char* url, int url_len, Component* scheme) {
  return DoExtractScheme(url, url_len, scheme);
}
973 
// `char16_t` overload: forwards to the templated DoExtractScheme() and returns
// its result unchanged.
bool ExtractScheme(const char16_t* url, int url_len, Component* scheme) {
  return DoExtractScheme(url, url_len, scheme);
}
977 
978 // This handles everything that may be an authority terminator.
979 //
980 // URL Standard:
981 // https://url.spec.whatwg.org/#authority-state
982 // >> 2. Otherwise, if one of the following is true:
983 // >>    - c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
984 // >>    - url is special and c is U+005C (\)
IsAuthorityTerminator(char16_t ch,ParserMode parser_mode)985 bool IsAuthorityTerminator(char16_t ch, ParserMode parser_mode) {
986   if (parser_mode == ParserMode::kSpecialURL) {
987     return IsSlashOrBackslash(ch) || ch == '?' || ch == '#';
988   }
989   return ch == '/' || ch == '?' || ch == '#';
990 }
991 
// `char` overload: forwards to the templated DoExtractFileName(), which
// writes the result into |file_name|.
void ExtractFileName(const char* url,
                     const Component& path,
                     Component* file_name) {
  DoExtractFileName(url, path, file_name);
}
997 
// `char16_t` overload: forwards to the templated DoExtractFileName(), which
// writes the result into |file_name|.
void ExtractFileName(const char16_t* url,
                     const Component& path,
                     Component* file_name) {
  DoExtractFileName(url, path, file_name);
}
1003 
// `char` overload: forwards to the templated DoExtractQueryKeyValue() and
// returns its result. |query| is an in/out parameter that the helper
// advances past the extracted pair.
bool ExtractQueryKeyValue(const char* url,
                          Component* query,
                          Component* key,
                          Component* value) {
  return DoExtractQueryKeyValue(url, query, key, value);
}
1010 
// `char16_t` overload: forwards to the templated DoExtractQueryKeyValue() and
// returns its result. |query| is an in/out parameter that the helper
// advances past the extracted pair.
bool ExtractQueryKeyValue(const char16_t* url,
                          Component* query,
                          Component* key,
                          Component* value) {
  return DoExtractQueryKeyValue(url, query, key, value);
}
1017 
// `char` overload without an explicit parser mode: forwards to
// DoParseAuthority() using ParserMode::kSpecialURL.
void ParseAuthority(const char* spec,
                    const Component& auth,
                    Component* username,
                    Component* password,
                    Component* hostname,
                    Component* port_num) {
  DoParseAuthority(spec, auth, ParserMode::kSpecialURL, username, password,
                   hostname, port_num);
}
1027 
// `char16_t` overload without an explicit parser mode: forwards to
// DoParseAuthority() using ParserMode::kSpecialURL.
void ParseAuthority(const char16_t* spec,
                    const Component& auth,
                    Component* username,
                    Component* password,
                    Component* hostname,
                    Component* port_num) {
  DoParseAuthority(spec, auth, ParserMode::kSpecialURL, username, password,
                   hostname, port_num);
}
1037 
// `char` overload with a caller-supplied |parser_mode|: forwards all
// arguments to DoParseAuthority() unchanged.
void ParseAuthority(const char* spec,
                    const Component& auth,
                    ParserMode parser_mode,
                    Component* username,
                    Component* password,
                    Component* hostname,
                    Component* port_num) {
  DoParseAuthority(spec, auth, parser_mode, username, password, hostname,
                   port_num);
}
1048 
// `char16_t` overload with a caller-supplied |parser_mode|: forwards all
// arguments to DoParseAuthority() unchanged.
void ParseAuthority(const char16_t* spec,
                    const Component& auth,
                    ParserMode parser_mode,
                    Component* username,
                    Component* password,
                    Component* hostname,
                    Component* port_num) {
  DoParseAuthority(spec, auth, parser_mode, username, password, hostname,
                   port_num);
}
1059 
// `char` overload: forwards to the templated DoParsePort() and returns its
// result (a port number, PORT_UNSPECIFIED, or PORT_INVALID).
int ParsePort(const char* url, const Component& port) {
  return DoParsePort(url, port);
}
1063 
// `char16_t` overload: forwards to the templated DoParsePort() and returns its
// result (a port number, PORT_UNSPECIFIED, or PORT_INVALID).
int ParsePort(const char16_t* url, const Component& port) {
  return DoParsePort(url, port);
}
1067 
// `char` overload: forwards to the templated DoParseStandardURL(), which
// fills in |parsed|.
void ParseStandardURL(const char* url, int url_len, Parsed* parsed) {
  DoParseStandardURL(url, url_len, parsed);
}
1071 
// `char16_t` overload: forwards to the templated DoParseStandardURL(), which
// fills in |parsed|.
void ParseStandardURL(const char16_t* url, int url_len, Parsed* parsed) {
  DoParseStandardURL(url, url_len, parsed);
}
1075 
// `char` overload: forwards to the templated DoParseNonSpecialURL(), which
// fills in |parsed|.
void ParseNonSpecialURL(const char* url, int url_len, Parsed* parsed) {
  DoParseNonSpecialURL(url, url_len, parsed);
}
1079 
// `char16_t` overload: forwards to the templated DoParseNonSpecialURL(), which
// fills in |parsed|.
void ParseNonSpecialURL(const char16_t* url, int url_len, Parsed* parsed) {
  DoParseNonSpecialURL(url, url_len, parsed);
}
1083 
// `char` overload: forwards to the templated DoParsePathURL(), passing
// |trim_path_end| through unchanged.
void ParsePathURL(const char* url,
                  int url_len,
                  bool trim_path_end,
                  Parsed* parsed) {
  DoParsePathURL(url, url_len, trim_path_end, parsed);
}
1090 
// `char16_t` overload: forwards to the templated DoParsePathURL(), passing
// |trim_path_end| through unchanged.
void ParsePathURL(const char16_t* url,
                  int url_len,
                  bool trim_path_end,
                  Parsed* parsed) {
  DoParsePathURL(url, url_len, trim_path_end, parsed);
}
1097 
// `char` overload: forwards to the templated DoParseFileSystemURL(), which
// fills in |parsed|.
void ParseFileSystemURL(const char* url, int url_len, Parsed* parsed) {
  DoParseFileSystemURL(url, url_len, parsed);
}
1101 
// `char16_t` overload: forwards to the templated DoParseFileSystemURL(), which
// fills in |parsed|.
void ParseFileSystemURL(const char16_t* url, int url_len, Parsed* parsed) {
  DoParseFileSystemURL(url, url_len, parsed);
}
1105 
// `char` overload: forwards to the templated DoParseMailtoURL(), which fills
// in |parsed|.
void ParseMailtoURL(const char* url, int url_len, Parsed* parsed) {
  DoParseMailtoURL(url, url_len, parsed);
}
1109 
// `char16_t` overload: forwards to the templated DoParseMailtoURL(), which
// fills in |parsed|.
void ParseMailtoURL(const char16_t* url, int url_len, Parsed* parsed) {
  DoParseMailtoURL(url, url_len, parsed);
}
1113 
// `char` overload: exposes ParsePath() under this name, forwarding every
// argument unchanged. The results are written to |filepath|, |query| and
// |ref|.
void ParsePathInternal(const char* spec,
                       const Component& path,
                       Component* filepath,
                       Component* query,
                       Component* ref) {
  ParsePath(spec, path, filepath, query, ref);
}
1121 
// `char16_t` overload: exposes ParsePath() under this name, forwarding every
// argument unchanged. The results are written to |filepath|, |query| and
// |ref|.
void ParsePathInternal(const char16_t* spec,
                       const Component& path,
                       Component* filepath,
                       Component* query,
                       Component* ref) {
  ParsePath(spec, path, filepath, query, ref);
}
1129 
// `char` overload: forwards to the templated DoParseAfterSpecialScheme().
// |after_scheme| is passed through as given (presumably the offset in |spec|
// just past the scheme -- confirm against the helper).
void ParseAfterSpecialScheme(const char* spec,
                             int spec_len,
                             int after_scheme,
                             Parsed* parsed) {
  DoParseAfterSpecialScheme(spec, spec_len, after_scheme, parsed);
}
1136 
// `char16_t` overload: forwards to the templated DoParseAfterSpecialScheme().
// |after_scheme| is passed through as given (presumably the offset in |spec|
// just past the scheme -- confirm against the helper).
void ParseAfterSpecialScheme(const char16_t* spec,
                             int spec_len,
                             int after_scheme,
                             Parsed* parsed) {
  DoParseAfterSpecialScheme(spec, spec_len, after_scheme, parsed);
}
1143 
1144 }  // namespace url
1145