• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2013-2016 The rust-url developers.
2 //
3 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6 // option. This file may not be copied, modified, or distributed
7 // except according to those terms.
8 
9 use std::error::Error;
10 use std::fmt::{self, Formatter, Write};
11 use std::str;
12 
13 use crate::host::{Host, HostInternal};
14 use crate::Url;
15 use form_urlencoded::EncodingOverride;
16 use percent_encoding::{percent_encode, utf8_percent_encode, AsciiSet, CONTROLS};
17 
/// Fragment percent-encode set: the C0 control set plus space, `"`, `<`, `>` and `` ` ``.
///
/// https://url.spec.whatwg.org/#fragment-percent-encode-set
const FRAGMENT: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'<').add(b'>').add(b'`');

/// Path percent-encode set: everything in `FRAGMENT` plus `#`, `?`, `{` and `}`.
///
/// https://url.spec.whatwg.org/#path-percent-encode-set
const PATH: &AsciiSet = &FRAGMENT.add(b'#').add(b'?').add(b'{').add(b'}');

/// Userinfo percent-encode set: everything in `PATH` plus
/// `/ : ; = @ [ \ ] ^ |`.
///
/// https://url.spec.whatwg.org/#userinfo-percent-encode-set
pub(crate) const USERINFO: &AsciiSet = &PATH
    .add(b'/')
    .add(b':')
    .add(b';')
    .add(b'=')
    .add(b'@')
    .add(b'[')
    .add(b'\\')
    .add(b']')
    .add(b'^')
    .add(b'|');

/// `PATH` plus `/` and `%`.
// NOTE(review): presumably used when a caller supplies one path segment as
// data, so separators and existing escapes must themselves be escaped — confirm
// against the path-segment setters.
pub(crate) const PATH_SEGMENT: &AsciiSet = &PATH.add(b'/').add(b'%');

// The backslash (\) character is treated as a path separator in special URLs
// so it needs to be additionally escaped in that case.
pub(crate) const SPECIAL_PATH_SEGMENT: &AsciiSet = &PATH_SEGMENT.add(b'\\');

// Query percent-encode set: the C0 control set plus space, `"`, `#`, `<` and `>`.
// https://url.spec.whatwg.org/#query-state
const QUERY: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'#').add(b'<').add(b'>');
// `QUERY` plus the apostrophe.
// NOTE(review): by its name, meant for URLs with a special scheme — confirm at the use site.
const SPECIAL_QUERY: &AsciiSet = &QUERY.add(b'\'');
46 
/// Shorthand for a `Result` whose error type is [`ParseError`].
pub type ParseResult<T> = Result<T, ParseError>;
48 
// Generates the `ParseError` enum from a list of `Variant => "message",` pairs.
//
// Each pair becomes a field-less variant, and the generated `Display`
// implementation writes the paired message for that variant.
macro_rules! simple_enum_error {
    ($($name: ident => $description: expr,)+) => {
        /// Errors that can occur during parsing.
        ///
        /// This may be extended in the future, which is why the enum is
        /// marked `#[non_exhaustive]`: matches written outside this crate
        /// need a wildcard arm.
        #[derive(PartialEq, Eq, Clone, Copy, Debug)]
        #[non_exhaustive]
        pub enum ParseError {
            $(
                $name,
            )+
        }

        impl fmt::Display for ParseError {
            // Writes the variant's message verbatim via `write_str`.
            fn fmt(&self, fmt: &mut Formatter<'_>) -> fmt::Result {
                match *self {
                    $(
                        ParseError::$name => fmt.write_str($description),
                    )+
                }
            }
        }
    }
}
74 
// `ParseError` relies on the default `Error` methods; its `Debug` and
// `Display` implementations come from `simple_enum_error!`.
impl Error for ParseError {}

// The variants of `ParseError`, each paired with the message its `Display`
// implementation writes.
simple_enum_error! {
    EmptyHost => "empty host",
    IdnaError => "invalid international domain name",
    InvalidPort => "invalid port number",
    InvalidIpv4Address => "invalid IPv4 address",
    InvalidIpv6Address => "invalid IPv6 address",
    InvalidDomainCharacter => "invalid domain character",
    RelativeUrlWithoutBase => "relative URL without a base",
    RelativeUrlWithCannotBeABaseBase => "relative URL with a cannot-be-a-base base",
    SetHostOnCannotBeABaseUrl => "a cannot-be-a-base URL doesn’t have a host to set",
    Overflow => "URLs more than 4 GB are not supported",
}
89 
90 impl From<::idna::Errors> for ParseError {
from(_: ::idna::Errors) -> ParseError91     fn from(_: ::idna::Errors) -> ParseError {
92         ParseError::IdnaError
93     }
94 }
95 
// Generates the `SyntaxViolation` enum from a list of `Variant => "message",`
// pairs, together with a `description()` accessor returning the paired message.
macro_rules! syntax_violation_enum {
    ($($name: ident => $description: expr,)+) => {
        /// Non-fatal syntax violations that can occur during parsing.
        ///
        /// This may be extended in the future, which is why the enum is
        /// marked `#[non_exhaustive]`: matches written outside this crate
        /// need a wildcard arm.
        #[derive(PartialEq, Eq, Clone, Copy, Debug)]
        #[non_exhaustive]
        pub enum SyntaxViolation {
            $(
                $name,
            )+
        }

        impl SyntaxViolation {
            /// Returns the static, human-readable message for this violation.
            pub fn description(&self) -> &'static str {
                match *self {
                    $(
                        SyntaxViolation::$name => $description,
                    )+
                }
            }
        }
    }
}
121 
// The variants of `SyntaxViolation`, each paired with the message returned by
// `SyntaxViolation::description` (and therefore by `Display`).
syntax_violation_enum! {
    Backslash => "backslash",
    C0SpaceIgnored =>
        "leading or trailing control or space character are ignored in URLs",
    EmbeddedCredentials =>
        "embedding authentication information (username or password) \
         in an URL is not recommended",
    ExpectedDoubleSlash => "expected //",
    ExpectedFileDoubleSlash => "expected // after file:",
    FileWithHostAndWindowsDrive => "file: with host and Windows drive letter",
    NonUrlCodePoint => "non-URL code point",
    NullInFragment => "NULL characters are ignored in URL fragment identifiers",
    PercentDecode => "expected 2 hex digits after %",
    TabOrNewlineIgnored => "tabs or newlines are ignored in URLs",
    UnencodedAtSign => "unencoded @ sign in username or password",
}
138 
139 impl fmt::Display for SyntaxViolation {
fmt(&self, f: &mut Formatter<'_>) -> fmt::Result140     fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
141         fmt::Display::fmt(self.description(), f)
142     }
143 }
144 
/// How the parser classifies a URL scheme.
#[derive(Copy, Clone, PartialEq, Eq)]
pub enum SchemeType {
    /// The `file` scheme.
    File,
    /// One of `http`, `https`, `ws`, `wss` or `ftp`.
    SpecialNotFile,
    /// Any other scheme.
    NotSpecial,
}

impl SchemeType {
    /// Returns `true` for every classification except `NotSpecial`.
    pub fn is_special(&self) -> bool {
        match *self {
            SchemeType::File | SchemeType::SpecialNotFile => true,
            SchemeType::NotSpecial => false,
        }
    }

    /// Returns `true` only for `File`.
    pub fn is_file(&self) -> bool {
        *self == SchemeType::File
    }

    /// Classifies a scheme name.
    ///
    /// The comparison is exact and case-sensitive; callers are expected to
    /// pass an already-lowercased scheme.
    pub fn from(s: &str) -> Self {
        match s {
            "file" => SchemeType::File,
            "http" | "https" | "ws" | "wss" | "ftp" => SchemeType::SpecialNotFile,
            _ => SchemeType::NotSpecial,
        }
    }
}
169 
/// Returns the default port for `scheme`, or `None` when the scheme has none.
///
/// Only `http`/`ws` (80), `https`/`wss` (443) and `ftp` (21) are known; the
/// comparison is exact and case-sensitive.
pub fn default_port(scheme: &str) -> Option<u16> {
    const KNOWN_PORTS: [(&str, u16); 5] = [
        ("http", 80),
        ("ws", 80),
        ("https", 443),
        ("wss", 443),
        ("ftp", 21),
    ];
    KNOWN_PORTS
        .iter()
        .find(|entry| entry.0 == scheme)
        .map(|entry| entry.1)
}
178 
179 #[derive(Clone)]
180 pub struct Input<'i> {
181     chars: str::Chars<'i>,
182 }
183 
184 impl<'i> Input<'i> {
new(input: &'i str) -> Self185     pub fn new(input: &'i str) -> Self {
186         Input::with_log(input, None)
187     }
188 
no_trim(input: &'i str) -> Self189     pub fn no_trim(input: &'i str) -> Self {
190         Input {
191             chars: input.chars(),
192         }
193     }
194 
trim_tab_and_newlines( original_input: &'i str, vfn: Option<&dyn Fn(SyntaxViolation)>, ) -> Self195     pub fn trim_tab_and_newlines(
196         original_input: &'i str,
197         vfn: Option<&dyn Fn(SyntaxViolation)>,
198     ) -> Self {
199         let input = original_input.trim_matches(ascii_tab_or_new_line);
200         if let Some(vfn) = vfn {
201             if input.len() < original_input.len() {
202                 vfn(SyntaxViolation::C0SpaceIgnored)
203             }
204             if input.chars().any(|c| matches!(c, '\t' | '\n' | '\r')) {
205                 vfn(SyntaxViolation::TabOrNewlineIgnored)
206             }
207         }
208         Input {
209             chars: input.chars(),
210         }
211     }
212 
with_log(original_input: &'i str, vfn: Option<&dyn Fn(SyntaxViolation)>) -> Self213     pub fn with_log(original_input: &'i str, vfn: Option<&dyn Fn(SyntaxViolation)>) -> Self {
214         let input = original_input.trim_matches(c0_control_or_space);
215         if let Some(vfn) = vfn {
216             if input.len() < original_input.len() {
217                 vfn(SyntaxViolation::C0SpaceIgnored)
218             }
219             if input.chars().any(|c| matches!(c, '\t' | '\n' | '\r')) {
220                 vfn(SyntaxViolation::TabOrNewlineIgnored)
221             }
222         }
223         Input {
224             chars: input.chars(),
225         }
226     }
227 
228     #[inline]
is_empty(&self) -> bool229     pub fn is_empty(&self) -> bool {
230         self.clone().next().is_none()
231     }
232 
233     #[inline]
starts_with<P: Pattern>(&self, p: P) -> bool234     fn starts_with<P: Pattern>(&self, p: P) -> bool {
235         p.split_prefix(&mut self.clone())
236     }
237 
238     #[inline]
split_prefix<P: Pattern>(&self, p: P) -> Option<Self>239     pub fn split_prefix<P: Pattern>(&self, p: P) -> Option<Self> {
240         let mut remaining = self.clone();
241         if p.split_prefix(&mut remaining) {
242             Some(remaining)
243         } else {
244             None
245         }
246     }
247 
248     #[inline]
split_first(&self) -> (Option<char>, Self)249     fn split_first(&self) -> (Option<char>, Self) {
250         let mut remaining = self.clone();
251         (remaining.next(), remaining)
252     }
253 
254     #[inline]
count_matching<F: Fn(char) -> bool>(&self, f: F) -> (u32, Self)255     fn count_matching<F: Fn(char) -> bool>(&self, f: F) -> (u32, Self) {
256         let mut count = 0;
257         let mut remaining = self.clone();
258         loop {
259             let mut input = remaining.clone();
260             if matches!(input.next(), Some(c) if f(c)) {
261                 remaining = input;
262                 count += 1;
263             } else {
264                 return (count, remaining);
265             }
266         }
267     }
268 
269     #[inline]
next_utf8(&mut self) -> Option<(char, &'i str)>270     fn next_utf8(&mut self) -> Option<(char, &'i str)> {
271         loop {
272             let utf8 = self.chars.as_str();
273             match self.chars.next() {
274                 Some(c) => {
275                     if !matches!(c, '\t' | '\n' | '\r') {
276                         return Some((c, &utf8[..c.len_utf8()]));
277                     }
278                 }
279                 None => return None,
280             }
281         }
282     }
283 }
284 
285 pub trait Pattern {
split_prefix(self, input: &mut Input) -> bool286     fn split_prefix(self, input: &mut Input) -> bool;
287 }
288 
289 impl Pattern for char {
split_prefix(self, input: &mut Input) -> bool290     fn split_prefix(self, input: &mut Input) -> bool {
291         input.next() == Some(self)
292     }
293 }
294 
295 impl<'a> Pattern for &'a str {
split_prefix(self, input: &mut Input) -> bool296     fn split_prefix(self, input: &mut Input) -> bool {
297         for c in self.chars() {
298             if input.next() != Some(c) {
299                 return false;
300             }
301         }
302         true
303     }
304 }
305 
306 impl<F: FnMut(char) -> bool> Pattern for F {
split_prefix(self, input: &mut Input) -> bool307     fn split_prefix(self, input: &mut Input) -> bool {
308         input.next().map_or(false, self)
309     }
310 }
311 
312 impl<'i> Iterator for Input<'i> {
313     type Item = char;
next(&mut self) -> Option<char>314     fn next(&mut self) -> Option<char> {
315         self.chars
316             .by_ref()
317             .find(|&c| !matches!(c, '\t' | '\n' | '\r'))
318     }
319 }
320 
/// State carried through one run of the URL parser.
pub struct Parser<'a> {
    /// Output buffer in which the URL's serialization is built up.
    pub serialization: String,
    /// Base URL used to resolve input that has no scheme of its own, if any.
    pub base_url: Option<&'a Url>,
    /// NOTE(review): presumably overrides the encoding applied to the query
    /// string; not exercised in this part of the file — confirm at the use site.
    pub query_encoding_override: EncodingOverride<'a>,
    /// Callback invoked for each non-fatal [`SyntaxViolation`]; `None`
    /// disables reporting.
    pub violation_fn: Option<&'a dyn Fn(SyntaxViolation)>,
    /// Which kind of caller is driving the parser.
    pub context: Context,
}
328 
/// Identifies who is driving the [`Parser`], for the few places where the
/// parsing rules differ by caller.
#[derive(PartialEq, Eq, Copy, Clone)]
pub enum Context {
    /// NOTE(review): presumably a full URL parse — confirm against callers.
    UrlParser,
    /// The parser was created through `Parser::for_setter`. In this context
    /// `parse_scheme` accepts a scheme that ends at end-of-input without `:`.
    Setter,
    /// NOTE(review): presumably used while setting individual path
    /// segments — confirm against callers.
    PathSegmentSetter,
}
335 
336 impl<'a> Parser<'a> {
log_violation(&self, v: SyntaxViolation)337     fn log_violation(&self, v: SyntaxViolation) {
338         if let Some(f) = self.violation_fn {
339             f(v)
340         }
341     }
342 
log_violation_if(&self, v: SyntaxViolation, test: impl FnOnce() -> bool)343     fn log_violation_if(&self, v: SyntaxViolation, test: impl FnOnce() -> bool) {
344         if let Some(f) = self.violation_fn {
345             if test() {
346                 f(v)
347             }
348         }
349     }
350 
for_setter(serialization: String) -> Parser<'a>351     pub fn for_setter(serialization: String) -> Parser<'a> {
352         Parser {
353             serialization,
354             base_url: None,
355             query_encoding_override: None,
356             violation_fn: None,
357             context: Context::Setter,
358         }
359     }
360 
361     /// https://url.spec.whatwg.org/#concept-basic-url-parser
parse_url(mut self, input: &str) -> ParseResult<Url>362     pub fn parse_url(mut self, input: &str) -> ParseResult<Url> {
363         let input = Input::with_log(input, self.violation_fn);
364         if let Ok(remaining) = self.parse_scheme(input.clone()) {
365             return self.parse_with_scheme(remaining);
366         }
367 
368         // No-scheme state
369         if let Some(base_url) = self.base_url {
370             if input.starts_with('#') {
371                 self.fragment_only(base_url, input)
372             } else if base_url.cannot_be_a_base() {
373                 Err(ParseError::RelativeUrlWithCannotBeABaseBase)
374             } else {
375                 let scheme_type = SchemeType::from(base_url.scheme());
376                 if scheme_type.is_file() {
377                     self.parse_file(input, scheme_type, Some(base_url))
378                 } else {
379                     self.parse_relative(input, scheme_type, base_url)
380                 }
381             }
382         } else {
383             Err(ParseError::RelativeUrlWithoutBase)
384         }
385     }
386 
parse_scheme<'i>(&mut self, mut input: Input<'i>) -> Result<Input<'i>, ()>387     pub fn parse_scheme<'i>(&mut self, mut input: Input<'i>) -> Result<Input<'i>, ()> {
388         if input.is_empty() || !input.starts_with(ascii_alpha) {
389             return Err(());
390         }
391         debug_assert!(self.serialization.is_empty());
392         while let Some(c) = input.next() {
393             match c {
394                 'a'..='z' | 'A'..='Z' | '0'..='9' | '+' | '-' | '.' => {
395                     self.serialization.push(c.to_ascii_lowercase())
396                 }
397                 ':' => return Ok(input),
398                 _ => {
399                     self.serialization.clear();
400                     return Err(());
401                 }
402             }
403         }
404         // EOF before ':'
405         if self.context == Context::Setter {
406             Ok(input)
407         } else {
408             self.serialization.clear();
409             Err(())
410         }
411     }
412 
parse_with_scheme(mut self, input: Input<'_>) -> ParseResult<Url>413     fn parse_with_scheme(mut self, input: Input<'_>) -> ParseResult<Url> {
414         use crate::SyntaxViolation::{ExpectedDoubleSlash, ExpectedFileDoubleSlash};
415         let scheme_end = to_u32(self.serialization.len())?;
416         let scheme_type = SchemeType::from(&self.serialization);
417         self.serialization.push(':');
418         match scheme_type {
419             SchemeType::File => {
420                 self.log_violation_if(ExpectedFileDoubleSlash, || !input.starts_with("//"));
421                 let base_file_url = self.base_url.and_then(|base| {
422                     if base.scheme() == "file" {
423                         Some(base)
424                     } else {
425                         None
426                     }
427                 });
428                 self.serialization.clear();
429                 self.parse_file(input, scheme_type, base_file_url)
430             }
431             SchemeType::SpecialNotFile => {
432                 // special relative or authority state
433                 let (slashes_count, remaining) = input.count_matching(|c| matches!(c, '/' | '\\'));
434                 if let Some(base_url) = self.base_url {
435                     if slashes_count < 2
436                         && base_url.scheme() == &self.serialization[..scheme_end as usize]
437                     {
438                         // "Cannot-be-a-base" URLs only happen with "not special" schemes.
439                         debug_assert!(!base_url.cannot_be_a_base());
440                         self.serialization.clear();
441                         return self.parse_relative(input, scheme_type, base_url);
442                     }
443                 }
444                 // special authority slashes state
445                 self.log_violation_if(ExpectedDoubleSlash, || {
446                     input
447                         .clone()
448                         .take_while(|&c| matches!(c, '/' | '\\'))
449                         .collect::<String>()
450                         != "//"
451                 });
452                 self.after_double_slash(remaining, scheme_type, scheme_end)
453             }
454             SchemeType::NotSpecial => self.parse_non_special(input, scheme_type, scheme_end),
455         }
456     }
457 
458     /// Scheme other than file, http, https, ws, ws, ftp.
parse_non_special( mut self, input: Input<'_>, scheme_type: SchemeType, scheme_end: u32, ) -> ParseResult<Url>459     fn parse_non_special(
460         mut self,
461         input: Input<'_>,
462         scheme_type: SchemeType,
463         scheme_end: u32,
464     ) -> ParseResult<Url> {
465         // path or authority state (
466         if let Some(input) = input.split_prefix("//") {
467             return self.after_double_slash(input, scheme_type, scheme_end);
468         }
469         // Anarchist URL (no authority)
470         let path_start = to_u32(self.serialization.len())?;
471         let username_end = path_start;
472         let host_start = path_start;
473         let host_end = path_start;
474         let host = HostInternal::None;
475         let port = None;
476         let remaining = if let Some(input) = input.split_prefix('/') {
477             let path_start = self.serialization.len();
478             self.serialization.push('/');
479             self.parse_path(scheme_type, &mut false, path_start, input)
480         } else {
481             self.parse_cannot_be_a_base_path(input)
482         };
483         self.with_query_and_fragment(
484             scheme_type,
485             scheme_end,
486             username_end,
487             host_start,
488             host_end,
489             host,
490             port,
491             path_start,
492             remaining,
493         )
494     }
495 
    /// Parses a `file` URL, or input resolved against a `file` base URL.
    ///
    /// `self.serialization` must be empty on entry; every branch writes the
    /// `file:` prefix itself or copies it from the base. `input` is positioned
    /// after the scheme (or at the very start when there was no scheme), and
    /// `base_file_url` is the base URL when one exists and its scheme is `file`.
    ///
    /// Branches, in order:
    /// - two leading slashes: a host follows;
    /// - one leading slash: an absolute path, which may inherit a Windows
    ///   drive letter or the host from the base;
    /// - anything else with a base: resolved relative to the base;
    /// - anything else without a base: treated as a path under `file:///`.
    ///
    /// Errors from host parsing, query/fragment parsing and `to_u32` are
    /// propagated with `?`.
    fn parse_file(
        mut self,
        input: Input<'_>,
        scheme_type: SchemeType,
        base_file_url: Option<&Url>,
    ) -> ParseResult<Url> {
        use crate::SyntaxViolation::Backslash;
        // file state
        debug_assert!(self.serialization.is_empty());
        let (first_char, input_after_first_char) = input.split_first();
        if matches!(first_char, Some('/') | Some('\\')) {
            self.log_violation_if(SyntaxViolation::Backslash, || first_char == Some('\\'));
            // file slash state
            let (next_char, input_after_next_char) = input_after_first_char.split_first();
            if matches!(next_char, Some('/') | Some('\\')) {
                self.log_violation_if(Backslash, || next_char == Some('\\'));
                // file host state
                self.serialization.push_str("file://");
                let scheme_end = "file".len() as u32;
                let host_start = "file://".len() as u32;
                // `path_start` here is a flag returned by `parse_file_host`,
                // not an offset; it selects how the path is parsed below.
                let (path_start, mut host, remaining) =
                    self.parse_file_host(input_after_next_char)?;
                let mut host_end = to_u32(self.serialization.len())?;
                let mut has_host = !matches!(host, HostInternal::None);
                let remaining = if path_start {
                    self.parse_path_start(SchemeType::File, &mut has_host, remaining)
                } else {
                    let path_start = self.serialization.len();
                    self.serialization.push('/');
                    self.parse_path(SchemeType::File, &mut has_host, path_start, remaining)
                };

                // For file URLs that have a host and whose path starts
                // with the windows drive letter we just remove the host.
                // (`has_host` was passed by `&mut` to the path parser above,
                // which is what can have reset it.)
                if !has_host {
                    self.serialization
                        .drain(host_start as usize..host_end as usize);
                    host_end = host_start;
                    host = HostInternal::None;
                }
                let (query_start, fragment_start) =
                    self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?;
                return Ok(Url {
                    serialization: self.serialization,
                    scheme_end,
                    // No userinfo in file URLs: username ends where the host starts.
                    username_end: host_start,
                    host_start,
                    host_end,
                    host,
                    port: None,
                    path_start: host_end,
                    query_start,
                    fragment_start,
                });
            } else {
                // Exactly one leading slash: absolute path, no host in the input.
                self.serialization.push_str("file://");
                let scheme_end = "file".len() as u32;
                let host_start = "file://".len();
                let mut host_end = host_start;
                let mut host = HostInternal::None;
                if !starts_with_windows_drive_letter_segment(&input_after_first_char) {
                    if let Some(base_url) = base_file_url {
                        // Inherit the base's drive letter if it has one,
                        // otherwise inherit the base's host (if any).
                        let first_segment = base_url.path_segments().unwrap().next().unwrap();
                        if is_normalized_windows_drive_letter(first_segment) {
                            self.serialization.push('/');
                            self.serialization.push_str(first_segment);
                        } else if let Some(host_str) = base_url.host_str() {
                            self.serialization.push_str(host_str);
                            host_end = self.serialization.len();
                            host = base_url.host;
                        }
                    }
                }
                // If c is the EOF code point, U+002F (/), U+005C (\), U+003F (?), or U+0023 (#), then decrease pointer by one
                // NOTE(review): this branch is only reached when `first_char`
                // is '/' or '\\', so the selection below always yields `input`.
                let parse_path_input = if let Some(c) = first_char {
                    if c == '/' || c == '\\' || c == '?' || c == '#' {
                        input
                    } else {
                        input_after_first_char
                    }
                } else {
                    input_after_first_char
                };

                let remaining =
                    self.parse_path(SchemeType::File, &mut false, host_end, parse_path_input);

                let host_start = host_start as u32;

                let (query_start, fragment_start) =
                    self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?;

                let host_end = host_end as u32;
                return Ok(Url {
                    serialization: self.serialization,
                    scheme_end,
                    username_end: host_start,
                    host_start,
                    host_end,
                    host,
                    port: None,
                    path_start: host_end,
                    query_start,
                    fragment_start,
                });
            }
        }
        // No leading slash: resolve against the base if there is one.
        if let Some(base_url) = base_file_url {
            match first_char {
                None => {
                    // Copy everything except the fragment
                    let before_fragment = match base_url.fragment_start {
                        Some(i) => &base_url.serialization[..i as usize],
                        None => &*base_url.serialization,
                    };
                    self.serialization.push_str(before_fragment);
                    Ok(Url {
                        serialization: self.serialization,
                        fragment_start: None,
                        ..*base_url
                    })
                }
                Some('?') => {
                    // Copy everything up to the query string
                    let before_query = match (base_url.query_start, base_url.fragment_start) {
                        (None, None) => &*base_url.serialization,
                        (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
                    };
                    self.serialization.push_str(before_query);
                    let (query_start, fragment_start) =
                        self.parse_query_and_fragment(scheme_type, base_url.scheme_end, input)?;
                    Ok(Url {
                        serialization: self.serialization,
                        query_start,
                        fragment_start,
                        ..*base_url
                    })
                }
                Some('#') => self.fragment_only(base_url, input),
                _ => {
                    if !starts_with_windows_drive_letter_segment(&input) {
                        // Relative path: start from the base up to its query,
                        // shorten the base path, then parse the input onto it.
                        let before_query = match (base_url.query_start, base_url.fragment_start) {
                            (None, None) => &*base_url.serialization,
                            (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
                        };
                        self.serialization.push_str(before_query);
                        self.shorten_path(SchemeType::File, base_url.path_start as usize);
                        let remaining = self.parse_path(
                            SchemeType::File,
                            &mut true,
                            base_url.path_start as usize,
                            input,
                        );
                        self.with_query_and_fragment(
                            SchemeType::File,
                            base_url.scheme_end,
                            base_url.username_end,
                            base_url.host_start,
                            base_url.host_end,
                            base_url.host,
                            base_url.port,
                            base_url.path_start,
                            remaining,
                        )
                    } else {
                        // The input starts with a Windows drive letter: the
                        // base is ignored and the result has no host.
                        self.serialization.push_str("file:///");
                        let scheme_end = "file".len() as u32;
                        let path_start = "file://".len();
                        let remaining =
                            self.parse_path(SchemeType::File, &mut false, path_start, input);
                        let (query_start, fragment_start) =
                            self.parse_query_and_fragment(SchemeType::File, scheme_end, remaining)?;
                        let path_start = path_start as u32;
                        Ok(Url {
                            serialization: self.serialization,
                            scheme_end,
                            username_end: path_start,
                            host_start: path_start,
                            host_end: path_start,
                            host: HostInternal::None,
                            port: None,
                            path_start,
                            query_start,
                            fragment_start,
                        })
                    }
                }
            }
        } else {
            // No usable base: the whole input is a path under `file:///`.
            self.serialization.push_str("file:///");
            let scheme_end = "file".len() as u32;
            let path_start = "file://".len();
            let remaining = self.parse_path(SchemeType::File, &mut false, path_start, input);
            let (query_start, fragment_start) =
                self.parse_query_and_fragment(SchemeType::File, scheme_end, remaining)?;
            let path_start = path_start as u32;
            Ok(Url {
                serialization: self.serialization,
                scheme_end,
                username_end: path_start,
                host_start: path_start,
                host_end: path_start,
                host: HostInternal::None,
                port: None,
                path_start,
                query_start,
                fragment_start,
            })
        }
    }
706 
parse_relative( mut self, input: Input<'_>, scheme_type: SchemeType, base_url: &Url, ) -> ParseResult<Url>707     fn parse_relative(
708         mut self,
709         input: Input<'_>,
710         scheme_type: SchemeType,
711         base_url: &Url,
712     ) -> ParseResult<Url> {
713         // relative state
714         debug_assert!(self.serialization.is_empty());
715         let (first_char, input_after_first_char) = input.split_first();
716         match first_char {
717             None => {
718                 // Copy everything except the fragment
719                 let before_fragment = match base_url.fragment_start {
720                     Some(i) => &base_url.serialization[..i as usize],
721                     None => &*base_url.serialization,
722                 };
723                 self.serialization.push_str(before_fragment);
724                 Ok(Url {
725                     serialization: self.serialization,
726                     fragment_start: None,
727                     ..*base_url
728                 })
729             }
730             Some('?') => {
731                 // Copy everything up to the query string
732                 let before_query = match (base_url.query_start, base_url.fragment_start) {
733                     (None, None) => &*base_url.serialization,
734                     (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
735                 };
736                 self.serialization.push_str(before_query);
737                 let (query_start, fragment_start) =
738                     self.parse_query_and_fragment(scheme_type, base_url.scheme_end, input)?;
739                 Ok(Url {
740                     serialization: self.serialization,
741                     query_start,
742                     fragment_start,
743                     ..*base_url
744                 })
745             }
746             Some('#') => self.fragment_only(base_url, input),
747             Some('/') | Some('\\') => {
748                 let (slashes_count, remaining) = input.count_matching(|c| matches!(c, '/' | '\\'));
749                 if slashes_count >= 2 {
750                     self.log_violation_if(SyntaxViolation::ExpectedDoubleSlash, || {
751                         input
752                             .clone()
753                             .take_while(|&c| matches!(c, '/' | '\\'))
754                             .collect::<String>()
755                             != "//"
756                     });
757                     let scheme_end = base_url.scheme_end;
758                     debug_assert!(base_url.byte_at(scheme_end) == b':');
759                     self.serialization
760                         .push_str(base_url.slice(..scheme_end + 1));
761                     if let Some(after_prefix) = input.split_prefix("//") {
762                         return self.after_double_slash(after_prefix, scheme_type, scheme_end);
763                     }
764                     return self.after_double_slash(remaining, scheme_type, scheme_end);
765                 }
766                 let path_start = base_url.path_start;
767                 self.serialization.push_str(base_url.slice(..path_start));
768                 self.serialization.push('/');
769                 let remaining = self.parse_path(
770                     scheme_type,
771                     &mut true,
772                     path_start as usize,
773                     input_after_first_char,
774                 );
775                 self.with_query_and_fragment(
776                     scheme_type,
777                     base_url.scheme_end,
778                     base_url.username_end,
779                     base_url.host_start,
780                     base_url.host_end,
781                     base_url.host,
782                     base_url.port,
783                     base_url.path_start,
784                     remaining,
785                 )
786             }
787             _ => {
788                 let before_query = match (base_url.query_start, base_url.fragment_start) {
789                     (None, None) => &*base_url.serialization,
790                     (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
791                 };
792                 self.serialization.push_str(before_query);
793                 // FIXME spec says just "remove last entry", not the "pop" algorithm
794                 self.pop_path(scheme_type, base_url.path_start as usize);
795                 // A special url always has a path.
796                 // A path always starts with '/'
797                 if self.serialization.len() == base_url.path_start as usize
798                     && (SchemeType::from(base_url.scheme()).is_special() || !input.is_empty())
799                 {
800                     self.serialization.push('/');
801                 }
802                 let remaining = match input.split_first() {
803                     (Some('/'), remaining) => self.parse_path(
804                         scheme_type,
805                         &mut true,
806                         base_url.path_start as usize,
807                         remaining,
808                     ),
809                     _ => {
810                         self.parse_path(scheme_type, &mut true, base_url.path_start as usize, input)
811                     }
812                 };
813                 self.with_query_and_fragment(
814                     scheme_type,
815                     base_url.scheme_end,
816                     base_url.username_end,
817                     base_url.host_start,
818                     base_url.host_end,
819                     base_url.host,
820                     base_url.port,
821                     base_url.path_start,
822                     remaining,
823                 )
824             }
825         }
826     }
827 
after_double_slash( mut self, input: Input<'_>, scheme_type: SchemeType, scheme_end: u32, ) -> ParseResult<Url>828     fn after_double_slash(
829         mut self,
830         input: Input<'_>,
831         scheme_type: SchemeType,
832         scheme_end: u32,
833     ) -> ParseResult<Url> {
834         self.serialization.push('/');
835         self.serialization.push('/');
836         // authority state
837         let before_authority = self.serialization.len();
838         let (username_end, remaining) = self.parse_userinfo(input, scheme_type)?;
839         let has_authority = before_authority != self.serialization.len();
840         // host state
841         let host_start = to_u32(self.serialization.len())?;
842         let (host_end, host, port, remaining) =
843             self.parse_host_and_port(remaining, scheme_end, scheme_type)?;
844         if host == HostInternal::None && has_authority {
845             return Err(ParseError::EmptyHost);
846         }
847         // path state
848         let path_start = to_u32(self.serialization.len())?;
849         let remaining = self.parse_path_start(scheme_type, &mut true, remaining);
850         self.with_query_and_fragment(
851             scheme_type,
852             scheme_end,
853             username_end,
854             host_start,
855             host_end,
856             host,
857             port,
858             path_start,
859             remaining,
860         )
861     }
862 
863     /// Return (username_end, remaining)
parse_userinfo<'i>( &mut self, mut input: Input<'i>, scheme_type: SchemeType, ) -> ParseResult<(u32, Input<'i>)>864     fn parse_userinfo<'i>(
865         &mut self,
866         mut input: Input<'i>,
867         scheme_type: SchemeType,
868     ) -> ParseResult<(u32, Input<'i>)> {
869         let mut last_at = None;
870         let mut remaining = input.clone();
871         let mut char_count = 0;
872         while let Some(c) = remaining.next() {
873             match c {
874                 '@' => {
875                     if last_at.is_some() {
876                         self.log_violation(SyntaxViolation::UnencodedAtSign)
877                     } else {
878                         self.log_violation(SyntaxViolation::EmbeddedCredentials)
879                     }
880                     last_at = Some((char_count, remaining.clone()))
881                 }
882                 '/' | '?' | '#' => break,
883                 '\\' if scheme_type.is_special() => break,
884                 _ => (),
885             }
886             char_count += 1;
887         }
888         let (mut userinfo_char_count, remaining) = match last_at {
889             None => return Ok((to_u32(self.serialization.len())?, input)),
890             Some((0, remaining)) => {
891                 // Otherwise, if one of the following is true
892                 // c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
893                 // url is special and c is U+005C (\)
894                 // If @ flag is set and buffer is the empty string, validation error, return failure.
895                 if let (Some(c), _) = remaining.split_first() {
896                     if c == '/' || c == '?' || c == '#' || (scheme_type.is_special() && c == '\\') {
897                         return Err(ParseError::EmptyHost);
898                     }
899                 }
900                 return Ok((to_u32(self.serialization.len())?, remaining));
901             }
902             Some(x) => x,
903         };
904 
905         let mut username_end = None;
906         let mut has_password = false;
907         let mut has_username = false;
908         while userinfo_char_count > 0 {
909             let (c, utf8_c) = input.next_utf8().unwrap();
910             userinfo_char_count -= 1;
911             if c == ':' && username_end.is_none() {
912                 // Start parsing password
913                 username_end = Some(to_u32(self.serialization.len())?);
914                 // We don't add a colon if the password is empty
915                 if userinfo_char_count > 0 {
916                     self.serialization.push(':');
917                     has_password = true;
918                 }
919             } else {
920                 if !has_password {
921                     has_username = true;
922                 }
923                 self.check_url_code_point(c, &input);
924                 self.serialization
925                     .extend(utf8_percent_encode(utf8_c, USERINFO));
926             }
927         }
928         let username_end = match username_end {
929             Some(i) => i,
930             None => to_u32(self.serialization.len())?,
931         };
932         if has_username || has_password {
933             self.serialization.push('@');
934         }
935         Ok((username_end, remaining))
936     }
937 
parse_host_and_port<'i>( &mut self, input: Input<'i>, scheme_end: u32, scheme_type: SchemeType, ) -> ParseResult<(u32, HostInternal, Option<u16>, Input<'i>)>938     fn parse_host_and_port<'i>(
939         &mut self,
940         input: Input<'i>,
941         scheme_end: u32,
942         scheme_type: SchemeType,
943     ) -> ParseResult<(u32, HostInternal, Option<u16>, Input<'i>)> {
944         let (host, remaining) = Parser::parse_host(input, scheme_type)?;
945         write!(&mut self.serialization, "{}", host).unwrap();
946         let host_end = to_u32(self.serialization.len())?;
947         if let Host::Domain(h) = &host {
948             if h.is_empty() {
949                 // Port with an empty host
950                 if remaining.starts_with(":") {
951                     return Err(ParseError::EmptyHost);
952                 }
953                 if scheme_type.is_special() {
954                     return Err(ParseError::EmptyHost);
955                 }
956             }
957         };
958 
959         let (port, remaining) = if let Some(remaining) = remaining.split_prefix(':') {
960             let scheme = || default_port(&self.serialization[..scheme_end as usize]);
961             Parser::parse_port(remaining, scheme, self.context)?
962         } else {
963             (None, remaining)
964         };
965         if let Some(port) = port {
966             write!(&mut self.serialization, ":{}", port).unwrap()
967         }
968         Ok((host_end, host.into(), port, remaining))
969     }
970 
parse_host( mut input: Input<'_>, scheme_type: SchemeType, ) -> ParseResult<(Host<String>, Input<'_>)>971     pub fn parse_host(
972         mut input: Input<'_>,
973         scheme_type: SchemeType,
974     ) -> ParseResult<(Host<String>, Input<'_>)> {
975         if scheme_type.is_file() {
976             return Parser::get_file_host(input);
977         }
978         // Undo the Input abstraction here to avoid allocating in the common case
979         // where the host part of the input does not contain any tab or newline
980         let input_str = input.chars.as_str();
981         let mut inside_square_brackets = false;
982         let mut has_ignored_chars = false;
983         let mut non_ignored_chars = 0;
984         let mut bytes = 0;
985         for c in input_str.chars() {
986             match c {
987                 ':' if !inside_square_brackets => break,
988                 '\\' if scheme_type.is_special() => break,
989                 '/' | '?' | '#' => break,
990                 '\t' | '\n' | '\r' => {
991                     has_ignored_chars = true;
992                 }
993                 '[' => {
994                     inside_square_brackets = true;
995                     non_ignored_chars += 1
996                 }
997                 ']' => {
998                     inside_square_brackets = false;
999                     non_ignored_chars += 1
1000                 }
1001                 _ => non_ignored_chars += 1,
1002             }
1003             bytes += c.len_utf8();
1004         }
1005         let replaced: String;
1006         let host_str;
1007         {
1008             let host_input = input.by_ref().take(non_ignored_chars);
1009             if has_ignored_chars {
1010                 replaced = host_input.collect();
1011                 host_str = &*replaced
1012             } else {
1013                 for _ in host_input {}
1014                 host_str = &input_str[..bytes]
1015             }
1016         }
1017         if scheme_type == SchemeType::SpecialNotFile && host_str.is_empty() {
1018             return Err(ParseError::EmptyHost);
1019         }
1020         if !scheme_type.is_special() {
1021             let host = Host::parse_opaque(host_str)?;
1022             return Ok((host, input));
1023         }
1024         let host = Host::parse(host_str)?;
1025         Ok((host, input))
1026     }
1027 
get_file_host(input: Input<'_>) -> ParseResult<(Host<String>, Input<'_>)>1028     fn get_file_host(input: Input<'_>) -> ParseResult<(Host<String>, Input<'_>)> {
1029         let (_, host_str, remaining) = Parser::file_host(input)?;
1030         let host = match Host::parse(&host_str)? {
1031             Host::Domain(ref d) if d == "localhost" => Host::Domain("".to_string()),
1032             host => host,
1033         };
1034         Ok((host, remaining))
1035     }
1036 
parse_file_host<'i>( &mut self, input: Input<'i>, ) -> ParseResult<(bool, HostInternal, Input<'i>)>1037     fn parse_file_host<'i>(
1038         &mut self,
1039         input: Input<'i>,
1040     ) -> ParseResult<(bool, HostInternal, Input<'i>)> {
1041         let has_host;
1042         let (_, host_str, remaining) = Parser::file_host(input)?;
1043         let host = if host_str.is_empty() {
1044             has_host = false;
1045             HostInternal::None
1046         } else {
1047             match Host::parse(&host_str)? {
1048                 Host::Domain(ref d) if d == "localhost" => {
1049                     has_host = false;
1050                     HostInternal::None
1051                 }
1052                 host => {
1053                     write!(&mut self.serialization, "{}", host).unwrap();
1054                     has_host = true;
1055                     host.into()
1056                 }
1057             }
1058         };
1059         Ok((has_host, host, remaining))
1060     }
1061 
file_host(input: Input) -> ParseResult<(bool, String, Input)>1062     pub fn file_host(input: Input) -> ParseResult<(bool, String, Input)> {
1063         // Undo the Input abstraction here to avoid allocating in the common case
1064         // where the host part of the input does not contain any tab or newline
1065         let input_str = input.chars.as_str();
1066         let mut has_ignored_chars = false;
1067         let mut non_ignored_chars = 0;
1068         let mut bytes = 0;
1069         for c in input_str.chars() {
1070             match c {
1071                 '/' | '\\' | '?' | '#' => break,
1072                 '\t' | '\n' | '\r' => has_ignored_chars = true,
1073                 _ => non_ignored_chars += 1,
1074             }
1075             bytes += c.len_utf8();
1076         }
1077         let replaced: String;
1078         let host_str;
1079         let mut remaining = input.clone();
1080         {
1081             let host_input = remaining.by_ref().take(non_ignored_chars);
1082             if has_ignored_chars {
1083                 replaced = host_input.collect();
1084                 host_str = &*replaced
1085             } else {
1086                 for _ in host_input {}
1087                 host_str = &input_str[..bytes]
1088             }
1089         }
1090         if is_windows_drive_letter(host_str) {
1091             return Ok((false, "".to_string(), input));
1092         }
1093         Ok((true, host_str.to_string(), remaining))
1094     }
1095 
parse_port<P>( mut input: Input<'_>, default_port: P, context: Context, ) -> ParseResult<(Option<u16>, Input<'_>)> where P: Fn() -> Option<u16>,1096     pub fn parse_port<P>(
1097         mut input: Input<'_>,
1098         default_port: P,
1099         context: Context,
1100     ) -> ParseResult<(Option<u16>, Input<'_>)>
1101     where
1102         P: Fn() -> Option<u16>,
1103     {
1104         let mut port: u32 = 0;
1105         let mut has_any_digit = false;
1106         while let (Some(c), remaining) = input.split_first() {
1107             if let Some(digit) = c.to_digit(10) {
1108                 port = port * 10 + digit;
1109                 if port > ::std::u16::MAX as u32 {
1110                     return Err(ParseError::InvalidPort);
1111                 }
1112                 has_any_digit = true;
1113             } else if context == Context::UrlParser && !matches!(c, '/' | '\\' | '?' | '#') {
1114                 return Err(ParseError::InvalidPort);
1115             } else {
1116                 break;
1117             }
1118             input = remaining;
1119         }
1120         let mut opt_port = Some(port as u16);
1121         if !has_any_digit || opt_port == default_port() {
1122             opt_port = None;
1123         }
1124         Ok((opt_port, input))
1125     }
1126 
parse_path_start<'i>( &mut self, scheme_type: SchemeType, has_host: &mut bool, input: Input<'i>, ) -> Input<'i>1127     pub fn parse_path_start<'i>(
1128         &mut self,
1129         scheme_type: SchemeType,
1130         has_host: &mut bool,
1131         input: Input<'i>,
1132     ) -> Input<'i> {
1133         let path_start = self.serialization.len();
1134         let (maybe_c, remaining) = input.split_first();
1135         // If url is special, then:
1136         if scheme_type.is_special() {
1137             if maybe_c == Some('\\') {
1138                 // If c is U+005C (\), validation error.
1139                 self.log_violation(SyntaxViolation::Backslash);
1140             }
1141             // A special URL always has a non-empty path.
1142             if !self.serialization.ends_with('/') {
1143                 self.serialization.push('/');
1144                 // We have already made sure the forward slash is present.
1145                 if maybe_c == Some('/') || maybe_c == Some('\\') {
1146                     return self.parse_path(scheme_type, has_host, path_start, remaining);
1147                 }
1148             }
1149             return self.parse_path(scheme_type, has_host, path_start, input);
1150         } else if maybe_c == Some('?') || maybe_c == Some('#') {
1151             // Otherwise, if state override is not given and c is U+003F (?),
1152             // set url’s query to the empty string and state to query state.
1153             // Otherwise, if state override is not given and c is U+0023 (#),
1154             // set url’s fragment to the empty string and state to fragment state.
1155             // The query and path states will be handled by the caller.
1156             return input;
1157         }
1158 
1159         if maybe_c != None && maybe_c != Some('/') {
1160             self.serialization.push('/');
1161         }
1162         // Otherwise, if c is not the EOF code point:
1163         self.parse_path(scheme_type, has_host, path_start, input)
1164     }
1165 
parse_path<'i>( &mut self, scheme_type: SchemeType, has_host: &mut bool, path_start: usize, mut input: Input<'i>, ) -> Input<'i>1166     pub fn parse_path<'i>(
1167         &mut self,
1168         scheme_type: SchemeType,
1169         has_host: &mut bool,
1170         path_start: usize,
1171         mut input: Input<'i>,
1172     ) -> Input<'i> {
1173         // Relative path state
1174         loop {
1175             let segment_start = self.serialization.len();
1176             let mut ends_with_slash = false;
1177             loop {
1178                 let input_before_c = input.clone();
1179                 let (c, utf8_c) = if let Some(x) = input.next_utf8() {
1180                     x
1181                 } else {
1182                     break;
1183                 };
1184                 match c {
1185                     '/' if self.context != Context::PathSegmentSetter => {
1186                         self.serialization.push(c);
1187                         ends_with_slash = true;
1188                         break;
1189                     }
1190                     '\\' if self.context != Context::PathSegmentSetter
1191                         && scheme_type.is_special() =>
1192                     {
1193                         self.log_violation(SyntaxViolation::Backslash);
1194                         self.serialization.push('/');
1195                         ends_with_slash = true;
1196                         break;
1197                     }
1198                     '?' | '#' if self.context == Context::UrlParser => {
1199                         input = input_before_c;
1200                         break;
1201                     }
1202                     _ => {
1203                         self.check_url_code_point(c, &input);
1204                         if self.context == Context::PathSegmentSetter {
1205                             if scheme_type.is_special() {
1206                                 self.serialization
1207                                     .extend(utf8_percent_encode(utf8_c, SPECIAL_PATH_SEGMENT));
1208                             } else {
1209                                 self.serialization
1210                                     .extend(utf8_percent_encode(utf8_c, PATH_SEGMENT));
1211                             }
1212                         } else {
1213                             self.serialization.extend(utf8_percent_encode(utf8_c, PATH));
1214                         }
1215                     }
1216                 }
1217             }
1218             let segment_before_slash = if ends_with_slash {
1219                 &self.serialization[segment_start..self.serialization.len() - 1]
1220             } else {
1221                 &self.serialization[segment_start..self.serialization.len()]
1222             };
1223             match segment_before_slash {
1224                 // If buffer is a double-dot path segment, shorten url’s path,
1225                 ".." | "%2e%2e" | "%2e%2E" | "%2E%2e" | "%2E%2E" | "%2e." | "%2E." | ".%2e"
1226                 | ".%2E" => {
1227                     debug_assert!(self.serialization.as_bytes()[segment_start - 1] == b'/');
1228                     self.serialization.truncate(segment_start);
1229                     if self.serialization.ends_with('/')
1230                         && Parser::last_slash_can_be_removed(&self.serialization, path_start)
1231                     {
1232                         self.serialization.pop();
1233                     }
1234                     self.shorten_path(scheme_type, path_start);
1235 
1236                     // and then if neither c is U+002F (/), nor url is special and c is U+005C (\), append the empty string to url’s path.
1237                     if ends_with_slash && !self.serialization.ends_with('/') {
1238                         self.serialization.push('/');
1239                     }
1240                 }
1241                 // Otherwise, if buffer is a single-dot path segment and if neither c is U+002F (/),
1242                 // nor url is special and c is U+005C (\), append the empty string to url’s path.
1243                 "." | "%2e" | "%2E" => {
1244                     self.serialization.truncate(segment_start);
1245                     if !self.serialization.ends_with('/') {
1246                         self.serialization.push('/');
1247                     }
1248                 }
1249                 _ => {
1250                     // If url’s scheme is "file", url’s path is empty, and buffer is a Windows drive letter, then
1251                     if scheme_type.is_file() && is_windows_drive_letter(segment_before_slash) {
1252                         // Replace the second code point in buffer with U+003A (:).
1253                         if let Some(c) = segment_before_slash.chars().next() {
1254                             self.serialization.truncate(segment_start);
1255                             self.serialization.push(c);
1256                             self.serialization.push(':');
1257                             if ends_with_slash {
1258                                 self.serialization.push('/');
1259                             }
1260                         }
1261                         // If url’s host is neither the empty string nor null,
1262                         // validation error, set url’s host to the empty string.
1263                         if *has_host {
1264                             self.log_violation(SyntaxViolation::FileWithHostAndWindowsDrive);
1265                             *has_host = false; // FIXME account for this in callers
1266                         }
1267                     }
1268                 }
1269             }
1270             if !ends_with_slash {
1271                 break;
1272             }
1273         }
1274         if scheme_type.is_file() {
1275             // while url’s path’s size is greater than 1
1276             // and url’s path[0] is the empty string,
1277             // validation error, remove the first item from url’s path.
1278             //FIXME: log violation
1279             let path = self.serialization.split_off(path_start);
1280             self.serialization.push('/');
1281             self.serialization.push_str(path.trim_start_matches('/'));
1282         }
1283 
1284         input
1285     }
1286 
last_slash_can_be_removed(serialization: &str, path_start: usize) -> bool1287     fn last_slash_can_be_removed(serialization: &str, path_start: usize) -> bool {
1288         let url_before_segment = &serialization[..serialization.len() - 1];
1289         if let Some(segment_before_start) = url_before_segment.rfind('/') {
1290             // Do not remove the root slash
1291             segment_before_start >= path_start
1292                 // Or a windows drive letter slash
1293                 && !path_starts_with_windows_drive_letter(&serialization[segment_before_start..])
1294         } else {
1295             false
1296         }
1297     }
1298 
1299     /// https://url.spec.whatwg.org/#shorten-a-urls-path
shorten_path(&mut self, scheme_type: SchemeType, path_start: usize)1300     fn shorten_path(&mut self, scheme_type: SchemeType, path_start: usize) {
1301         // If path is empty, then return.
1302         if self.serialization.len() == path_start {
1303             return;
1304         }
1305         // If url’s scheme is "file", path’s size is 1, and path[0] is a normalized Windows drive letter, then return.
1306         if scheme_type.is_file()
1307             && is_normalized_windows_drive_letter(&self.serialization[path_start..])
1308         {
1309             return;
1310         }
1311         // Remove path’s last item.
1312         self.pop_path(scheme_type, path_start);
1313     }
1314 
1315     /// https://url.spec.whatwg.org/#pop-a-urls-path
pop_path(&mut self, scheme_type: SchemeType, path_start: usize)1316     fn pop_path(&mut self, scheme_type: SchemeType, path_start: usize) {
1317         if self.serialization.len() > path_start {
1318             let slash_position = self.serialization[path_start..].rfind('/').unwrap();
1319             // + 1 since rfind returns the position before the slash.
1320             let segment_start = path_start + slash_position + 1;
1321             // Don’t pop a Windows drive letter
1322             if !(scheme_type.is_file()
1323                 && is_normalized_windows_drive_letter(&self.serialization[segment_start..]))
1324             {
1325                 self.serialization.truncate(segment_start);
1326             }
1327         }
1328     }
1329 
parse_cannot_be_a_base_path<'i>(&mut self, mut input: Input<'i>) -> Input<'i>1330     pub fn parse_cannot_be_a_base_path<'i>(&mut self, mut input: Input<'i>) -> Input<'i> {
1331         loop {
1332             let input_before_c = input.clone();
1333             match input.next_utf8() {
1334                 Some(('?', _)) | Some(('#', _)) if self.context == Context::UrlParser => {
1335                     return input_before_c
1336                 }
1337                 Some((c, utf8_c)) => {
1338                     self.check_url_code_point(c, &input);
1339                     self.serialization
1340                         .extend(utf8_percent_encode(utf8_c, CONTROLS));
1341                 }
1342                 None => return input,
1343             }
1344         }
1345     }
1346 
1347     #[allow(clippy::too_many_arguments)]
with_query_and_fragment( mut self, scheme_type: SchemeType, scheme_end: u32, username_end: u32, host_start: u32, host_end: u32, host: HostInternal, port: Option<u16>, path_start: u32, remaining: Input<'_>, ) -> ParseResult<Url>1348     fn with_query_and_fragment(
1349         mut self,
1350         scheme_type: SchemeType,
1351         scheme_end: u32,
1352         username_end: u32,
1353         host_start: u32,
1354         host_end: u32,
1355         host: HostInternal,
1356         port: Option<u16>,
1357         path_start: u32,
1358         remaining: Input<'_>,
1359     ) -> ParseResult<Url> {
1360         let (query_start, fragment_start) =
1361             self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?;
1362         Ok(Url {
1363             serialization: self.serialization,
1364             scheme_end,
1365             username_end,
1366             host_start,
1367             host_end,
1368             host,
1369             port,
1370             path_start,
1371             query_start,
1372             fragment_start,
1373         })
1374     }
1375 
1376     /// Return (query_start, fragment_start)
parse_query_and_fragment( &mut self, scheme_type: SchemeType, scheme_end: u32, mut input: Input<'_>, ) -> ParseResult<(Option<u32>, Option<u32>)>1377     fn parse_query_and_fragment(
1378         &mut self,
1379         scheme_type: SchemeType,
1380         scheme_end: u32,
1381         mut input: Input<'_>,
1382     ) -> ParseResult<(Option<u32>, Option<u32>)> {
1383         let mut query_start = None;
1384         match input.next() {
1385             Some('#') => {}
1386             Some('?') => {
1387                 query_start = Some(to_u32(self.serialization.len())?);
1388                 self.serialization.push('?');
1389                 let remaining = self.parse_query(scheme_type, scheme_end, input);
1390                 if let Some(remaining) = remaining {
1391                     input = remaining
1392                 } else {
1393                     return Ok((query_start, None));
1394                 }
1395             }
1396             None => return Ok((None, None)),
1397             _ => panic!("Programming error. parse_query_and_fragment() called without ? or #"),
1398         }
1399 
1400         let fragment_start = to_u32(self.serialization.len())?;
1401         self.serialization.push('#');
1402         self.parse_fragment(input);
1403         Ok((query_start, Some(fragment_start)))
1404     }
1405 
parse_query<'i>( &mut self, scheme_type: SchemeType, scheme_end: u32, mut input: Input<'i>, ) -> Option<Input<'i>>1406     pub fn parse_query<'i>(
1407         &mut self,
1408         scheme_type: SchemeType,
1409         scheme_end: u32,
1410         mut input: Input<'i>,
1411     ) -> Option<Input<'i>> {
1412         let len = input.chars.as_str().len();
1413         let mut query = String::with_capacity(len); // FIXME: use a streaming decoder instead
1414         let mut remaining = None;
1415         while let Some(c) = input.next() {
1416             if c == '#' && self.context == Context::UrlParser {
1417                 remaining = Some(input);
1418                 break;
1419             } else {
1420                 self.check_url_code_point(c, &input);
1421                 query.push(c);
1422             }
1423         }
1424 
1425         let encoding = match &self.serialization[..scheme_end as usize] {
1426             "http" | "https" | "file" | "ftp" => self.query_encoding_override,
1427             _ => None,
1428         };
1429         let query_bytes = if let Some(o) = encoding {
1430             o(&query)
1431         } else {
1432             query.as_bytes().into()
1433         };
1434         let set = if scheme_type.is_special() {
1435             SPECIAL_QUERY
1436         } else {
1437             QUERY
1438         };
1439         self.serialization.extend(percent_encode(&query_bytes, set));
1440         remaining
1441     }
1442 
fragment_only(mut self, base_url: &Url, mut input: Input<'_>) -> ParseResult<Url>1443     fn fragment_only(mut self, base_url: &Url, mut input: Input<'_>) -> ParseResult<Url> {
1444         let before_fragment = match base_url.fragment_start {
1445             Some(i) => base_url.slice(..i),
1446             None => &*base_url.serialization,
1447         };
1448         debug_assert!(self.serialization.is_empty());
1449         self.serialization
1450             .reserve(before_fragment.len() + input.chars.as_str().len());
1451         self.serialization.push_str(before_fragment);
1452         self.serialization.push('#');
1453         let next = input.next();
1454         debug_assert!(next == Some('#'));
1455         self.parse_fragment(input);
1456         Ok(Url {
1457             serialization: self.serialization,
1458             fragment_start: Some(to_u32(before_fragment.len())?),
1459             ..*base_url
1460         })
1461     }
1462 
parse_fragment(&mut self, mut input: Input<'_>)1463     pub fn parse_fragment(&mut self, mut input: Input<'_>) {
1464         while let Some((c, utf8_c)) = input.next_utf8() {
1465             if c == '\0' {
1466                 self.log_violation(SyntaxViolation::NullInFragment)
1467             } else {
1468                 self.check_url_code_point(c, &input);
1469             }
1470             self.serialization
1471                 .extend(utf8_percent_encode(utf8_c, FRAGMENT));
1472         }
1473     }
1474 
check_url_code_point(&self, c: char, input: &Input<'_>)1475     fn check_url_code_point(&self, c: char, input: &Input<'_>) {
1476         if let Some(vfn) = self.violation_fn {
1477             if c == '%' {
1478                 let mut input = input.clone();
1479                 if !matches!((input.next(), input.next()), (Some(a), Some(b))
1480                              if is_ascii_hex_digit(a) && is_ascii_hex_digit(b))
1481                 {
1482                     vfn(SyntaxViolation::PercentDecode)
1483                 }
1484             } else if !is_url_code_point(c) {
1485                 vfn(SyntaxViolation::NonUrlCodePoint)
1486             }
1487         }
1488     }
1489 }
1490 
/// Whether `c` is one of `0-9`, `a-f` or `A-F`.
#[inline]
fn is_ascii_hex_digit(c: char) -> bool {
    c.is_ascii_hexdigit()
}
1495 
// URL code points (https://url.spec.whatwg.org/#url-code-points):
// ASCII alphanumerics, the punctuation listed in the match below, and
// U+00A0 to U+10FFFD excluding surrogates and noncharacters.
//
// Non URL code points:
// U+0000 to U+0020 (space)
// " # % < > [ \ ] ^ ` { | }
// U+007F to U+009F
// surrogates
// U+FDD0 to U+FDEF
// Last two of each plane: U+__FFFE to U+__FFFF for __ in 00 to 10 hex
#[inline]
fn is_url_code_point(c: char) -> bool {
    match c {
        'a'..='z' | 'A'..='Z' | '0'..='9' => true,
        '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | '-' | '.' | '/' | ':' | ';'
        | '=' | '?' | '@' | '_' | '~' => true,
        // A `char` can never be a surrogate, so within this range only the
        // noncharacters have to be excluded. Expressing the rule this way
        // (rather than listing one range per plane) keeps every plane,
        // including U+E0000..=U+E0FFF, consistent with the list above.
        '\u{A0}'..='\u{10FFFD}' => {
            let in_fdd0_block = matches!(c, '\u{FDD0}'..='\u{FDEF}');
            // Low 16 bits equal to FFFE or FFFF: the last two of a plane.
            let last_two_of_plane = ((c as u32) & 0xFFFE) == 0xFFFE;
            !(in_fdd0_block || last_two_of_plane)
        }
        _ => false,
    }
}
1521 
/// Whether `ch` is a C0 control or a space, i.e. in U+0000 to U+0020.
///
/// https://url.spec.whatwg.org/#c0-controls-and-space
#[inline]
fn c0_control_or_space(ch: char) -> bool {
    matches!(ch, '\u{0}'..='\u{20}')
}
1527 
/// Whether `ch` is a tab (U+0009), line feed (U+000A) or carriage return
/// (U+000D).
///
/// https://infra.spec.whatwg.org/#ascii-tab-or-newline
#[inline]
fn ascii_tab_or_new_line(ch: char) -> bool {
    ch == '\t' || ch == '\n' || ch == '\r'
}
1533 
/// Whether `ch` is an ASCII letter (`a-z` or `A-Z`).
///
/// https://url.spec.whatwg.org/#ascii-alpha
#[inline]
pub fn ascii_alpha(ch: char) -> bool {
    ch.is_ascii_alphabetic()
}
1539 
1540 #[inline]
to_u32(i: usize) -> ParseResult<u32>1541 pub fn to_u32(i: usize) -> ParseResult<u32> {
1542     if i <= ::std::u32::MAX as usize {
1543         Ok(i as u32)
1544     } else {
1545         Err(ParseError::Overflow)
1546     }
1547 }
1548 
is_normalized_windows_drive_letter(segment: &str) -> bool1549 fn is_normalized_windows_drive_letter(segment: &str) -> bool {
1550     is_windows_drive_letter(segment) && segment.as_bytes()[1] == b':'
1551 }
1552 
1553 /// Whether the scheme is file:, the path has a single segment, and that segment
1554 /// is a Windows drive letter
1555 #[inline]
is_windows_drive_letter(segment: &str) -> bool1556 pub fn is_windows_drive_letter(segment: &str) -> bool {
1557     segment.len() == 2 && starts_with_windows_drive_letter(segment)
1558 }
1559 
1560 /// Whether path starts with a root slash
1561 /// and a windows drive letter eg: "/c:" or "/a:/"
path_starts_with_windows_drive_letter(s: &str) -> bool1562 fn path_starts_with_windows_drive_letter(s: &str) -> bool {
1563     if let Some(c) = s.as_bytes().first() {
1564         matches!(c, b'/' | b'\\' | b'?' | b'#') && starts_with_windows_drive_letter(&s[1..])
1565     } else {
1566         false
1567     }
1568 }
1569 
/// Whether `s` starts with a Windows drive letter: an ASCII letter, then `:`
/// or `|`, and then either the end of the string or one of `/`, `\`, `?`, `#`.
fn starts_with_windows_drive_letter(s: &str) -> bool {
    match s.as_bytes() {
        [letter, separator, rest @ ..] => {
            letter.is_ascii_alphabetic()
                && matches!(separator, b':' | b'|')
                && matches!(
                    rest.first(),
                    None | Some(b'/') | Some(b'\\') | Some(b'?') | Some(b'#')
                )
        }
        // Fewer than two bytes can never hold a drive letter.
        _ => false,
    }
}
1576 
1577 /// https://url.spec.whatwg.org/#start-with-a-windows-drive-letter
starts_with_windows_drive_letter_segment(input: &Input<'_>) -> bool1578 fn starts_with_windows_drive_letter_segment(input: &Input<'_>) -> bool {
1579     let mut input = input.clone();
1580     match (input.next(), input.next(), input.next()) {
1581         // its first two code points are a Windows drive letter
1582         // its third code point is U+002F (/), U+005C (\), U+003F (?), or U+0023 (#).
1583         (Some(a), Some(b), Some(c))
1584             if ascii_alpha(a) && matches!(b, ':' | '|') && matches!(c, '/' | '\\' | '?' | '#') =>
1585         {
1586             true
1587         }
1588         // its first two code points are a Windows drive letter
1589         // its length is 2
1590         (Some(a), Some(b), None) if ascii_alpha(a) && matches!(b, ':' | '|') => true,
1591         _ => false,
1592     }
1593 }
1594