1 // Copyright 2013-2016 The rust-url developers.
2 //
3 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6 // option. This file may not be copied, modified, or distributed
7 // except according to those terms.
8
9 use std::error::Error;
10 use std::fmt::{self, Formatter, Write};
11 use std::str;
12
13 use crate::host::{Host, HostInternal};
14 use crate::Url;
15 use form_urlencoded::EncodingOverride;
16 use percent_encoding::{percent_encode, utf8_percent_encode, AsciiSet, CONTROLS};
17
/// Fragment percent-encode set: `CONTROLS` plus space, `"`, `<`, `>` and `` ` ``.
///
/// https://url.spec.whatwg.org/#fragment-percent-encode-set
const FRAGMENT: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'<').add(b'>').add(b'`');

/// Path percent-encode set: `FRAGMENT` plus `#`, `?`, `{` and `}`.
///
/// https://url.spec.whatwg.org/#path-percent-encode-set
const PATH: &AsciiSet = &FRAGMENT.add(b'#').add(b'?').add(b'{').add(b'}');

/// Userinfo percent-encode set: `PATH` plus the ten bytes added below.
///
/// https://url.spec.whatwg.org/#userinfo-percent-encode-set
pub(crate) const USERINFO: &AsciiSet = &PATH
    .add(b'/')
    .add(b':')
    .add(b';')
    .add(b'=')
    .add(b'@')
    .add(b'[')
    .add(b'\\')
    .add(b']')
    .add(b'^')
    .add(b'|');

/// Set used for a single path segment: `PATH` plus `/` and `%`.
pub(crate) const PATH_SEGMENT: &AsciiSet = &PATH.add(b'/').add(b'%');

// The backslash (\) character is treated as a path separator in special URLs
// so it needs to be additionally escaped in that case.
pub(crate) const SPECIAL_PATH_SEGMENT: &AsciiSet = &PATH_SEGMENT.add(b'\\');

// https://url.spec.whatwg.org/#query-state
/// Query set: `CONTROLS` plus space, `"`, `#`, `<` and `>`.
const QUERY: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'#').add(b'<').add(b'>');
/// `QUERY` plus the single quote (`'`).
const SPECIAL_QUERY: &AsciiSet = &QUERY.add(b'\'');

/// Result alias whose error type is [`ParseError`].
pub type ParseResult<T> = Result<T, ParseError>;
48
// Generates the field-less `ParseError` enum together with a `Display` impl
// that writes the description paired with each variant.
macro_rules! simple_enum_error {
    ($($name: ident => $description: expr,)+) => {
        /// Errors that can occur during parsing.
        ///
        /// More variants may be added in the future, which is why the enum
        /// is marked `#[non_exhaustive]`.
        #[derive(PartialEq, Eq, Clone, Copy, Debug)]
        #[non_exhaustive]
        pub enum ParseError {
            $(
                $name,
            )+
        }

        impl fmt::Display for ParseError {
            fn fmt(&self, formatter: &mut Formatter<'_>) -> fmt::Result {
                // Pick the text first, then write it verbatim.
                let description: &str = match self {
                    $(
                        ParseError::$name => $description,
                    )+
                };
                formatter.write_str(description)
            }
        }
    }
}
74
// Every method of `Error` has a default implementation, so the body is empty.
// The `Debug` and `Display` impls that `Error` requires are generated by the
// `simple_enum_error!` macro.
impl Error for ParseError {}
76
77 simple_enum_error! {
78 EmptyHost => "empty host",
79 IdnaError => "invalid international domain name",
80 InvalidPort => "invalid port number",
81 InvalidIpv4Address => "invalid IPv4 address",
82 InvalidIpv6Address => "invalid IPv6 address",
83 InvalidDomainCharacter => "invalid domain character",
84 RelativeUrlWithoutBase => "relative URL without a base",
85 RelativeUrlWithCannotBeABaseBase => "relative URL with a cannot-be-a-base base",
86 SetHostOnCannotBeABaseUrl => "a cannot-be-a-base URL doesn’t have a host to set",
87 Overflow => "URLs more than 4 GB are not supported",
88 }
89
90 impl From<::idna::Errors> for ParseError {
from(_: ::idna::Errors) -> ParseError91 fn from(_: ::idna::Errors) -> ParseError {
92 ParseError::IdnaError
93 }
94 }
95
// Generates the field-less `SyntaxViolation` enum. Each variant's description
// literal is used twice: as the variant's rustdoc text (inside a `text` code
// fence) and as the value returned by `description()`.
macro_rules! syntax_violation_enum {
    ($($name: ident => $description: literal,)+) => {
        /// Non-fatal syntax violations that can occur during parsing.
        ///
        /// The enum is `#[non_exhaustive]`: more variants may be added in
        /// the future, so it cannot be matched exhaustively.
        #[derive(PartialEq, Eq, Clone, Copy, Debug)]
        #[non_exhaustive]
        pub enum SyntaxViolation {
            $(
                /// ```text
                #[doc = $description]
                /// ```
                $name,
            )+
        }

        impl SyntaxViolation {
            /// Returns the fixed description associated with this violation.
            pub fn description(&self) -> &'static str {
                match self {
                    $(
                        Self::$name => $description,
                    )+
                }
            }
        }
    }
}
124
// The `SyntaxViolation` variants. Each string is both the variant's rustdoc
// text and the value returned by `SyntaxViolation::description()`, so these
// literals are observable at runtime.
syntax_violation_enum! {
    Backslash => "backslash",
    C0SpaceIgnored =>
        "leading or trailing control or space character are ignored in URLs",
    EmbeddedCredentials =>
        "embedding authentication information (username or password) \
         in an URL is not recommended",
    ExpectedDoubleSlash => "expected //",
    ExpectedFileDoubleSlash => "expected // after file:",
    FileWithHostAndWindowsDrive => "file: with host and Windows drive letter",
    NonUrlCodePoint => "non-URL code point",
    NullInFragment => "NULL characters are ignored in URL fragment identifiers",
    PercentDecode => "expected 2 hex digits after %",
    TabOrNewlineIgnored => "tabs or newlines are ignored in URLs",
    UnencodedAtSign => "unencoded @ sign in username or password",
}
141
142 impl fmt::Display for SyntaxViolation {
fmt(&self, f: &mut Formatter<'_>) -> fmt::Result143 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
144 fmt::Display::fmt(self.description(), f)
145 }
146 }
147
/// Coarse classification of a URL scheme, as produced by the `From` impl below.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum SchemeType {
    /// The `file` scheme.
    File,
    /// One of `http`, `https`, `ws`, `wss` or `ftp`.
    SpecialNotFile,
    /// Any other scheme.
    NotSpecial,
}

impl SchemeType {
    /// Returns `true` for every variant except [`SchemeType::NotSpecial`].
    pub fn is_special(&self) -> bool {
        *self != SchemeType::NotSpecial
    }

    /// Returns `true` only for [`SchemeType::File`].
    pub fn is_file(&self) -> bool {
        *self == SchemeType::File
    }
}

/// Classifies a scheme string. The comparison is exact (case-sensitive), so
/// callers are expected to pass an already lower-cased scheme.
impl<T: AsRef<str>> From<T> for SchemeType {
    fn from(s: T) -> Self {
        match s.as_ref() {
            "file" => SchemeType::File,
            "http" | "https" | "ws" | "wss" | "ftp" => SchemeType::SpecialNotFile,
            _ => SchemeType::NotSpecial,
        }
    }
}
174
/// Returns the default port of `scheme`, or `None` when the scheme has no
/// default port known to this function.
///
/// The match is exact and case-sensitive.
pub fn default_port(scheme: &str) -> Option<u16> {
    let port = match scheme {
        "ftp" => 21,
        "http" | "ws" => 80,
        "https" | "wss" => 443,
        _ => return None,
    };
    Some(port)
}
183
/// Cursor over the characters of the text being parsed.
///
/// The `Iterator` impl for this type skips ASCII tab, line feed and carriage
/// return, so code consuming an `Input` never sees those characters.
#[derive(Clone, Debug)]
pub struct Input<'i> {
    /// Underlying character iterator over the (possibly trimmed) input.
    chars: str::Chars<'i>,
}
188
189 impl<'i> Input<'i> {
new_no_trim(input: &'i str) -> Self190 pub fn new_no_trim(input: &'i str) -> Self {
191 Input {
192 chars: input.chars(),
193 }
194 }
195
new_trim_tab_and_newlines( original_input: &'i str, vfn: Option<&dyn Fn(SyntaxViolation)>, ) -> Self196 pub fn new_trim_tab_and_newlines(
197 original_input: &'i str,
198 vfn: Option<&dyn Fn(SyntaxViolation)>,
199 ) -> Self {
200 let input = original_input.trim_matches(ascii_tab_or_new_line);
201 if let Some(vfn) = vfn {
202 if input.len() < original_input.len() {
203 vfn(SyntaxViolation::C0SpaceIgnored)
204 }
205 if input.chars().any(|c| matches!(c, '\t' | '\n' | '\r')) {
206 vfn(SyntaxViolation::TabOrNewlineIgnored)
207 }
208 }
209 Input {
210 chars: input.chars(),
211 }
212 }
213
new_trim_c0_control_and_space( original_input: &'i str, vfn: Option<&dyn Fn(SyntaxViolation)>, ) -> Self214 pub fn new_trim_c0_control_and_space(
215 original_input: &'i str,
216 vfn: Option<&dyn Fn(SyntaxViolation)>,
217 ) -> Self {
218 let input = original_input.trim_matches(c0_control_or_space);
219 if let Some(vfn) = vfn {
220 if input.len() < original_input.len() {
221 vfn(SyntaxViolation::C0SpaceIgnored)
222 }
223 if input.chars().any(|c| matches!(c, '\t' | '\n' | '\r')) {
224 vfn(SyntaxViolation::TabOrNewlineIgnored)
225 }
226 }
227 Input {
228 chars: input.chars(),
229 }
230 }
231
232 #[inline]
is_empty(&self) -> bool233 pub fn is_empty(&self) -> bool {
234 self.clone().next().is_none()
235 }
236
237 #[inline]
starts_with<P: Pattern>(&self, p: P) -> bool238 fn starts_with<P: Pattern>(&self, p: P) -> bool {
239 p.split_prefix(&mut self.clone())
240 }
241
242 #[inline]
split_prefix<P: Pattern>(&self, p: P) -> Option<Self>243 pub fn split_prefix<P: Pattern>(&self, p: P) -> Option<Self> {
244 let mut remaining = self.clone();
245 if p.split_prefix(&mut remaining) {
246 Some(remaining)
247 } else {
248 None
249 }
250 }
251
252 #[inline]
split_first(&self) -> (Option<char>, Self)253 fn split_first(&self) -> (Option<char>, Self) {
254 let mut remaining = self.clone();
255 (remaining.next(), remaining)
256 }
257
258 #[inline]
count_matching<F: Fn(char) -> bool>(&self, f: F) -> (u32, Self)259 fn count_matching<F: Fn(char) -> bool>(&self, f: F) -> (u32, Self) {
260 let mut count = 0;
261 let mut remaining = self.clone();
262 loop {
263 let mut input = remaining.clone();
264 if matches!(input.next(), Some(c) if f(c)) {
265 remaining = input;
266 count += 1;
267 } else {
268 return (count, remaining);
269 }
270 }
271 }
272
273 #[inline]
next_utf8(&mut self) -> Option<(char, &'i str)>274 fn next_utf8(&mut self) -> Option<(char, &'i str)> {
275 loop {
276 let utf8 = self.chars.as_str();
277 match self.chars.next() {
278 Some(c) => {
279 if !matches!(c, '\t' | '\n' | '\r') {
280 return Some((c, &utf8[..c.len_utf8()]));
281 }
282 }
283 None => return None,
284 }
285 }
286 }
287 }
288
/// Something that can be matched against the front of an [`Input`].
pub trait Pattern {
    /// Tries to consume this pattern from the front of `input` and returns
    /// whether it matched. `input` may have been advanced even when the
    /// result is `false`, so callers that need the original position should
    /// pass a clone.
    fn split_prefix(self, input: &mut Input) -> bool;
}
292
293 impl Pattern for char {
split_prefix(self, input: &mut Input) -> bool294 fn split_prefix(self, input: &mut Input) -> bool {
295 input.next() == Some(self)
296 }
297 }
298
299 impl<'a> Pattern for &'a str {
split_prefix(self, input: &mut Input) -> bool300 fn split_prefix(self, input: &mut Input) -> bool {
301 for c in self.chars() {
302 if input.next() != Some(c) {
303 return false;
304 }
305 }
306 true
307 }
308 }
309
310 impl<F: FnMut(char) -> bool> Pattern for F {
split_prefix(self, input: &mut Input) -> bool311 fn split_prefix(self, input: &mut Input) -> bool {
312 input.next().map_or(false, self)
313 }
314 }
315
316 impl<'i> Iterator for Input<'i> {
317 type Item = char;
next(&mut self) -> Option<char>318 fn next(&mut self) -> Option<char> {
319 self.chars
320 .by_ref()
321 .find(|&c| !matches!(c, '\t' | '\n' | '\r'))
322 }
323 }
324
/// State carried through one URL-parsing run.
pub struct Parser<'a> {
    /// Buffer into which the serialized URL is accumulated while parsing.
    pub serialization: String,
    /// Base URL used to resolve input that has no scheme of its own, if any.
    pub base_url: Option<&'a Url>,
    /// Encoding override for the query.
    /// NOTE(review): its semantics are defined by `form_urlencoded` and are
    /// not visible in this part of the file.
    pub query_encoding_override: EncodingOverride<'a>,
    /// Optional callback invoked with each non-fatal [`SyntaxViolation`].
    pub violation_fn: Option<&'a dyn Fn(SyntaxViolation)>,
    /// Which kind of caller is driving the parser (see [`Context`]).
    pub context: Context,
}
332
/// Identifies what is driving the [`Parser`].
#[derive(PartialEq, Eq, Copy, Clone, Debug)]
pub enum Context {
    /// NOTE(review): presumably a full URL parse; no constructor using this
    /// variant is visible in this part of the file.
    UrlParser,
    /// Used by `Parser::for_setter`. In this context `parse_scheme` accepts
    /// input that ends before a `:` is seen.
    Setter,
    /// NOTE(review): presumably used when setting path segments; its use is
    /// not visible in this part of the file.
    PathSegmentSetter,
}
339
340 impl<'a> Parser<'a> {
log_violation(&self, v: SyntaxViolation)341 fn log_violation(&self, v: SyntaxViolation) {
342 if let Some(f) = self.violation_fn {
343 f(v)
344 }
345 }
346
log_violation_if(&self, v: SyntaxViolation, test: impl FnOnce() -> bool)347 fn log_violation_if(&self, v: SyntaxViolation, test: impl FnOnce() -> bool) {
348 if let Some(f) = self.violation_fn {
349 if test() {
350 f(v)
351 }
352 }
353 }
354
for_setter(serialization: String) -> Parser<'a>355 pub fn for_setter(serialization: String) -> Parser<'a> {
356 Parser {
357 serialization,
358 base_url: None,
359 query_encoding_override: None,
360 violation_fn: None,
361 context: Context::Setter,
362 }
363 }
364
365 /// https://url.spec.whatwg.org/#concept-basic-url-parser
parse_url(mut self, input: &str) -> ParseResult<Url>366 pub fn parse_url(mut self, input: &str) -> ParseResult<Url> {
367 let input = Input::new_trim_c0_control_and_space(input, self.violation_fn);
368 if let Ok(remaining) = self.parse_scheme(input.clone()) {
369 return self.parse_with_scheme(remaining);
370 }
371
372 // No-scheme state
373 if let Some(base_url) = self.base_url {
374 if input.starts_with('#') {
375 self.fragment_only(base_url, input)
376 } else if base_url.cannot_be_a_base() {
377 Err(ParseError::RelativeUrlWithCannotBeABaseBase)
378 } else {
379 let scheme_type = SchemeType::from(base_url.scheme());
380 if scheme_type.is_file() {
381 self.parse_file(input, scheme_type, Some(base_url))
382 } else {
383 self.parse_relative(input, scheme_type, base_url)
384 }
385 }
386 } else {
387 Err(ParseError::RelativeUrlWithoutBase)
388 }
389 }
390
parse_scheme<'i>(&mut self, mut input: Input<'i>) -> Result<Input<'i>, ()>391 pub fn parse_scheme<'i>(&mut self, mut input: Input<'i>) -> Result<Input<'i>, ()> {
392 if input.is_empty() || !input.starts_with(ascii_alpha) {
393 return Err(());
394 }
395 debug_assert!(self.serialization.is_empty());
396 while let Some(c) = input.next() {
397 match c {
398 'a'..='z' | 'A'..='Z' | '0'..='9' | '+' | '-' | '.' => {
399 self.serialization.push(c.to_ascii_lowercase())
400 }
401 ':' => return Ok(input),
402 _ => {
403 self.serialization.clear();
404 return Err(());
405 }
406 }
407 }
408 // EOF before ':'
409 if self.context == Context::Setter {
410 Ok(input)
411 } else {
412 self.serialization.clear();
413 Err(())
414 }
415 }
416
parse_with_scheme(mut self, input: Input<'_>) -> ParseResult<Url>417 fn parse_with_scheme(mut self, input: Input<'_>) -> ParseResult<Url> {
418 use crate::SyntaxViolation::{ExpectedDoubleSlash, ExpectedFileDoubleSlash};
419 let scheme_end = to_u32(self.serialization.len())?;
420 let scheme_type = SchemeType::from(&self.serialization);
421 self.serialization.push(':');
422 match scheme_type {
423 SchemeType::File => {
424 self.log_violation_if(ExpectedFileDoubleSlash, || !input.starts_with("//"));
425 let base_file_url = self.base_url.and_then(|base| {
426 if base.scheme() == "file" {
427 Some(base)
428 } else {
429 None
430 }
431 });
432 self.serialization.clear();
433 self.parse_file(input, scheme_type, base_file_url)
434 }
435 SchemeType::SpecialNotFile => {
436 // special relative or authority state
437 let (slashes_count, remaining) = input.count_matching(|c| matches!(c, '/' | '\\'));
438 if let Some(base_url) = self.base_url {
439 if slashes_count < 2
440 && base_url.scheme() == &self.serialization[..scheme_end as usize]
441 {
442 // "Cannot-be-a-base" URLs only happen with "not special" schemes.
443 debug_assert!(!base_url.cannot_be_a_base());
444 self.serialization.clear();
445 return self.parse_relative(input, scheme_type, base_url);
446 }
447 }
448 // special authority slashes state
449 self.log_violation_if(ExpectedDoubleSlash, || {
450 input
451 .clone()
452 .take_while(|&c| matches!(c, '/' | '\\'))
453 .collect::<String>()
454 != "//"
455 });
456 self.after_double_slash(remaining, scheme_type, scheme_end)
457 }
458 SchemeType::NotSpecial => self.parse_non_special(input, scheme_type, scheme_end),
459 }
460 }
461
462 /// Scheme other than file, http, https, ws, ws, ftp.
parse_non_special( mut self, input: Input<'_>, scheme_type: SchemeType, scheme_end: u32, ) -> ParseResult<Url>463 fn parse_non_special(
464 mut self,
465 input: Input<'_>,
466 scheme_type: SchemeType,
467 scheme_end: u32,
468 ) -> ParseResult<Url> {
469 // path or authority state (
470 if let Some(input) = input.split_prefix("//") {
471 return self.after_double_slash(input, scheme_type, scheme_end);
472 }
473 // Anarchist URL (no authority)
474 let path_start = to_u32(self.serialization.len())?;
475 let username_end = path_start;
476 let host_start = path_start;
477 let host_end = path_start;
478 let host = HostInternal::None;
479 let port = None;
480 let remaining = if let Some(input) = input.split_prefix('/') {
481 self.serialization.push('/');
482 self.parse_path(scheme_type, &mut false, path_start as usize, input)
483 } else {
484 self.parse_cannot_be_a_base_path(input)
485 };
486 self.with_query_and_fragment(
487 scheme_type,
488 scheme_end,
489 username_end,
490 host_start,
491 host_end,
492 host,
493 port,
494 path_start,
495 remaining,
496 )
497 }
498
/// Parses the part of a `file:` URL that follows the scheme, or a
/// scheme-less input resolved against a file base URL.
///
/// `self.serialization` must be empty on entry (checked by a debug
/// assertion); every branch writes the `file:` prefix itself or copies it
/// from the base. `base_file_url`, when present, is the base URL used for
/// resolution.
///
/// # Errors
///
/// Propagates errors from host parsing, `to_u32` and query/fragment parsing.
fn parse_file(
    mut self,
    input: Input<'_>,
    scheme_type: SchemeType,
    base_file_url: Option<&Url>,
) -> ParseResult<Url> {
    use crate::SyntaxViolation::Backslash;
    // file state
    debug_assert!(self.serialization.is_empty());
    let (first_char, input_after_first_char) = input.split_first();
    if matches!(first_char, Some('/') | Some('\\')) {
        self.log_violation_if(SyntaxViolation::Backslash, || first_char == Some('\\'));
        // file slash state
        let (next_char, input_after_next_char) = input_after_first_char.split_first();
        if matches!(next_char, Some('/') | Some('\\')) {
            self.log_violation_if(Backslash, || next_char == Some('\\'));
            // file host state
            self.serialization.push_str("file://");
            let scheme_end = "file".len() as u32;
            let host_start = "file://".len() as u32;
            // Note: `path_start` here is the boolean returned first by
            // `parse_file_host` (true when a host was written), not an offset.
            let (path_start, mut host, remaining) =
                self.parse_file_host(input_after_next_char)?;
            let mut host_end = to_u32(self.serialization.len())?;
            let mut has_host = !matches!(host, HostInternal::None);
            let remaining = if path_start {
                self.parse_path_start(SchemeType::File, &mut has_host, remaining)
            } else {
                let path_start = self.serialization.len();
                self.serialization.push('/');
                self.parse_path(SchemeType::File, &mut has_host, path_start, remaining)
            };

            // For file URLs that have a host and whose path starts
            // with the windows drive letter we just remove the host.
            if !has_host {
                self.serialization
                    .drain(host_start as usize..host_end as usize);
                host_end = host_start;
                host = HostInternal::None;
            }
            let (query_start, fragment_start) =
                self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?;
            return Ok(Url {
                serialization: self.serialization,
                scheme_end,
                username_end: host_start,
                host_start,
                host_end,
                host,
                port: None,
                path_start: host_end,
                query_start,
                fragment_start,
            });
        } else {
            // A single leading slash: no host in the input itself.
            self.serialization.push_str("file://");
            let scheme_end = "file".len() as u32;
            let host_start = "file://".len();
            let mut host_end = host_start;
            let mut host = HostInternal::None;
            if !starts_with_windows_drive_letter_segment(&input_after_first_char) {
                if let Some(base_url) = base_file_url {
                    // Inherit either the base's drive letter or its host.
                    let first_segment = base_url.path_segments().unwrap().next().unwrap();
                    if is_normalized_windows_drive_letter(first_segment) {
                        self.serialization.push('/');
                        self.serialization.push_str(first_segment);
                    } else if let Some(host_str) = base_url.host_str() {
                        self.serialization.push_str(host_str);
                        host_end = self.serialization.len();
                        host = base_url.host;
                    }
                }
            }
            // If c is the EOF code point, U+002F (/), U+005C (\), U+003F (?), or U+0023 (#), then decrease pointer by one
            let parse_path_input = if let Some(c) = first_char {
                if c == '/' || c == '\\' || c == '?' || c == '#' {
                    input
                } else {
                    input_after_first_char
                }
            } else {
                input_after_first_char
            };

            let remaining =
                self.parse_path(SchemeType::File, &mut false, host_end, parse_path_input);

            let host_start = host_start as u32;

            let (query_start, fragment_start) =
                self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?;

            let host_end = host_end as u32;
            return Ok(Url {
                serialization: self.serialization,
                scheme_end,
                username_end: host_start,
                host_start,
                host_end,
                host,
                port: None,
                path_start: host_end,
                query_start,
                fragment_start,
            });
        }
    }
    // The input does not start with a slash or backslash.
    if let Some(base_url) = base_file_url {
        match first_char {
            None => {
                // Copy everything except the fragment
                let before_fragment = match base_url.fragment_start {
                    Some(i) => &base_url.serialization[..i as usize],
                    None => &*base_url.serialization,
                };
                self.serialization.push_str(before_fragment);
                Ok(Url {
                    serialization: self.serialization,
                    fragment_start: None,
                    ..*base_url
                })
            }
            Some('?') => {
                // Copy everything up to the query string
                let before_query = match (base_url.query_start, base_url.fragment_start) {
                    (None, None) => &*base_url.serialization,
                    (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
                };
                self.serialization.push_str(before_query);
                let (query_start, fragment_start) =
                    self.parse_query_and_fragment(scheme_type, base_url.scheme_end, input)?;
                Ok(Url {
                    serialization: self.serialization,
                    query_start,
                    fragment_start,
                    ..*base_url
                })
            }
            Some('#') => self.fragment_only(base_url, input),
            _ => {
                if !starts_with_windows_drive_letter_segment(&input) {
                    // Relative path: start from the base (without its query
                    // and fragment), shorten its path, then parse the input
                    // on top of it.
                    let before_query = match (base_url.query_start, base_url.fragment_start) {
                        (None, None) => &*base_url.serialization,
                        (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
                    };
                    self.serialization.push_str(before_query);
                    self.shorten_path(SchemeType::File, base_url.path_start as usize);
                    let remaining = self.parse_path(
                        SchemeType::File,
                        &mut true,
                        base_url.path_start as usize,
                        input,
                    );
                    self.with_query_and_fragment(
                        SchemeType::File,
                        base_url.scheme_end,
                        base_url.username_end,
                        base_url.host_start,
                        base_url.host_end,
                        base_url.host,
                        base_url.port,
                        base_url.path_start,
                        remaining,
                    )
                } else {
                    // The input starts with a Windows drive letter: the base
                    // is ignored and the URL is host-less.
                    self.serialization.push_str("file:///");
                    let scheme_end = "file".len() as u32;
                    let path_start = "file://".len();
                    let remaining =
                        self.parse_path(SchemeType::File, &mut false, path_start, input);
                    let (query_start, fragment_start) =
                        self.parse_query_and_fragment(SchemeType::File, scheme_end, remaining)?;
                    let path_start = path_start as u32;
                    Ok(Url {
                        serialization: self.serialization,
                        scheme_end,
                        username_end: path_start,
                        host_start: path_start,
                        host_end: path_start,
                        host: HostInternal::None,
                        port: None,
                        path_start,
                        query_start,
                        fragment_start,
                    })
                }
            }
        }
    } else {
        // No base URL: host-less file URL whose path is the whole input.
        self.serialization.push_str("file:///");
        let scheme_end = "file".len() as u32;
        let path_start = "file://".len();
        let remaining = self.parse_path(SchemeType::File, &mut false, path_start, input);
        let (query_start, fragment_start) =
            self.parse_query_and_fragment(SchemeType::File, scheme_end, remaining)?;
        let path_start = path_start as u32;
        Ok(Url {
            serialization: self.serialization,
            scheme_end,
            username_end: path_start,
            host_start: path_start,
            host_end: path_start,
            host: HostInternal::None,
            port: None,
            path_start,
            query_start,
            fragment_start,
        })
    }
}
709
/// Resolves `input` against `base_url` (the spec's relative state).
///
/// `self.serialization` must be empty on entry (checked by a debug
/// assertion). The branch taken depends on the first character of `input`:
/// end of input, `?`, `#`, a slash or backslash, or anything else.
///
/// # Errors
///
/// Propagates errors from the authority, query and fragment parsers.
fn parse_relative(
    mut self,
    input: Input<'_>,
    scheme_type: SchemeType,
    base_url: &Url,
) -> ParseResult<Url> {
    // relative state
    debug_assert!(self.serialization.is_empty());
    let (first_char, input_after_first_char) = input.split_first();
    match first_char {
        None => {
            // Copy everything except the fragment
            let before_fragment = match base_url.fragment_start {
                Some(i) => &base_url.serialization[..i as usize],
                None => &*base_url.serialization,
            };
            self.serialization.push_str(before_fragment);
            Ok(Url {
                serialization: self.serialization,
                fragment_start: None,
                ..*base_url
            })
        }
        Some('?') => {
            // Copy everything up to the query string
            let before_query = match (base_url.query_start, base_url.fragment_start) {
                (None, None) => &*base_url.serialization,
                (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
            };
            self.serialization.push_str(before_query);
            let (query_start, fragment_start) =
                self.parse_query_and_fragment(scheme_type, base_url.scheme_end, input)?;
            Ok(Url {
                serialization: self.serialization,
                query_start,
                fragment_start,
                ..*base_url
            })
        }
        Some('#') => self.fragment_only(base_url, input),
        Some('/') | Some('\\') => {
            let (slashes_count, remaining) = input.count_matching(|c| matches!(c, '/' | '\\'));
            if slashes_count >= 2 {
                // Two or more slashes: keep only the base's scheme and parse
                // a fresh authority.
                self.log_violation_if(SyntaxViolation::ExpectedDoubleSlash, || {
                    input
                        .clone()
                        .take_while(|&c| matches!(c, '/' | '\\'))
                        .collect::<String>()
                        != "//"
                });
                let scheme_end = base_url.scheme_end;
                debug_assert!(base_url.byte_at(scheme_end) == b':');
                self.serialization
                    .push_str(base_url.slice(..scheme_end + 1));
                // With a literal "//" prefix only those two characters are
                // skipped; otherwise the whole run of slashes is skipped.
                if let Some(after_prefix) = input.split_prefix("//") {
                    return self.after_double_slash(after_prefix, scheme_type, scheme_end);
                }
                return self.after_double_slash(remaining, scheme_type, scheme_end);
            }
            // Single slash: keep the base up to its path and parse an
            // absolute path.
            let path_start = base_url.path_start;
            self.serialization.push_str(base_url.slice(..path_start));
            self.serialization.push('/');
            let remaining = self.parse_path(
                scheme_type,
                &mut true,
                path_start as usize,
                input_after_first_char,
            );
            self.with_query_and_fragment(
                scheme_type,
                base_url.scheme_end,
                base_url.username_end,
                base_url.host_start,
                base_url.host_end,
                base_url.host,
                base_url.port,
                base_url.path_start,
                remaining,
            )
        }
        _ => {
            let before_query = match (base_url.query_start, base_url.fragment_start) {
                (None, None) => &*base_url.serialization,
                (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
            };
            self.serialization.push_str(before_query);
            // FIXME spec says just "remove last entry", not the "pop" algorithm
            self.pop_path(scheme_type, base_url.path_start as usize);
            // A special url always has a path.
            // A path always starts with '/'
            if self.serialization.len() == base_url.path_start as usize
                && (SchemeType::from(base_url.scheme()).is_special() || !input.is_empty())
            {
                self.serialization.push('/');
            }
            let remaining = match input.split_first() {
                (Some('/'), remaining) => self.parse_path(
                    scheme_type,
                    &mut true,
                    base_url.path_start as usize,
                    remaining,
                ),
                _ => {
                    self.parse_path(scheme_type, &mut true, base_url.path_start as usize, input)
                }
            };
            self.with_query_and_fragment(
                scheme_type,
                base_url.scheme_end,
                base_url.username_end,
                base_url.host_start,
                base_url.host_end,
                base_url.host,
                base_url.port,
                base_url.path_start,
                remaining,
            )
        }
    }
}
830
after_double_slash( mut self, input: Input<'_>, scheme_type: SchemeType, scheme_end: u32, ) -> ParseResult<Url>831 fn after_double_slash(
832 mut self,
833 input: Input<'_>,
834 scheme_type: SchemeType,
835 scheme_end: u32,
836 ) -> ParseResult<Url> {
837 self.serialization.push('/');
838 self.serialization.push('/');
839 // authority state
840 let before_authority = self.serialization.len();
841 let (username_end, remaining) = self.parse_userinfo(input, scheme_type)?;
842 let has_authority = before_authority != self.serialization.len();
843 // host state
844 let host_start = to_u32(self.serialization.len())?;
845 let (host_end, host, port, remaining) =
846 self.parse_host_and_port(remaining, scheme_end, scheme_type)?;
847 if host == HostInternal::None && has_authority {
848 return Err(ParseError::EmptyHost);
849 }
850 // path state
851 let path_start = to_u32(self.serialization.len())?;
852 let remaining = self.parse_path_start(scheme_type, &mut true, remaining);
853 self.with_query_and_fragment(
854 scheme_type,
855 scheme_end,
856 username_end,
857 host_start,
858 host_end,
859 host,
860 port,
861 path_start,
862 remaining,
863 )
864 }
865
/// Parses the optional `username[:password]@` part of an authority.
///
/// Return (username_end, remaining)
///
/// A first pass scans up to the end of the authority to find the last `@`.
/// If there is none, nothing is written and `input` is returned unchanged.
/// Otherwise a second pass percent-encodes the characters before that `@`
/// with the `USERINFO` set and appends them to the serialization.
///
/// # Errors
///
/// Returns `ParseError::EmptyHost` when the input begins with `@` and is
/// immediately followed by an authority terminator; `to_u32` errors are
/// propagated.
fn parse_userinfo<'i>(
    &mut self,
    mut input: Input<'i>,
    scheme_type: SchemeType,
) -> ParseResult<(u32, Input<'i>)> {
    // First pass: remember the position just after the last '@'.
    let mut last_at = None;
    let mut remaining = input.clone();
    let mut char_count = 0;
    while let Some(c) = remaining.next() {
        match c {
            '@' => {
                if last_at.is_some() {
                    self.log_violation(SyntaxViolation::UnencodedAtSign)
                } else {
                    self.log_violation(SyntaxViolation::EmbeddedCredentials)
                }
                // `char_count` is the number of characters before this '@'.
                last_at = Some((char_count, remaining.clone()))
            }
            '/' | '?' | '#' => break,
            '\\' if scheme_type.is_special() => break,
            _ => (),
        }
        char_count += 1;
    }
    let (mut userinfo_char_count, remaining) = match last_at {
        None => return Ok((to_u32(self.serialization.len())?, input)),
        Some((0, remaining)) => {
            // Otherwise, if one of the following is true
            // c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
            // url is special and c is U+005C (\)
            // If @ flag is set and buffer is the empty string, validation error, return failure.
            if let (Some(c), _) = remaining.split_first() {
                if c == '/' || c == '?' || c == '#' || (scheme_type.is_special() && c == '\\') {
                    return Err(ParseError::EmptyHost);
                }
            }
            return Ok((to_u32(self.serialization.len())?, remaining));
        }
        Some(x) => x,
    };

    // Second pass: serialize the characters that precede the last '@'.
    let mut username_end = None;
    let mut has_password = false;
    let mut has_username = false;
    while userinfo_char_count > 0 {
        let (c, utf8_c) = input.next_utf8().unwrap();
        userinfo_char_count -= 1;
        if c == ':' && username_end.is_none() {
            // Start parsing password
            username_end = Some(to_u32(self.serialization.len())?);
            // We don't add a colon if the password is empty
            if userinfo_char_count > 0 {
                self.serialization.push(':');
                has_password = true;
            }
        } else {
            if !has_password {
                has_username = true;
            }
            self.check_url_code_point(c, &input);
            self.serialization
                .extend(utf8_percent_encode(utf8_c, USERINFO));
        }
    }
    let username_end = match username_end {
        Some(i) => i,
        None => to_u32(self.serialization.len())?,
    };
    // The '@' separator is written only when some userinfo was serialized.
    if has_username || has_password {
        self.serialization.push('@');
    }
    Ok((username_end, remaining))
}
940
parse_host_and_port<'i>( &mut self, input: Input<'i>, scheme_end: u32, scheme_type: SchemeType, ) -> ParseResult<(u32, HostInternal, Option<u16>, Input<'i>)>941 fn parse_host_and_port<'i>(
942 &mut self,
943 input: Input<'i>,
944 scheme_end: u32,
945 scheme_type: SchemeType,
946 ) -> ParseResult<(u32, HostInternal, Option<u16>, Input<'i>)> {
947 let (host, remaining) = Parser::parse_host(input, scheme_type)?;
948 write!(&mut self.serialization, "{}", host).unwrap();
949 let host_end = to_u32(self.serialization.len())?;
950 if let Host::Domain(h) = &host {
951 if h.is_empty() {
952 // Port with an empty host
953 if remaining.starts_with(":") {
954 return Err(ParseError::EmptyHost);
955 }
956 if scheme_type.is_special() {
957 return Err(ParseError::EmptyHost);
958 }
959 }
960 };
961
962 let (port, remaining) = if let Some(remaining) = remaining.split_prefix(':') {
963 let scheme = || default_port(&self.serialization[..scheme_end as usize]);
964 Parser::parse_port(remaining, scheme, self.context)?
965 } else {
966 (None, remaining)
967 };
968 if let Some(port) = port {
969 write!(&mut self.serialization, ":{}", port).unwrap()
970 }
971 Ok((host_end, host.into(), port, remaining))
972 }
973
/// Parses a host from the front of `input` and returns it together with the
/// input that follows.
///
/// File schemes are delegated to `get_file_host`. Otherwise the host text
/// runs up to the first `:` outside square brackets, `/`, `?`, `#`, or `\`
/// for special schemes. Non-special schemes are parsed with
/// `Host::parse_opaque`, special ones with `Host::parse`.
///
/// # Errors
///
/// Returns `ParseError::EmptyHost` for an empty host with a
/// `SpecialNotFile` scheme; errors from the `Host` parsers are propagated.
pub fn parse_host(
    mut input: Input<'_>,
    scheme_type: SchemeType,
) -> ParseResult<(Host<String>, Input<'_>)> {
    if scheme_type.is_file() {
        return Parser::get_file_host(input);
    }
    // Undo the Input abstraction here to avoid allocating in the common case
    // where the host part of the input does not contain any tab or newline
    let input_str = input.chars.as_str();
    let mut inside_square_brackets = false;
    let mut has_ignored_chars = false;
    // Number of host characters `Input` would yield (tabs/newlines excluded).
    let mut non_ignored_chars = 0;
    // Byte length of the host in `input_str` (tabs/newlines included).
    let mut bytes = 0;
    for c in input_str.chars() {
        match c {
            ':' if !inside_square_brackets => break,
            '\\' if scheme_type.is_special() => break,
            '/' | '?' | '#' => break,
            '\t' | '\n' | '\r' => {
                has_ignored_chars = true;
            }
            '[' => {
                inside_square_brackets = true;
                non_ignored_chars += 1
            }
            ']' => {
                inside_square_brackets = false;
                non_ignored_chars += 1
            }
            _ => non_ignored_chars += 1,
        }
        bytes += c.len_utf8();
    }
    let replaced: String;
    let host_str;
    {
        let host_input = input.by_ref().take(non_ignored_chars);
        if has_ignored_chars {
            // Collect through `Input` so the ignored characters are dropped.
            replaced = host_input.collect();
            host_str = &*replaced
        } else {
            // Advance `input` past the host, then borrow the host straight
            // from the original text.
            for _ in host_input {}
            host_str = &input_str[..bytes]
        }
    }
    if scheme_type == SchemeType::SpecialNotFile && host_str.is_empty() {
        return Err(ParseError::EmptyHost);
    }
    if !scheme_type.is_special() {
        let host = Host::parse_opaque(host_str)?;
        return Ok((host, input));
    }
    let host = Host::parse(host_str)?;
    Ok((host, input))
}
1030
get_file_host(input: Input<'_>) -> ParseResult<(Host<String>, Input<'_>)>1031 fn get_file_host(input: Input<'_>) -> ParseResult<(Host<String>, Input<'_>)> {
1032 let (_, host_str, remaining) = Parser::file_host(input)?;
1033 let host = match Host::parse(&host_str)? {
1034 Host::Domain(ref d) if d == "localhost" => Host::Domain("".to_string()),
1035 host => host,
1036 };
1037 Ok((host, remaining))
1038 }
1039
parse_file_host<'i>( &mut self, input: Input<'i>, ) -> ParseResult<(bool, HostInternal, Input<'i>)>1040 fn parse_file_host<'i>(
1041 &mut self,
1042 input: Input<'i>,
1043 ) -> ParseResult<(bool, HostInternal, Input<'i>)> {
1044 let has_host;
1045 let (_, host_str, remaining) = Parser::file_host(input)?;
1046 let host = if host_str.is_empty() {
1047 has_host = false;
1048 HostInternal::None
1049 } else {
1050 match Host::parse(&host_str)? {
1051 Host::Domain(ref d) if d == "localhost" => {
1052 has_host = false;
1053 HostInternal::None
1054 }
1055 host => {
1056 write!(&mut self.serialization, "{}", host).unwrap();
1057 has_host = true;
1058 host.into()
1059 }
1060 }
1061 };
1062 Ok((has_host, host, remaining))
1063 }
1064
file_host(input: Input) -> ParseResult<(bool, String, Input)>1065 pub fn file_host(input: Input) -> ParseResult<(bool, String, Input)> {
1066 // Undo the Input abstraction here to avoid allocating in the common case
1067 // where the host part of the input does not contain any tab or newline
1068 let input_str = input.chars.as_str();
1069 let mut has_ignored_chars = false;
1070 let mut non_ignored_chars = 0;
1071 let mut bytes = 0;
1072 for c in input_str.chars() {
1073 match c {
1074 '/' | '\\' | '?' | '#' => break,
1075 '\t' | '\n' | '\r' => has_ignored_chars = true,
1076 _ => non_ignored_chars += 1,
1077 }
1078 bytes += c.len_utf8();
1079 }
1080 let replaced: String;
1081 let host_str;
1082 let mut remaining = input.clone();
1083 {
1084 let host_input = remaining.by_ref().take(non_ignored_chars);
1085 if has_ignored_chars {
1086 replaced = host_input.collect();
1087 host_str = &*replaced
1088 } else {
1089 for _ in host_input {}
1090 host_str = &input_str[..bytes]
1091 }
1092 }
1093 if is_windows_drive_letter(host_str) {
1094 return Ok((false, "".to_string(), input));
1095 }
1096 Ok((true, host_str.to_string(), remaining))
1097 }
1098
parse_port<P>( mut input: Input<'_>, default_port: P, context: Context, ) -> ParseResult<(Option<u16>, Input<'_>)> where P: Fn() -> Option<u16>,1099 pub fn parse_port<P>(
1100 mut input: Input<'_>,
1101 default_port: P,
1102 context: Context,
1103 ) -> ParseResult<(Option<u16>, Input<'_>)>
1104 where
1105 P: Fn() -> Option<u16>,
1106 {
1107 let mut port: u32 = 0;
1108 let mut has_any_digit = false;
1109 while let (Some(c), remaining) = input.split_first() {
1110 if let Some(digit) = c.to_digit(10) {
1111 port = port * 10 + digit;
1112 if port > u16::MAX as u32 {
1113 return Err(ParseError::InvalidPort);
1114 }
1115 has_any_digit = true;
1116 } else if context == Context::UrlParser && !matches!(c, '/' | '\\' | '?' | '#') {
1117 return Err(ParseError::InvalidPort);
1118 } else {
1119 break;
1120 }
1121 input = remaining;
1122 }
1123 let mut opt_port = Some(port as u16);
1124 if !has_any_digit || opt_port == default_port() {
1125 opt_port = None;
1126 }
1127 Ok((opt_port, input))
1128 }
1129
parse_path_start<'i>( &mut self, scheme_type: SchemeType, has_host: &mut bool, input: Input<'i>, ) -> Input<'i>1130 pub fn parse_path_start<'i>(
1131 &mut self,
1132 scheme_type: SchemeType,
1133 has_host: &mut bool,
1134 input: Input<'i>,
1135 ) -> Input<'i> {
1136 let path_start = self.serialization.len();
1137 let (maybe_c, remaining) = input.split_first();
1138 // If url is special, then:
1139 if scheme_type.is_special() {
1140 if maybe_c == Some('\\') {
1141 // If c is U+005C (\), validation error.
1142 self.log_violation(SyntaxViolation::Backslash);
1143 }
1144 // A special URL always has a non-empty path.
1145 if !self.serialization.ends_with('/') {
1146 self.serialization.push('/');
1147 // We have already made sure the forward slash is present.
1148 if maybe_c == Some('/') || maybe_c == Some('\\') {
1149 return self.parse_path(scheme_type, has_host, path_start, remaining);
1150 }
1151 }
1152 return self.parse_path(scheme_type, has_host, path_start, input);
1153 } else if maybe_c == Some('?') || maybe_c == Some('#') {
1154 // Otherwise, if state override is not given and c is U+003F (?),
1155 // set url’s query to the empty string and state to query state.
1156 // Otherwise, if state override is not given and c is U+0023 (#),
1157 // set url’s fragment to the empty string and state to fragment state.
1158 // The query and path states will be handled by the caller.
1159 return input;
1160 }
1161
1162 if maybe_c.is_some() && maybe_c != Some('/') {
1163 self.serialization.push('/');
1164 }
1165 // Otherwise, if c is not the EOF code point:
1166 self.parse_path(scheme_type, has_host, path_start, input)
1167 }
1168
/// Parse path segments from `input`, appending the normalized, percent-encoded
/// path to `self.serialization`.
///
/// `path_start` is the byte offset in `self.serialization` where the path
/// begins. `has_host` is set to `false` if a `file` URL's first segment turns
/// out to be a Windows drive letter while a host was present.
///
/// Parsing stops at the end of input or, in `Context::UrlParser`, just before
/// a `?` or `#`. The returned `Input` is positioned at that point.
pub fn parse_path<'i>(
    &mut self,
    scheme_type: SchemeType,
    has_host: &mut bool,
    path_start: usize,
    mut input: Input<'i>,
) -> Input<'i> {
    // Relative path state
    // Each iteration of the outer loop handles one path segment.
    loop {
        let mut segment_start = self.serialization.len();
        let mut ends_with_slash = false;
        // Consume characters until the segment ends (separator, `?`/`#`, or end of input).
        loop {
            // Saved so the input can be rewound to just before `?` / `#`.
            let input_before_c = input.clone();
            let (c, utf8_c) = if let Some(x) = input.next_utf8() {
                x
            } else {
                break;
            };
            match c {
                // In `PathSegmentSetter` context `/` is not a separator; it falls
                // through to the percent-encoding arm below.
                '/' if self.context != Context::PathSegmentSetter => {
                    self.serialization.push(c);
                    ends_with_slash = true;
                    break;
                }
                // For special schemes a backslash acts as a separator and is
                // serialized as `/`.
                '\\' if self.context != Context::PathSegmentSetter
                    && scheme_type.is_special() =>
                {
                    self.log_violation(SyntaxViolation::Backslash);
                    self.serialization.push('/');
                    ends_with_slash = true;
                    break;
                }
                // Leave the query / fragment delimiter in the input for the caller.
                '?' | '#' if self.context == Context::UrlParser => {
                    input = input_before_c;
                    break;
                }
                _ => {
                    self.check_url_code_point(c, &input);
                    // For `file` URLs: when everything after the first path byte
                    // is exactly a normalized drive letter, add a `/` after it
                    // before appending more characters, and start the segment
                    // after that slash.
                    if scheme_type.is_file()
                        && self.serialization.len() > path_start
                        && is_normalized_windows_drive_letter(
                            &self.serialization[path_start + 1..],
                        )
                    {
                        self.serialization.push('/');
                        segment_start += 1;
                    }
                    // The percent-encode set depends on the context: the segment
                    // setter also escapes `/` and `%` (plus `\` for special schemes).
                    if self.context == Context::PathSegmentSetter {
                        if scheme_type.is_special() {
                            self.serialization
                                .extend(utf8_percent_encode(utf8_c, SPECIAL_PATH_SEGMENT));
                        } else {
                            self.serialization
                                .extend(utf8_percent_encode(utf8_c, PATH_SEGMENT));
                        }
                    } else {
                        self.serialization.extend(utf8_percent_encode(utf8_c, PATH));
                    }
                }
            }
        }
        // The segment just written, without its trailing `/` if it has one.
        let segment_before_slash = if ends_with_slash {
            &self.serialization[segment_start..self.serialization.len() - 1]
        } else {
            &self.serialization[segment_start..self.serialization.len()]
        };
        match segment_before_slash {
            // If buffer is a double-dot path segment, shorten url’s path,
            ".." | "%2e%2e" | "%2e%2E" | "%2E%2e" | "%2E%2E" | "%2e." | "%2E." | ".%2e"
            | ".%2E" => {
                debug_assert!(self.serialization.as_bytes()[segment_start - 1] == b'/');
                // Drop the dot segment itself, then the slash before it when
                // `last_slash_can_be_removed` allows, then the previous segment.
                self.serialization.truncate(segment_start);
                if self.serialization.ends_with('/')
                    && Parser::last_slash_can_be_removed(&self.serialization, path_start)
                {
                    self.serialization.pop();
                }
                self.shorten_path(scheme_type, path_start);

                // and then if neither c is U+002F (/), nor url is special and c is U+005C (\), append the empty string to url’s path.
                if ends_with_slash && !self.serialization.ends_with('/') {
                    self.serialization.push('/');
                }
            }
            // Otherwise, if buffer is a single-dot path segment and if neither c is U+002F (/),
            // nor url is special and c is U+005C (\), append the empty string to url’s path.
            "." | "%2e" | "%2E" => {
                // Remove the dot segment and make sure the path still ends with `/`.
                self.serialization.truncate(segment_start);
                if !self.serialization.ends_with('/') {
                    self.serialization.push('/');
                }
            }
            _ => {
                // If url’s scheme is "file", url’s path is empty, and buffer is a Windows drive letter, then
                if scheme_type.is_file()
                    && segment_start == path_start + 1
                    && is_windows_drive_letter(segment_before_slash)
                {
                    // Replace the second code point in buffer with U+003A (:).
                    // The segment is rewritten as `<letter>:` and the trailing
                    // slash, if there was one, is restored.
                    if let Some(c) = segment_before_slash.chars().next() {
                        self.serialization.truncate(segment_start);
                        self.serialization.push(c);
                        self.serialization.push(':');
                        if ends_with_slash {
                            self.serialization.push('/');
                        }
                    }
                    // If url’s host is neither the empty string nor null,
                    // validation error, set url’s host to the empty string.
                    if *has_host {
                        self.log_violation(SyntaxViolation::FileWithHostAndWindowsDrive);
                        *has_host = false; // FIXME account for this in callers
                    }
                }
            }
        }
        // A segment without a trailing separator is the last one.
        if !ends_with_slash {
            break;
        }
    }
    if scheme_type.is_file() {
        // while url’s path’s size is greater than 1
        // and url’s path[0] is the empty string,
        // validation error, remove the first item from url’s path.
        //FIXME: log violation
        // Implemented by re-writing the path with all leading slashes
        // collapsed into a single `/`.
        let path = self.serialization.split_off(path_start);
        self.serialization.push('/');
        self.serialization.push_str(path.trim_start_matches('/'));
    }

    input
}
1301
last_slash_can_be_removed(serialization: &str, path_start: usize) -> bool1302 fn last_slash_can_be_removed(serialization: &str, path_start: usize) -> bool {
1303 let url_before_segment = &serialization[..serialization.len() - 1];
1304 if let Some(segment_before_start) = url_before_segment.rfind('/') {
1305 // Do not remove the root slash
1306 segment_before_start >= path_start
1307 // Or a windows drive letter slash
1308 && !path_starts_with_windows_drive_letter(&serialization[segment_before_start..])
1309 } else {
1310 false
1311 }
1312 }
1313
1314 /// https://url.spec.whatwg.org/#shorten-a-urls-path
shorten_path(&mut self, scheme_type: SchemeType, path_start: usize)1315 fn shorten_path(&mut self, scheme_type: SchemeType, path_start: usize) {
1316 // If path is empty, then return.
1317 if self.serialization.len() == path_start {
1318 return;
1319 }
1320 // If url’s scheme is "file", path’s size is 1, and path[0] is a normalized Windows drive letter, then return.
1321 if scheme_type.is_file()
1322 && is_normalized_windows_drive_letter(&self.serialization[path_start..])
1323 {
1324 return;
1325 }
1326 // Remove path’s last item.
1327 self.pop_path(scheme_type, path_start);
1328 }
1329
1330 /// https://url.spec.whatwg.org/#pop-a-urls-path
pop_path(&mut self, scheme_type: SchemeType, path_start: usize)1331 fn pop_path(&mut self, scheme_type: SchemeType, path_start: usize) {
1332 if self.serialization.len() > path_start {
1333 let slash_position = self.serialization[path_start..].rfind('/').unwrap();
1334 // + 1 since rfind returns the position before the slash.
1335 let segment_start = path_start + slash_position + 1;
1336 // Don’t pop a Windows drive letter
1337 if !(scheme_type.is_file()
1338 && is_normalized_windows_drive_letter(&self.serialization[segment_start..]))
1339 {
1340 self.serialization.truncate(segment_start);
1341 }
1342 }
1343 }
1344
parse_cannot_be_a_base_path<'i>(&mut self, mut input: Input<'i>) -> Input<'i>1345 pub fn parse_cannot_be_a_base_path<'i>(&mut self, mut input: Input<'i>) -> Input<'i> {
1346 loop {
1347 let input_before_c = input.clone();
1348 match input.next_utf8() {
1349 Some(('?', _)) | Some(('#', _)) if self.context == Context::UrlParser => {
1350 return input_before_c
1351 }
1352 Some((c, utf8_c)) => {
1353 self.check_url_code_point(c, &input);
1354 self.serialization
1355 .extend(utf8_percent_encode(utf8_c, CONTROLS));
1356 }
1357 None => return input,
1358 }
1359 }
1360 }
1361
1362 #[allow(clippy::too_many_arguments)]
with_query_and_fragment( mut self, scheme_type: SchemeType, scheme_end: u32, username_end: u32, host_start: u32, host_end: u32, host: HostInternal, port: Option<u16>, mut path_start: u32, remaining: Input<'_>, ) -> ParseResult<Url>1363 fn with_query_and_fragment(
1364 mut self,
1365 scheme_type: SchemeType,
1366 scheme_end: u32,
1367 username_end: u32,
1368 host_start: u32,
1369 host_end: u32,
1370 host: HostInternal,
1371 port: Option<u16>,
1372 mut path_start: u32,
1373 remaining: Input<'_>,
1374 ) -> ParseResult<Url> {
1375 // Special case for anarchist URL's with a leading empty path segment
1376 // This prevents web+demo:/.//not-a-host/ or web+demo:/path/..//not-a-host/,
1377 // when parsed and then serialized, from ending up as web+demo://not-a-host/
1378 // (they end up as web+demo:/.//not-a-host/).
1379 //
1380 // If url’s host is null, url does not have an opaque path,
1381 // url’s path’s size is greater than 1, and url’s path[0] is the empty string,
1382 // then append U+002F (/) followed by U+002E (.) to output.
1383 let scheme_end_as_usize = scheme_end as usize;
1384 let path_start_as_usize = path_start as usize;
1385 if path_start_as_usize == scheme_end_as_usize + 1 {
1386 // Anarchist URL
1387 if self.serialization[path_start_as_usize..].starts_with("//") {
1388 // Case 1: The base URL did not have an empty path segment, but the resulting one does
1389 // Insert the "/." prefix
1390 self.serialization.insert_str(path_start_as_usize, "/.");
1391 path_start += 2;
1392 }
1393 assert!(!self.serialization[scheme_end_as_usize..].starts_with("://"));
1394 } else if path_start_as_usize == scheme_end_as_usize + 3
1395 && &self.serialization[scheme_end_as_usize..path_start_as_usize] == ":/."
1396 {
1397 // Anarchist URL with leading empty path segment
1398 // The base URL has a "/." between the host and the path
1399 assert_eq!(self.serialization.as_bytes()[path_start_as_usize], b'/');
1400 if self
1401 .serialization
1402 .as_bytes()
1403 .get(path_start_as_usize + 1)
1404 .copied()
1405 != Some(b'/')
1406 {
1407 // Case 2: The base URL had an empty path segment, but the resulting one does not
1408 // Remove the "/." prefix
1409 self.serialization
1410 .replace_range(scheme_end_as_usize..path_start_as_usize, ":");
1411 path_start -= 2;
1412 }
1413 assert!(!self.serialization[scheme_end_as_usize..].starts_with("://"));
1414 }
1415
1416 let (query_start, fragment_start) =
1417 self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?;
1418 Ok(Url {
1419 serialization: self.serialization,
1420 scheme_end,
1421 username_end,
1422 host_start,
1423 host_end,
1424 host,
1425 port,
1426 path_start,
1427 query_start,
1428 fragment_start,
1429 })
1430 }
1431
1432 /// Return (query_start, fragment_start)
parse_query_and_fragment( &mut self, scheme_type: SchemeType, scheme_end: u32, mut input: Input<'_>, ) -> ParseResult<(Option<u32>, Option<u32>)>1433 fn parse_query_and_fragment(
1434 &mut self,
1435 scheme_type: SchemeType,
1436 scheme_end: u32,
1437 mut input: Input<'_>,
1438 ) -> ParseResult<(Option<u32>, Option<u32>)> {
1439 let mut query_start = None;
1440 match input.next() {
1441 Some('#') => {}
1442 Some('?') => {
1443 query_start = Some(to_u32(self.serialization.len())?);
1444 self.serialization.push('?');
1445 let remaining = self.parse_query(scheme_type, scheme_end, input);
1446 if let Some(remaining) = remaining {
1447 input = remaining
1448 } else {
1449 return Ok((query_start, None));
1450 }
1451 }
1452 None => return Ok((None, None)),
1453 _ => panic!("Programming error. parse_query_and_fragment() called without ? or #"),
1454 }
1455
1456 let fragment_start = to_u32(self.serialization.len())?;
1457 self.serialization.push('#');
1458 self.parse_fragment(input);
1459 Ok((query_start, Some(fragment_start)))
1460 }
1461
parse_query<'i>( &mut self, scheme_type: SchemeType, scheme_end: u32, mut input: Input<'i>, ) -> Option<Input<'i>>1462 pub fn parse_query<'i>(
1463 &mut self,
1464 scheme_type: SchemeType,
1465 scheme_end: u32,
1466 mut input: Input<'i>,
1467 ) -> Option<Input<'i>> {
1468 let len = input.chars.as_str().len();
1469 let mut query = String::with_capacity(len); // FIXME: use a streaming decoder instead
1470 let mut remaining = None;
1471 while let Some(c) = input.next() {
1472 if c == '#' && self.context == Context::UrlParser {
1473 remaining = Some(input);
1474 break;
1475 } else {
1476 self.check_url_code_point(c, &input);
1477 query.push(c);
1478 }
1479 }
1480
1481 let encoding = match &self.serialization[..scheme_end as usize] {
1482 "http" | "https" | "file" | "ftp" => self.query_encoding_override,
1483 _ => None,
1484 };
1485 let query_bytes = if let Some(o) = encoding {
1486 o(&query)
1487 } else {
1488 query.as_bytes().into()
1489 };
1490 let set = if scheme_type.is_special() {
1491 SPECIAL_QUERY
1492 } else {
1493 QUERY
1494 };
1495 self.serialization.extend(percent_encode(&query_bytes, set));
1496 remaining
1497 }
1498
fragment_only(mut self, base_url: &Url, mut input: Input<'_>) -> ParseResult<Url>1499 fn fragment_only(mut self, base_url: &Url, mut input: Input<'_>) -> ParseResult<Url> {
1500 let before_fragment = match base_url.fragment_start {
1501 Some(i) => base_url.slice(..i),
1502 None => &*base_url.serialization,
1503 };
1504 debug_assert!(self.serialization.is_empty());
1505 self.serialization
1506 .reserve(before_fragment.len() + input.chars.as_str().len());
1507 self.serialization.push_str(before_fragment);
1508 self.serialization.push('#');
1509 let next = input.next();
1510 debug_assert!(next == Some('#'));
1511 self.parse_fragment(input);
1512 Ok(Url {
1513 serialization: self.serialization,
1514 fragment_start: Some(to_u32(before_fragment.len())?),
1515 ..*base_url
1516 })
1517 }
1518
parse_fragment(&mut self, mut input: Input<'_>)1519 pub fn parse_fragment(&mut self, mut input: Input<'_>) {
1520 while let Some((c, utf8_c)) = input.next_utf8() {
1521 if c == '\0' {
1522 self.log_violation(SyntaxViolation::NullInFragment)
1523 } else {
1524 self.check_url_code_point(c, &input);
1525 }
1526 self.serialization
1527 .extend(utf8_percent_encode(utf8_c, FRAGMENT));
1528 }
1529 }
1530
check_url_code_point(&self, c: char, input: &Input<'_>)1531 fn check_url_code_point(&self, c: char, input: &Input<'_>) {
1532 if let Some(vfn) = self.violation_fn {
1533 if c == '%' {
1534 let mut input = input.clone();
1535 if !matches!((input.next(), input.next()), (Some(a), Some(b))
1536 if a.is_ascii_hexdigit() && b.is_ascii_hexdigit())
1537 {
1538 vfn(SyntaxViolation::PercentDecode)
1539 }
1540 } else if !is_url_code_point(c) {
1541 vfn(SyntaxViolation::NonUrlCodePoint)
1542 }
1543 }
1544 }
1545 }
1546
/// Whether `c` is a URL code point: an ASCII alphanumeric, one of
/// `! $ & ' ( ) * + , - . / : ; = ? @ _ ~`, or a code point in
/// U+00A0..=U+10FFFD that is not a noncharacter.
///
/// https://url.spec.whatwg.org/#url-code-points
//
// Non URL code points:
// U+0000 to U+0020 (space)
// " # % < > [ \ ] ^ ` { | }
// U+007F to U+009F
// surrogates
// U+FDD0 to U+FDEF
// Last two of each plane: U+__FFFE to U+__FFFF for __ in 00 to 10 hex
#[inline]
fn is_url_code_point(c: char) -> bool {
    match c {
        'a'..='z' | 'A'..='Z' | '0'..='9' => true,
        '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | '-' | '.' | '/' | ':' | ';'
        | '=' | '?' | '@' | '_' | '~' => true,
        // Surrogates cannot occur in a `char`, so only the noncharacters
        // have to be carved out of this range. This includes
        // U+E0000..=U+E0FFF, which an explicit per-plane table starting at
        // U+E1000 used to leave out.
        '\u{A0}'..='\u{10FFFD}' => {
            let code = c as u32;
            let in_fdd0_block = (0xFDD0..=0xFDEF).contains(&code);
            // U+__FFFE and U+__FFFF: the last two code points of each plane.
            let is_plane_final = (code & 0xFFFE) == 0xFFFE;
            !(in_fdd0_block || is_plane_final)
        }
        _ => false,
    }
}
1572
/// Whether `ch` is a C0 control or space, i.e. in U+0000..=U+0020.
///
/// https://url.spec.whatwg.org/#c0-controls-and-space
#[inline]
fn c0_control_or_space(ch: char) -> bool {
    matches!(ch, '\u{0}'..='\u{20}')
}
1578
/// Whether `ch` is an ASCII tab (U+0009), line feed (U+000A) or carriage
/// return (U+000D).
///
/// https://infra.spec.whatwg.org/#ascii-tab-or-newline
#[inline]
fn ascii_tab_or_new_line(ch: char) -> bool {
    ch == '\t' || ch == '\n' || ch == '\r'
}
1584
/// Whether `ch` is an ASCII letter (`A`-`Z` or `a`-`z`).
///
/// https://url.spec.whatwg.org/#ascii-alpha
#[inline]
pub fn ascii_alpha(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z')
}
1590
1591 #[inline]
to_u32(i: usize) -> ParseResult<u32>1592 pub fn to_u32(i: usize) -> ParseResult<u32> {
1593 if i <= u32::MAX as usize {
1594 Ok(i as u32)
1595 } else {
1596 Err(ParseError::Overflow)
1597 }
1598 }
1599
is_normalized_windows_drive_letter(segment: &str) -> bool1600 fn is_normalized_windows_drive_letter(segment: &str) -> bool {
1601 is_windows_drive_letter(segment) && segment.as_bytes()[1] == b':'
1602 }
1603
1604 /// Whether the scheme is file:, the path has a single segment, and that segment
1605 /// is a Windows drive letter
1606 #[inline]
is_windows_drive_letter(segment: &str) -> bool1607 pub fn is_windows_drive_letter(segment: &str) -> bool {
1608 segment.len() == 2 && starts_with_windows_drive_letter(segment)
1609 }
1610
1611 /// Whether path starts with a root slash
1612 /// and a windows drive letter eg: "/c:" or "/a:/"
path_starts_with_windows_drive_letter(s: &str) -> bool1613 fn path_starts_with_windows_drive_letter(s: &str) -> bool {
1614 if let Some(c) = s.as_bytes().first() {
1615 matches!(c, b'/' | b'\\' | b'?' | b'#') && starts_with_windows_drive_letter(&s[1..])
1616 } else {
1617 false
1618 }
1619 }
1620
starts_with_windows_drive_letter(s: &str) -> bool1621 fn starts_with_windows_drive_letter(s: &str) -> bool {
1622 s.len() >= 2
1623 && ascii_alpha(s.as_bytes()[0] as char)
1624 && matches!(s.as_bytes()[1], b':' | b'|')
1625 && (s.len() == 2 || matches!(s.as_bytes()[2], b'/' | b'\\' | b'?' | b'#'))
1626 }
1627
1628 /// https://url.spec.whatwg.org/#start-with-a-windows-drive-letter
starts_with_windows_drive_letter_segment(input: &Input<'_>) -> bool1629 fn starts_with_windows_drive_letter_segment(input: &Input<'_>) -> bool {
1630 let mut input = input.clone();
1631 match (input.next(), input.next(), input.next()) {
1632 // its first two code points are a Windows drive letter
1633 // its third code point is U+002F (/), U+005C (\), U+003F (?), or U+0023 (#).
1634 (Some(a), Some(b), Some(c))
1635 if ascii_alpha(a) && matches!(b, ':' | '|') && matches!(c, '/' | '\\' | '?' | '#') =>
1636 {
1637 true
1638 }
1639 // its first two code points are a Windows drive letter
1640 // its length is 2
1641 (Some(a), Some(b), None) if ascii_alpha(a) && matches!(b, ':' | '|') => true,
1642 _ => false,
1643 }
1644 }
1645