1 // Copyright 2013-2016 The rust-url developers.
2 //
3 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6 // option. This file may not be copied, modified, or distributed
7 // except according to those terms.
8
9 use std::error::Error;
10 use std::fmt::{self, Formatter, Write};
11 use std::str;
12
13 use crate::host::{Host, HostInternal};
14 use crate::Url;
15 use form_urlencoded::EncodingOverride;
16 use percent_encoding::{percent_encode, utf8_percent_encode, AsciiSet, CONTROLS};
17
/// Fragment percent-encode set: `CONTROLS` plus space, `"`, `<`, `>` and `` ` ``.
///
/// https://url.spec.whatwg.org/#fragment-percent-encode-set
const FRAGMENT: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'<').add(b'>').add(b'`');

/// Path percent-encode set: `FRAGMENT` plus `#`, `?`, `{` and `}`.
///
/// https://url.spec.whatwg.org/#path-percent-encode-set
const PATH: &AsciiSet = &FRAGMENT.add(b'#').add(b'?').add(b'{').add(b'}');

/// Userinfo percent-encode set: `PATH` plus `/`, `:`, `;`, `=`, `@`, `[`, `\`,
/// `]`, `^` and `|`.
///
/// https://url.spec.whatwg.org/#userinfo-percent-encode-set
pub(crate) const USERINFO: &AsciiSet = &PATH
    .add(b'/')
    .add(b':')
    .add(b';')
    .add(b'=')
    .add(b'@')
    .add(b'[')
    .add(b'\\')
    .add(b']')
    .add(b'^')
    .add(b'|');

/// `PATH` plus `/` and `%`, so that a single path segment cannot introduce a
/// separator or an escape sequence of its own.
pub(crate) const PATH_SEGMENT: &AsciiSet = &PATH.add(b'/').add(b'%');

/// `PATH_SEGMENT` plus `\`.
///
/// The backslash (\) character is treated as a path separator in special URLs
/// so it needs to be additionally escaped in that case.
pub(crate) const SPECIAL_PATH_SEGMENT: &AsciiSet = &PATH_SEGMENT.add(b'\\');

/// Query set: `CONTROLS` plus space, `"`, `#`, `<` and `>`.
///
/// https://url.spec.whatwg.org/#query-state
const QUERY: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'#').add(b'<').add(b'>');
/// `QUERY` plus `'`.
/// NOTE(review): by its name this is the set for special-scheme URLs — confirm at the use site.
const SPECIAL_QUERY: &AsciiSet = &QUERY.add(b'\'');
46
/// Result alias used by the parser: the error type is always [`ParseError`].
pub type ParseResult<T> = Result<T, ParseError>;
48
/// Generates the `ParseError` enum together with its `Display` impl from a
/// list of `Variant => "message",` pairs, so that every variant is guaranteed
/// to have exactly one human-readable message.
macro_rules! simple_enum_error {
    ($($name: ident => $description: expr,)+) => {
        /// Errors that can occur during parsing.
        ///
        /// The enum is marked `#[non_exhaustive]`: variants may be added in
        /// the future, so matches outside this crate need a wildcard arm.
        #[derive(PartialEq, Eq, Clone, Copy, Debug)]
        #[non_exhaustive]
        pub enum ParseError {
            $(
                $name,
            )+
        }

        impl fmt::Display for ParseError {
            /// Writes the fixed message associated with the variant.
            fn fmt(&self, fmt: &mut Formatter<'_>) -> fmt::Result {
                match *self {
                    $(
                        ParseError::$name => fmt.write_str($description),
                    )+
                }
            }
        }
    }
}

impl Error for ParseError {}

simple_enum_error! {
    EmptyHost => "empty host",
    IdnaError => "invalid international domain name",
    InvalidPort => "invalid port number",
    InvalidIpv4Address => "invalid IPv4 address",
    InvalidIpv6Address => "invalid IPv6 address",
    InvalidDomainCharacter => "invalid domain character",
    RelativeUrlWithoutBase => "relative URL without a base",
    RelativeUrlWithCannotBeABaseBase => "relative URL with a cannot-be-a-base base",
    // The apostrophe is U+2019 (right single quotation mark); it had been
    // corrupted into a three-character mojibake sequence.
    SetHostOnCannotBeABaseUrl => "a cannot-be-a-base URL doesn’t have a host to set",
    Overflow => "URLs more than 4 GB are not supported",
}
89
90 impl From<::idna::Errors> for ParseError {
from(_: ::idna::Errors) -> ParseError91 fn from(_: ::idna::Errors) -> ParseError {
92 ParseError::IdnaError
93 }
94 }
95
/// Generates the `SyntaxViolation` enum and its `description()` accessor from
/// a list of `Variant => "text",` pairs.
macro_rules! syntax_violation_enum {
    ($($variant: ident => $text: expr,)+) => {
        /// Non-fatal syntax violations that can occur during parsing.
        ///
        /// This may be extended in the future so exhaustive matching is
        /// discouraged with an unused variant.
        #[derive(PartialEq, Eq, Clone, Copy, Debug)]
        #[non_exhaustive]
        pub enum SyntaxViolation {
            $($variant,)+
        }

        impl SyntaxViolation {
            /// Returns the fixed description text of this violation.
            pub fn description(&self) -> &'static str {
                match self {
                    $(Self::$variant => $text,)+
                }
            }
        }
    }
}

syntax_violation_enum! {
    Backslash => "backslash",
    C0SpaceIgnored =>
        "leading or trailing control or space character are ignored in URLs",
    EmbeddedCredentials =>
        "embedding authentication information (username or password) \
         in an URL is not recommended",
    ExpectedDoubleSlash => "expected //",
    ExpectedFileDoubleSlash => "expected // after file:",
    FileWithHostAndWindowsDrive => "file: with host and Windows drive letter",
    NonUrlCodePoint => "non-URL code point",
    NullInFragment => "NULL characters are ignored in URL fragment identifiers",
    PercentDecode => "expected 2 hex digits after %",
    TabOrNewlineIgnored => "tabs or newlines are ignored in URLs",
    UnencodedAtSign => "unencoded @ sign in username or password",
}

impl fmt::Display for SyntaxViolation {
    /// Formats the description like a `str`, i.e. honouring any width,
    /// alignment and precision given in the format spec.
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        f.pad(self.description())
    }
}
144
/// Coarse classification of a URL scheme, as needed by the parser.
#[derive(Copy, Clone, PartialEq, Eq)]
pub enum SchemeType {
    /// The `file` scheme.
    File,
    /// `http`, `https`, `ws`, `wss` or `ftp`.
    SpecialNotFile,
    /// Any other scheme.
    NotSpecial,
}

impl SchemeType {
    /// True for every variant except `NotSpecial`.
    pub fn is_special(&self) -> bool {
        match self {
            SchemeType::File | SchemeType::SpecialNotFile => true,
            SchemeType::NotSpecial => false,
        }
    }

    /// True only for `File`.
    pub fn is_file(&self) -> bool {
        *self == SchemeType::File
    }

    /// Classifies a scheme name. The comparison is exact (case-sensitive).
    pub fn from(s: &str) -> Self {
        if s == "file" {
            return SchemeType::File;
        }
        if matches!(s, "http" | "https" | "ws" | "wss" | "ftp") {
            SchemeType::SpecialNotFile
        } else {
            SchemeType::NotSpecial
        }
    }
}
169
/// Returns the default port of `scheme`, or `None` when the scheme has no
/// default port known to this function. The match is exact (case-sensitive).
pub fn default_port(scheme: &str) -> Option<u16> {
    let port = match scheme {
        "http" | "ws" => 80,
        "https" | "wss" => 443,
        "ftp" => 21,
        _ => return None,
    };
    Some(port)
}
178
179 #[derive(Clone)]
180 pub struct Input<'i> {
181 chars: str::Chars<'i>,
182 }
183
184 impl<'i> Input<'i> {
new(input: &'i str) -> Self185 pub fn new(input: &'i str) -> Self {
186 Input::with_log(input, None)
187 }
188
no_trim(input: &'i str) -> Self189 pub fn no_trim(input: &'i str) -> Self {
190 Input {
191 chars: input.chars(),
192 }
193 }
194
trim_tab_and_newlines( original_input: &'i str, vfn: Option<&dyn Fn(SyntaxViolation)>, ) -> Self195 pub fn trim_tab_and_newlines(
196 original_input: &'i str,
197 vfn: Option<&dyn Fn(SyntaxViolation)>,
198 ) -> Self {
199 let input = original_input.trim_matches(ascii_tab_or_new_line);
200 if let Some(vfn) = vfn {
201 if input.len() < original_input.len() {
202 vfn(SyntaxViolation::C0SpaceIgnored)
203 }
204 if input.chars().any(|c| matches!(c, '\t' | '\n' | '\r')) {
205 vfn(SyntaxViolation::TabOrNewlineIgnored)
206 }
207 }
208 Input {
209 chars: input.chars(),
210 }
211 }
212
with_log(original_input: &'i str, vfn: Option<&dyn Fn(SyntaxViolation)>) -> Self213 pub fn with_log(original_input: &'i str, vfn: Option<&dyn Fn(SyntaxViolation)>) -> Self {
214 let input = original_input.trim_matches(c0_control_or_space);
215 if let Some(vfn) = vfn {
216 if input.len() < original_input.len() {
217 vfn(SyntaxViolation::C0SpaceIgnored)
218 }
219 if input.chars().any(|c| matches!(c, '\t' | '\n' | '\r')) {
220 vfn(SyntaxViolation::TabOrNewlineIgnored)
221 }
222 }
223 Input {
224 chars: input.chars(),
225 }
226 }
227
228 #[inline]
is_empty(&self) -> bool229 pub fn is_empty(&self) -> bool {
230 self.clone().next().is_none()
231 }
232
233 #[inline]
starts_with<P: Pattern>(&self, p: P) -> bool234 fn starts_with<P: Pattern>(&self, p: P) -> bool {
235 p.split_prefix(&mut self.clone())
236 }
237
238 #[inline]
split_prefix<P: Pattern>(&self, p: P) -> Option<Self>239 pub fn split_prefix<P: Pattern>(&self, p: P) -> Option<Self> {
240 let mut remaining = self.clone();
241 if p.split_prefix(&mut remaining) {
242 Some(remaining)
243 } else {
244 None
245 }
246 }
247
248 #[inline]
split_first(&self) -> (Option<char>, Self)249 fn split_first(&self) -> (Option<char>, Self) {
250 let mut remaining = self.clone();
251 (remaining.next(), remaining)
252 }
253
254 #[inline]
count_matching<F: Fn(char) -> bool>(&self, f: F) -> (u32, Self)255 fn count_matching<F: Fn(char) -> bool>(&self, f: F) -> (u32, Self) {
256 let mut count = 0;
257 let mut remaining = self.clone();
258 loop {
259 let mut input = remaining.clone();
260 if matches!(input.next(), Some(c) if f(c)) {
261 remaining = input;
262 count += 1;
263 } else {
264 return (count, remaining);
265 }
266 }
267 }
268
269 #[inline]
next_utf8(&mut self) -> Option<(char, &'i str)>270 fn next_utf8(&mut self) -> Option<(char, &'i str)> {
271 loop {
272 let utf8 = self.chars.as_str();
273 match self.chars.next() {
274 Some(c) => {
275 if !matches!(c, '\t' | '\n' | '\r') {
276 return Some((c, &utf8[..c.len_utf8()]));
277 }
278 }
279 None => return None,
280 }
281 }
282 }
283 }
284
285 pub trait Pattern {
split_prefix(self, input: &mut Input) -> bool286 fn split_prefix(self, input: &mut Input) -> bool;
287 }
288
289 impl Pattern for char {
split_prefix(self, input: &mut Input) -> bool290 fn split_prefix(self, input: &mut Input) -> bool {
291 input.next() == Some(self)
292 }
293 }
294
295 impl<'a> Pattern for &'a str {
split_prefix(self, input: &mut Input) -> bool296 fn split_prefix(self, input: &mut Input) -> bool {
297 for c in self.chars() {
298 if input.next() != Some(c) {
299 return false;
300 }
301 }
302 true
303 }
304 }
305
306 impl<F: FnMut(char) -> bool> Pattern for F {
split_prefix(self, input: &mut Input) -> bool307 fn split_prefix(self, input: &mut Input) -> bool {
308 input.next().map_or(false, self)
309 }
310 }
311
312 impl<'i> Iterator for Input<'i> {
313 type Item = char;
next(&mut self) -> Option<char>314 fn next(&mut self) -> Option<char> {
315 self.chars
316 .by_ref()
317 .find(|&c| !matches!(c, '\t' | '\n' | '\r'))
318 }
319 }
320
/// State for one parse: the output buffer plus the options that steer it.
pub struct Parser<'a> {
    /// The URL text built so far. Parse methods append to it and, in a few
    /// places, clear it before rebuilding from a base URL.
    pub serialization: String,
    /// Base URL consulted by `parse_url` when the input has no scheme.
    pub base_url: Option<&'a Url>,
    /// Encoding override for the query string.
    /// NOTE(review): not read by any method visible in this part of the
    /// file — presumably consumed by query parsing; confirm.
    pub query_encoding_override: EncodingOverride<'a>,
    /// Optional callback invoked with each non-fatal `SyntaxViolation`.
    pub violation_fn: Option<&'a dyn Fn(SyntaxViolation)>,
    /// Which kind of caller is driving the parser; for example `parse_scheme`
    /// accepts end of input before `:` only under `Context::Setter`.
    pub context: Context,
}
328
/// Identifies the kind of caller driving a [`Parser`].
#[derive(PartialEq, Eq, Copy, Clone)]
pub enum Context {
    /// NOTE(review): presumably a full URL parse — confirm against callers.
    UrlParser,
    /// Set by `Parser::for_setter`.
    Setter,
    /// NOTE(review): presumably used when setting individual path segments;
    /// not referenced in this part of the file — confirm.
    PathSegmentSetter,
}
335
336 impl<'a> Parser<'a> {
log_violation(&self, v: SyntaxViolation)337 fn log_violation(&self, v: SyntaxViolation) {
338 if let Some(f) = self.violation_fn {
339 f(v)
340 }
341 }
342
log_violation_if(&self, v: SyntaxViolation, test: impl FnOnce() -> bool)343 fn log_violation_if(&self, v: SyntaxViolation, test: impl FnOnce() -> bool) {
344 if let Some(f) = self.violation_fn {
345 if test() {
346 f(v)
347 }
348 }
349 }
350
for_setter(serialization: String) -> Parser<'a>351 pub fn for_setter(serialization: String) -> Parser<'a> {
352 Parser {
353 serialization,
354 base_url: None,
355 query_encoding_override: None,
356 violation_fn: None,
357 context: Context::Setter,
358 }
359 }
360
361 /// https://url.spec.whatwg.org/#concept-basic-url-parser
parse_url(mut self, input: &str) -> ParseResult<Url>362 pub fn parse_url(mut self, input: &str) -> ParseResult<Url> {
363 let input = Input::with_log(input, self.violation_fn);
364 if let Ok(remaining) = self.parse_scheme(input.clone()) {
365 return self.parse_with_scheme(remaining);
366 }
367
368 // No-scheme state
369 if let Some(base_url) = self.base_url {
370 if input.starts_with('#') {
371 self.fragment_only(base_url, input)
372 } else if base_url.cannot_be_a_base() {
373 Err(ParseError::RelativeUrlWithCannotBeABaseBase)
374 } else {
375 let scheme_type = SchemeType::from(base_url.scheme());
376 if scheme_type.is_file() {
377 self.parse_file(input, scheme_type, Some(base_url))
378 } else {
379 self.parse_relative(input, scheme_type, base_url)
380 }
381 }
382 } else {
383 Err(ParseError::RelativeUrlWithoutBase)
384 }
385 }
386
parse_scheme<'i>(&mut self, mut input: Input<'i>) -> Result<Input<'i>, ()>387 pub fn parse_scheme<'i>(&mut self, mut input: Input<'i>) -> Result<Input<'i>, ()> {
388 if input.is_empty() || !input.starts_with(ascii_alpha) {
389 return Err(());
390 }
391 debug_assert!(self.serialization.is_empty());
392 while let Some(c) = input.next() {
393 match c {
394 'a'..='z' | 'A'..='Z' | '0'..='9' | '+' | '-' | '.' => {
395 self.serialization.push(c.to_ascii_lowercase())
396 }
397 ':' => return Ok(input),
398 _ => {
399 self.serialization.clear();
400 return Err(());
401 }
402 }
403 }
404 // EOF before ':'
405 if self.context == Context::Setter {
406 Ok(input)
407 } else {
408 self.serialization.clear();
409 Err(())
410 }
411 }
412
parse_with_scheme(mut self, input: Input<'_>) -> ParseResult<Url>413 fn parse_with_scheme(mut self, input: Input<'_>) -> ParseResult<Url> {
414 use crate::SyntaxViolation::{ExpectedDoubleSlash, ExpectedFileDoubleSlash};
415 let scheme_end = to_u32(self.serialization.len())?;
416 let scheme_type = SchemeType::from(&self.serialization);
417 self.serialization.push(':');
418 match scheme_type {
419 SchemeType::File => {
420 self.log_violation_if(ExpectedFileDoubleSlash, || !input.starts_with("//"));
421 let base_file_url = self.base_url.and_then(|base| {
422 if base.scheme() == "file" {
423 Some(base)
424 } else {
425 None
426 }
427 });
428 self.serialization.clear();
429 self.parse_file(input, scheme_type, base_file_url)
430 }
431 SchemeType::SpecialNotFile => {
432 // special relative or authority state
433 let (slashes_count, remaining) = input.count_matching(|c| matches!(c, '/' | '\\'));
434 if let Some(base_url) = self.base_url {
435 if slashes_count < 2
436 && base_url.scheme() == &self.serialization[..scheme_end as usize]
437 {
438 // "Cannot-be-a-base" URLs only happen with "not special" schemes.
439 debug_assert!(!base_url.cannot_be_a_base());
440 self.serialization.clear();
441 return self.parse_relative(input, scheme_type, base_url);
442 }
443 }
444 // special authority slashes state
445 self.log_violation_if(ExpectedDoubleSlash, || {
446 input
447 .clone()
448 .take_while(|&c| matches!(c, '/' | '\\'))
449 .collect::<String>()
450 != "//"
451 });
452 self.after_double_slash(remaining, scheme_type, scheme_end)
453 }
454 SchemeType::NotSpecial => self.parse_non_special(input, scheme_type, scheme_end),
455 }
456 }
457
458 /// Scheme other than file, http, https, ws, ws, ftp.
parse_non_special( mut self, input: Input<'_>, scheme_type: SchemeType, scheme_end: u32, ) -> ParseResult<Url>459 fn parse_non_special(
460 mut self,
461 input: Input<'_>,
462 scheme_type: SchemeType,
463 scheme_end: u32,
464 ) -> ParseResult<Url> {
465 // path or authority state (
466 if let Some(input) = input.split_prefix("//") {
467 return self.after_double_slash(input, scheme_type, scheme_end);
468 }
469 // Anarchist URL (no authority)
470 let path_start = to_u32(self.serialization.len())?;
471 let username_end = path_start;
472 let host_start = path_start;
473 let host_end = path_start;
474 let host = HostInternal::None;
475 let port = None;
476 let remaining = if let Some(input) = input.split_prefix('/') {
477 let path_start = self.serialization.len();
478 self.serialization.push('/');
479 self.parse_path(scheme_type, &mut false, path_start, input)
480 } else {
481 self.parse_cannot_be_a_base_path(input)
482 };
483 self.with_query_and_fragment(
484 scheme_type,
485 scheme_end,
486 username_end,
487 host_start,
488 host_end,
489 host,
490 port,
491 path_start,
492 remaining,
493 )
494 }
495
/// Parses a `file:` URL (the spec's "file", "file slash" and "file host"
/// states), optionally resolving against `base_file_url`.
///
/// `self.serialization` must be empty on entry; every branch rebuilds the
/// serialization itself, either from a literal `file://` prefix or by
/// copying from the base URL.
fn parse_file(
    mut self,
    input: Input<'_>,
    scheme_type: SchemeType,
    base_file_url: Option<&Url>,
) -> ParseResult<Url> {
    use crate::SyntaxViolation::Backslash;
    // file state
    debug_assert!(self.serialization.is_empty());
    let (first_char, input_after_first_char) = input.split_first();
    if matches!(first_char, Some('/') | Some('\\')) {
        self.log_violation_if(SyntaxViolation::Backslash, || first_char == Some('\\'));
        // file slash state
        let (next_char, input_after_next_char) = input_after_first_char.split_first();
        if matches!(next_char, Some('/') | Some('\\')) {
            self.log_violation_if(Backslash, || next_char == Some('\\'));
            // file host state
            self.serialization.push_str("file://");
            let scheme_end = "file".len() as u32;
            let host_start = "file://".len() as u32;
            // Despite its name, `path_start` is the boolean "has host" flag
            // returned by parse_file_host.
            let (path_start, mut host, remaining) =
                self.parse_file_host(input_after_next_char)?;
            let mut host_end = to_u32(self.serialization.len())?;
            let mut has_host = !matches!(host, HostInternal::None);
            let remaining = if path_start {
                self.parse_path_start(SchemeType::File, &mut has_host, remaining)
            } else {
                let path_start = self.serialization.len();
                self.serialization.push('/');
                self.parse_path(SchemeType::File, &mut has_host, path_start, remaining)
            };

            // For file URLs that have a host and whose path starts
            // with the windows drive letter we just remove the host.
            // (`has_host` is passed by `&mut` to the path parser above,
            // which may reset it.)
            if !has_host {
                self.serialization
                    .drain(host_start as usize..host_end as usize);
                host_end = host_start;
                host = HostInternal::None;
            }
            let (query_start, fragment_start) =
                self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?;
            return Ok(Url {
                serialization: self.serialization,
                scheme_end,
                username_end: host_start,
                host_start,
                host_end,
                host,
                port: None,
                path_start: host_end,
                query_start,
                fragment_start,
            });
        } else {
            // A single leading slash: path-absolute input with no host of
            // its own.
            self.serialization.push_str("file://");
            let scheme_end = "file".len() as u32;
            let host_start = "file://".len();
            let mut host_end = host_start;
            let mut host = HostInternal::None;
            if !starts_with_windows_drive_letter_segment(&input_after_first_char) {
                if let Some(base_url) = base_file_url {
                    // Inherit either the base's drive letter (as the first
                    // path segment) or, failing that, the base's host.
                    let first_segment = base_url.path_segments().unwrap().next().unwrap();
                    if is_normalized_windows_drive_letter(first_segment) {
                        self.serialization.push('/');
                        self.serialization.push_str(first_segment);
                    } else if let Some(host_str) = base_url.host_str() {
                        self.serialization.push_str(host_str);
                        host_end = self.serialization.len();
                        host = base_url.host;
                    }
                }
            }
            // If c is the EOF code point, U+002F (/), U+005C (\), U+003F (?), or U+0023 (#), then decrease pointer by one
            let parse_path_input = if let Some(c) = first_char {
                if c == '/' || c == '\\' || c == '?' || c == '#' {
                    input
                } else {
                    input_after_first_char
                }
            } else {
                input_after_first_char
            };

            let remaining =
                self.parse_path(SchemeType::File, &mut false, host_end, parse_path_input);

            let host_start = host_start as u32;

            let (query_start, fragment_start) =
                self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?;

            let host_end = host_end as u32;
            return Ok(Url {
                serialization: self.serialization,
                scheme_end,
                username_end: host_start,
                host_start,
                host_end,
                host,
                port: None,
                path_start: host_end,
                query_start,
                fragment_start,
            });
        }
    }
    // The input does not start with a slash or backslash.
    if let Some(base_url) = base_file_url {
        match first_char {
            None => {
                // Copy everything except the fragment
                let before_fragment = match base_url.fragment_start {
                    Some(i) => &base_url.serialization[..i as usize],
                    None => &*base_url.serialization,
                };
                self.serialization.push_str(before_fragment);
                Ok(Url {
                    serialization: self.serialization,
                    fragment_start: None,
                    ..*base_url
                })
            }
            Some('?') => {
                // Copy everything up to the query string
                let before_query = match (base_url.query_start, base_url.fragment_start) {
                    (None, None) => &*base_url.serialization,
                    (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
                };
                self.serialization.push_str(before_query);
                let (query_start, fragment_start) =
                    self.parse_query_and_fragment(scheme_type, base_url.scheme_end, input)?;
                Ok(Url {
                    serialization: self.serialization,
                    query_start,
                    fragment_start,
                    ..*base_url
                })
            }
            Some('#') => self.fragment_only(base_url, input),
            _ => {
                if !starts_with_windows_drive_letter_segment(&input) {
                    // Relative path: start from the base URL up to its
                    // query, shorten its path, then parse the input on top.
                    let before_query = match (base_url.query_start, base_url.fragment_start) {
                        (None, None) => &*base_url.serialization,
                        (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
                    };
                    self.serialization.push_str(before_query);
                    self.shorten_path(SchemeType::File, base_url.path_start as usize);
                    let remaining = self.parse_path(
                        SchemeType::File,
                        &mut true,
                        base_url.path_start as usize,
                        input,
                    );
                    self.with_query_and_fragment(
                        SchemeType::File,
                        base_url.scheme_end,
                        base_url.username_end,
                        base_url.host_start,
                        base_url.host_end,
                        base_url.host,
                        base_url.port,
                        base_url.path_start,
                        remaining,
                    )
                } else {
                    // The input starts with a Windows drive letter: the base
                    // is ignored and the URL is built with an empty host.
                    self.serialization.push_str("file:///");
                    let scheme_end = "file".len() as u32;
                    let path_start = "file://".len();
                    let remaining =
                        self.parse_path(SchemeType::File, &mut false, path_start, input);
                    let (query_start, fragment_start) =
                        self.parse_query_and_fragment(SchemeType::File, scheme_end, remaining)?;
                    let path_start = path_start as u32;
                    Ok(Url {
                        serialization: self.serialization,
                        scheme_end,
                        username_end: path_start,
                        host_start: path_start,
                        host_end: path_start,
                        host: HostInternal::None,
                        port: None,
                        path_start,
                        query_start,
                        fragment_start,
                    })
                }
            }
        }
    } else {
        // No file base URL: build the URL with an empty host and parse the
        // whole input as the path.
        self.serialization.push_str("file:///");
        let scheme_end = "file".len() as u32;
        let path_start = "file://".len();
        let remaining = self.parse_path(SchemeType::File, &mut false, path_start, input);
        let (query_start, fragment_start) =
            self.parse_query_and_fragment(SchemeType::File, scheme_end, remaining)?;
        let path_start = path_start as u32;
        Ok(Url {
            serialization: self.serialization,
            scheme_end,
            username_end: path_start,
            host_start: path_start,
            host_end: path_start,
            host: HostInternal::None,
            port: None,
            path_start,
            query_start,
            fragment_start,
        })
    }
}
706
/// Resolves `input` against `base_url` (the spec's "relative" state).
///
/// `self.serialization` must be empty on entry. Depending on the first
/// character, the result reuses the base URL up to its fragment, its query
/// or its path, or only its scheme when the input starts with two or more
/// slashes.
fn parse_relative(
    mut self,
    input: Input<'_>,
    scheme_type: SchemeType,
    base_url: &Url,
) -> ParseResult<Url> {
    // relative state
    debug_assert!(self.serialization.is_empty());
    let (first_char, input_after_first_char) = input.split_first();
    match first_char {
        None => {
            // Copy everything except the fragment
            let before_fragment = match base_url.fragment_start {
                Some(i) => &base_url.serialization[..i as usize],
                None => &*base_url.serialization,
            };
            self.serialization.push_str(before_fragment);
            Ok(Url {
                serialization: self.serialization,
                fragment_start: None,
                ..*base_url
            })
        }
        Some('?') => {
            // Copy everything up to the query string
            let before_query = match (base_url.query_start, base_url.fragment_start) {
                (None, None) => &*base_url.serialization,
                (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
            };
            self.serialization.push_str(before_query);
            let (query_start, fragment_start) =
                self.parse_query_and_fragment(scheme_type, base_url.scheme_end, input)?;
            Ok(Url {
                serialization: self.serialization,
                query_start,
                fragment_start,
                ..*base_url
            })
        }
        Some('#') => self.fragment_only(base_url, input),
        Some('/') | Some('\\') => {
            let (slashes_count, remaining) = input.count_matching(|c| matches!(c, '/' | '\\'));
            if slashes_count >= 2 {
                // Scheme-relative input: keep only the base's "scheme:" and
                // parse a fresh authority.
                self.log_violation_if(SyntaxViolation::ExpectedDoubleSlash, || {
                    input
                        .clone()
                        .take_while(|&c| matches!(c, '/' | '\\'))
                        .collect::<String>()
                        != "//"
                });
                let scheme_end = base_url.scheme_end;
                debug_assert!(base_url.byte_at(scheme_end) == b':');
                self.serialization
                    .push_str(base_url.slice(..scheme_end + 1));
                // With an exact "//" prefix only those two characters are
                // skipped; otherwise the whole run of slashes/backslashes is.
                if let Some(after_prefix) = input.split_prefix("//") {
                    return self.after_double_slash(after_prefix, scheme_type, scheme_end);
                }
                return self.after_double_slash(remaining, scheme_type, scheme_end);
            }
            // Path-absolute input: keep the base up to its path and parse
            // the input as a new path.
            let path_start = base_url.path_start;
            self.serialization.push_str(base_url.slice(..path_start));
            self.serialization.push('/');
            let remaining = self.parse_path(
                scheme_type,
                &mut true,
                path_start as usize,
                input_after_first_char,
            );
            self.with_query_and_fragment(
                scheme_type,
                base_url.scheme_end,
                base_url.username_end,
                base_url.host_start,
                base_url.host_end,
                base_url.host,
                base_url.port,
                base_url.path_start,
                remaining,
            )
        }
        _ => {
            // Path-relative input: start from the base up to its query.
            let before_query = match (base_url.query_start, base_url.fragment_start) {
                (None, None) => &*base_url.serialization,
                (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
            };
            self.serialization.push_str(before_query);
            // FIXME spec says just "remove last entry", not the "pop" algorithm
            self.pop_path(scheme_type, base_url.path_start as usize);
            // A special url always has a path.
            // A path always starts with '/'
            if self.serialization.len() == base_url.path_start as usize
                && (SchemeType::from(base_url.scheme()).is_special() || !input.is_empty())
            {
                self.serialization.push('/');
            }
            let remaining = match input.split_first() {
                (Some('/'), remaining) => self.parse_path(
                    scheme_type,
                    &mut true,
                    base_url.path_start as usize,
                    remaining,
                ),
                _ => {
                    self.parse_path(scheme_type, &mut true, base_url.path_start as usize, input)
                }
            };
            self.with_query_and_fragment(
                scheme_type,
                base_url.scheme_end,
                base_url.username_end,
                base_url.host_start,
                base_url.host_end,
                base_url.host,
                base_url.port,
                base_url.path_start,
                remaining,
            )
        }
    }
}
827
after_double_slash( mut self, input: Input<'_>, scheme_type: SchemeType, scheme_end: u32, ) -> ParseResult<Url>828 fn after_double_slash(
829 mut self,
830 input: Input<'_>,
831 scheme_type: SchemeType,
832 scheme_end: u32,
833 ) -> ParseResult<Url> {
834 self.serialization.push('/');
835 self.serialization.push('/');
836 // authority state
837 let before_authority = self.serialization.len();
838 let (username_end, remaining) = self.parse_userinfo(input, scheme_type)?;
839 let has_authority = before_authority != self.serialization.len();
840 // host state
841 let host_start = to_u32(self.serialization.len())?;
842 let (host_end, host, port, remaining) =
843 self.parse_host_and_port(remaining, scheme_end, scheme_type)?;
844 if host == HostInternal::None && has_authority {
845 return Err(ParseError::EmptyHost);
846 }
847 // path state
848 let path_start = to_u32(self.serialization.len())?;
849 let remaining = self.parse_path_start(scheme_type, &mut true, remaining);
850 self.with_query_and_fragment(
851 scheme_type,
852 scheme_end,
853 username_end,
854 host_start,
855 host_end,
856 host,
857 port,
858 path_start,
859 remaining,
860 )
861 }
862
/// Parses the optional `username[:password]@` part of an authority and
/// appends it, percent-encoded with the `USERINFO` set, to the serialization.
///
/// Return (username_end, remaining), where `username_end` is the
/// serialization offset at which the username ends and `remaining` is the
/// input after the last `@` (or the untouched input when there is no `@`).
fn parse_userinfo<'i>(
    &mut self,
    mut input: Input<'i>,
    scheme_type: SchemeType,
) -> ParseResult<(u32, Input<'i>)> {
    // First pass: scan ahead on a clone for the LAST '@' before the end of
    // the authority, remembering how many characters precede it and the
    // input position just after it.
    let mut last_at = None;
    let mut remaining = input.clone();
    let mut char_count = 0;
    while let Some(c) = remaining.next() {
        match c {
            '@' => {
                if last_at.is_some() {
                    self.log_violation(SyntaxViolation::UnencodedAtSign)
                } else {
                    self.log_violation(SyntaxViolation::EmbeddedCredentials)
                }
                last_at = Some((char_count, remaining.clone()))
            }
            '/' | '?' | '#' => break,
            '\\' if scheme_type.is_special() => break,
            _ => (),
        }
        char_count += 1;
    }
    let (mut userinfo_char_count, remaining) = match last_at {
        // No '@' at all: there is no userinfo, hand the input back unchanged.
        None => return Ok((to_u32(self.serialization.len())?, input)),
        // The only '@' is the very first character: the userinfo is empty.
        Some((0, remaining)) => {
            // Otherwise, if one of the following is true
            // c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
            // url is special and c is U+005C (\)
            // If @ flag is set and buffer is the empty string, validation error, return failure.
            if let (Some(c), _) = remaining.split_first() {
                if c == '/' || c == '?' || c == '#' || (scheme_type.is_special() && c == '\\') {
                    return Err(ParseError::EmptyHost);
                }
            }
            return Ok((to_u32(self.serialization.len())?, remaining));
        }
        Some(x) => x,
    };

    // Second pass: consume exactly the characters that precede the last '@'
    // from the real input and serialize them.
    let mut username_end = None;
    let mut has_password = false;
    let mut has_username = false;
    while userinfo_char_count > 0 {
        let (c, utf8_c) = input.next_utf8().unwrap();
        userinfo_char_count -= 1;
        if c == ':' && username_end.is_none() {
            // Start parsing password
            username_end = Some(to_u32(self.serialization.len())?);
            // We don't add a colon if the password is empty
            if userinfo_char_count > 0 {
                self.serialization.push(':');
                has_password = true;
            }
        } else {
            if !has_password {
                has_username = true;
            }
            self.check_url_code_point(c, &input);
            self.serialization
                .extend(utf8_percent_encode(utf8_c, USERINFO));
        }
    }
    let username_end = match username_end {
        Some(i) => i,
        None => to_u32(self.serialization.len())?,
    };
    // The '@' separator is written only when something precedes it.
    if has_username || has_password {
        self.serialization.push('@');
    }
    Ok((username_end, remaining))
}
937
parse_host_and_port<'i>( &mut self, input: Input<'i>, scheme_end: u32, scheme_type: SchemeType, ) -> ParseResult<(u32, HostInternal, Option<u16>, Input<'i>)>938 fn parse_host_and_port<'i>(
939 &mut self,
940 input: Input<'i>,
941 scheme_end: u32,
942 scheme_type: SchemeType,
943 ) -> ParseResult<(u32, HostInternal, Option<u16>, Input<'i>)> {
944 let (host, remaining) = Parser::parse_host(input, scheme_type)?;
945 write!(&mut self.serialization, "{}", host).unwrap();
946 let host_end = to_u32(self.serialization.len())?;
947 if let Host::Domain(h) = &host {
948 if h.is_empty() {
949 // Port with an empty host
950 if remaining.starts_with(":") {
951 return Err(ParseError::EmptyHost);
952 }
953 if scheme_type.is_special() {
954 return Err(ParseError::EmptyHost);
955 }
956 }
957 };
958
959 let (port, remaining) = if let Some(remaining) = remaining.split_prefix(':') {
960 let scheme = || default_port(&self.serialization[..scheme_end as usize]);
961 Parser::parse_port(remaining, scheme, self.context)?
962 } else {
963 (None, remaining)
964 };
965 if let Some(port) = port {
966 write!(&mut self.serialization, ":{}", port).unwrap()
967 }
968 Ok((host_end, host.into(), port, remaining))
969 }
970
/// Splits the host off the front of `input` and parses it.
///
/// File schemes are delegated to `get_file_host`. Otherwise the host runs up
/// to the first `/`, `?`, `#`, a `:` outside square brackets, or (for special
/// schemes) a `\`. Non-special schemes parse it with `Host::parse_opaque`,
/// special ones with `Host::parse`.
///
/// Returns the parsed host and the input positioned just after it.
pub fn parse_host(
    mut input: Input<'_>,
    scheme_type: SchemeType,
) -> ParseResult<(Host<String>, Input<'_>)> {
    if scheme_type.is_file() {
        return Parser::get_file_host(input);
    }
    // Undo the Input abstraction here to avoid allocating in the common case
    // where the host part of the input does not contain any tab or newline
    let input_str = input.chars.as_str();
    let mut inside_square_brackets = false;
    let mut has_ignored_chars = false;
    // `non_ignored_chars` counts the characters `Input` would yield for the
    // host; `bytes` is the host's length in the raw string, including any
    // ignored tab/LF/CR.
    let mut non_ignored_chars = 0;
    let mut bytes = 0;
    for c in input_str.chars() {
        match c {
            ':' if !inside_square_brackets => break,
            '\\' if scheme_type.is_special() => break,
            '/' | '?' | '#' => break,
            '\t' | '\n' | '\r' => {
                has_ignored_chars = true;
            }
            '[' => {
                inside_square_brackets = true;
                non_ignored_chars += 1
            }
            ']' => {
                inside_square_brackets = false;
                non_ignored_chars += 1
            }
            _ => non_ignored_chars += 1,
        }
        bytes += c.len_utf8();
    }
    let replaced: String;
    let host_str;
    {
        // Either way `input` is advanced past the host; only when ignored
        // characters are present is the host copied into a new `String`.
        let host_input = input.by_ref().take(non_ignored_chars);
        if has_ignored_chars {
            replaced = host_input.collect();
            host_str = &*replaced
        } else {
            for _ in host_input {}
            host_str = &input_str[..bytes]
        }
    }
    if scheme_type == SchemeType::SpecialNotFile && host_str.is_empty() {
        return Err(ParseError::EmptyHost);
    }
    if !scheme_type.is_special() {
        let host = Host::parse_opaque(host_str)?;
        return Ok((host, input));
    }
    let host = Host::parse(host_str)?;
    Ok((host, input))
}
1027
get_file_host(input: Input<'_>) -> ParseResult<(Host<String>, Input<'_>)>1028 fn get_file_host(input: Input<'_>) -> ParseResult<(Host<String>, Input<'_>)> {
1029 let (_, host_str, remaining) = Parser::file_host(input)?;
1030 let host = match Host::parse(&host_str)? {
1031 Host::Domain(ref d) if d == "localhost" => Host::Domain("".to_string()),
1032 host => host,
1033 };
1034 Ok((host, remaining))
1035 }
1036
parse_file_host<'i>( &mut self, input: Input<'i>, ) -> ParseResult<(bool, HostInternal, Input<'i>)>1037 fn parse_file_host<'i>(
1038 &mut self,
1039 input: Input<'i>,
1040 ) -> ParseResult<(bool, HostInternal, Input<'i>)> {
1041 let has_host;
1042 let (_, host_str, remaining) = Parser::file_host(input)?;
1043 let host = if host_str.is_empty() {
1044 has_host = false;
1045 HostInternal::None
1046 } else {
1047 match Host::parse(&host_str)? {
1048 Host::Domain(ref d) if d == "localhost" => {
1049 has_host = false;
1050 HostInternal::None
1051 }
1052 host => {
1053 write!(&mut self.serialization, "{}", host).unwrap();
1054 has_host = true;
1055 host.into()
1056 }
1057 }
1058 };
1059 Ok((has_host, host, remaining))
1060 }
1061
file_host(input: Input) -> ParseResult<(bool, String, Input)>1062 pub fn file_host(input: Input) -> ParseResult<(bool, String, Input)> {
1063 // Undo the Input abstraction here to avoid allocating in the common case
1064 // where the host part of the input does not contain any tab or newline
1065 let input_str = input.chars.as_str();
1066 let mut has_ignored_chars = false;
1067 let mut non_ignored_chars = 0;
1068 let mut bytes = 0;
1069 for c in input_str.chars() {
1070 match c {
1071 '/' | '\\' | '?' | '#' => break,
1072 '\t' | '\n' | '\r' => has_ignored_chars = true,
1073 _ => non_ignored_chars += 1,
1074 }
1075 bytes += c.len_utf8();
1076 }
1077 let replaced: String;
1078 let host_str;
1079 let mut remaining = input.clone();
1080 {
1081 let host_input = remaining.by_ref().take(non_ignored_chars);
1082 if has_ignored_chars {
1083 replaced = host_input.collect();
1084 host_str = &*replaced
1085 } else {
1086 for _ in host_input {}
1087 host_str = &input_str[..bytes]
1088 }
1089 }
1090 if is_windows_drive_letter(host_str) {
1091 return Ok((false, "".to_string(), input));
1092 }
1093 Ok((true, host_str.to_string(), remaining))
1094 }
1095
parse_port<P>( mut input: Input<'_>, default_port: P, context: Context, ) -> ParseResult<(Option<u16>, Input<'_>)> where P: Fn() -> Option<u16>,1096 pub fn parse_port<P>(
1097 mut input: Input<'_>,
1098 default_port: P,
1099 context: Context,
1100 ) -> ParseResult<(Option<u16>, Input<'_>)>
1101 where
1102 P: Fn() -> Option<u16>,
1103 {
1104 let mut port: u32 = 0;
1105 let mut has_any_digit = false;
1106 while let (Some(c), remaining) = input.split_first() {
1107 if let Some(digit) = c.to_digit(10) {
1108 port = port * 10 + digit;
1109 if port > ::std::u16::MAX as u32 {
1110 return Err(ParseError::InvalidPort);
1111 }
1112 has_any_digit = true;
1113 } else if context == Context::UrlParser && !matches!(c, '/' | '\\' | '?' | '#') {
1114 return Err(ParseError::InvalidPort);
1115 } else {
1116 break;
1117 }
1118 input = remaining;
1119 }
1120 let mut opt_port = Some(port as u16);
1121 if !has_any_digit || opt_port == default_port() {
1122 opt_port = None;
1123 }
1124 Ok((opt_port, input))
1125 }
1126
parse_path_start<'i>( &mut self, scheme_type: SchemeType, has_host: &mut bool, input: Input<'i>, ) -> Input<'i>1127 pub fn parse_path_start<'i>(
1128 &mut self,
1129 scheme_type: SchemeType,
1130 has_host: &mut bool,
1131 input: Input<'i>,
1132 ) -> Input<'i> {
1133 let path_start = self.serialization.len();
1134 let (maybe_c, remaining) = input.split_first();
1135 // If url is special, then:
1136 if scheme_type.is_special() {
1137 if maybe_c == Some('\\') {
1138 // If c is U+005C (\), validation error.
1139 self.log_violation(SyntaxViolation::Backslash);
1140 }
1141 // A special URL always has a non-empty path.
1142 if !self.serialization.ends_with('/') {
1143 self.serialization.push('/');
1144 // We have already made sure the forward slash is present.
1145 if maybe_c == Some('/') || maybe_c == Some('\\') {
1146 return self.parse_path(scheme_type, has_host, path_start, remaining);
1147 }
1148 }
1149 return self.parse_path(scheme_type, has_host, path_start, input);
1150 } else if maybe_c == Some('?') || maybe_c == Some('#') {
1151 // Otherwise, if state override is not given and c is U+003F (?),
1152 // set url’s query to the empty string and state to query state.
1153 // Otherwise, if state override is not given and c is U+0023 (#),
1154 // set url’s fragment to the empty string and state to fragment state.
1155 // The query and path states will be handled by the caller.
1156 return input;
1157 }
1158
1159 if maybe_c != None && maybe_c != Some('/') {
1160 self.serialization.push('/');
1161 }
1162 // Otherwise, if c is not the EOF code point:
1163 self.parse_path(scheme_type, has_host, path_start, input)
1164 }
1165
parse_path<'i>( &mut self, scheme_type: SchemeType, has_host: &mut bool, path_start: usize, mut input: Input<'i>, ) -> Input<'i>1166 pub fn parse_path<'i>(
1167 &mut self,
1168 scheme_type: SchemeType,
1169 has_host: &mut bool,
1170 path_start: usize,
1171 mut input: Input<'i>,
1172 ) -> Input<'i> {
1173 // Relative path state
1174 loop {
1175 let segment_start = self.serialization.len();
1176 let mut ends_with_slash = false;
1177 loop {
1178 let input_before_c = input.clone();
1179 let (c, utf8_c) = if let Some(x) = input.next_utf8() {
1180 x
1181 } else {
1182 break;
1183 };
1184 match c {
1185 '/' if self.context != Context::PathSegmentSetter => {
1186 self.serialization.push(c);
1187 ends_with_slash = true;
1188 break;
1189 }
1190 '\\' if self.context != Context::PathSegmentSetter
1191 && scheme_type.is_special() =>
1192 {
1193 self.log_violation(SyntaxViolation::Backslash);
1194 self.serialization.push('/');
1195 ends_with_slash = true;
1196 break;
1197 }
1198 '?' | '#' if self.context == Context::UrlParser => {
1199 input = input_before_c;
1200 break;
1201 }
1202 _ => {
1203 self.check_url_code_point(c, &input);
1204 if self.context == Context::PathSegmentSetter {
1205 if scheme_type.is_special() {
1206 self.serialization
1207 .extend(utf8_percent_encode(utf8_c, SPECIAL_PATH_SEGMENT));
1208 } else {
1209 self.serialization
1210 .extend(utf8_percent_encode(utf8_c, PATH_SEGMENT));
1211 }
1212 } else {
1213 self.serialization.extend(utf8_percent_encode(utf8_c, PATH));
1214 }
1215 }
1216 }
1217 }
1218 let segment_before_slash = if ends_with_slash {
1219 &self.serialization[segment_start..self.serialization.len() - 1]
1220 } else {
1221 &self.serialization[segment_start..self.serialization.len()]
1222 };
1223 match segment_before_slash {
1224 // If buffer is a double-dot path segment, shorten url’s path,
1225 ".." | "%2e%2e" | "%2e%2E" | "%2E%2e" | "%2E%2E" | "%2e." | "%2E." | ".%2e"
1226 | ".%2E" => {
1227 debug_assert!(self.serialization.as_bytes()[segment_start - 1] == b'/');
1228 self.serialization.truncate(segment_start);
1229 if self.serialization.ends_with('/')
1230 && Parser::last_slash_can_be_removed(&self.serialization, path_start)
1231 {
1232 self.serialization.pop();
1233 }
1234 self.shorten_path(scheme_type, path_start);
1235
1236 // and then if neither c is U+002F (/), nor url is special and c is U+005C (\), append the empty string to url’s path.
1237 if ends_with_slash && !self.serialization.ends_with('/') {
1238 self.serialization.push('/');
1239 }
1240 }
1241 // Otherwise, if buffer is a single-dot path segment and if neither c is U+002F (/),
1242 // nor url is special and c is U+005C (\), append the empty string to url’s path.
1243 "." | "%2e" | "%2E" => {
1244 self.serialization.truncate(segment_start);
1245 if !self.serialization.ends_with('/') {
1246 self.serialization.push('/');
1247 }
1248 }
1249 _ => {
1250 // If url’s scheme is "file", url’s path is empty, and buffer is a Windows drive letter, then
1251 if scheme_type.is_file() && is_windows_drive_letter(segment_before_slash) {
1252 // Replace the second code point in buffer with U+003A (:).
1253 if let Some(c) = segment_before_slash.chars().next() {
1254 self.serialization.truncate(segment_start);
1255 self.serialization.push(c);
1256 self.serialization.push(':');
1257 if ends_with_slash {
1258 self.serialization.push('/');
1259 }
1260 }
1261 // If url’s host is neither the empty string nor null,
1262 // validation error, set url’s host to the empty string.
1263 if *has_host {
1264 self.log_violation(SyntaxViolation::FileWithHostAndWindowsDrive);
1265 *has_host = false; // FIXME account for this in callers
1266 }
1267 }
1268 }
1269 }
1270 if !ends_with_slash {
1271 break;
1272 }
1273 }
1274 if scheme_type.is_file() {
1275 // while url’s path’s size is greater than 1
1276 // and url’s path[0] is the empty string,
1277 // validation error, remove the first item from url’s path.
1278 //FIXME: log violation
1279 let path = self.serialization.split_off(path_start);
1280 self.serialization.push('/');
1281 self.serialization.push_str(path.trim_start_matches('/'));
1282 }
1283
1284 input
1285 }
1286
last_slash_can_be_removed(serialization: &str, path_start: usize) -> bool1287 fn last_slash_can_be_removed(serialization: &str, path_start: usize) -> bool {
1288 let url_before_segment = &serialization[..serialization.len() - 1];
1289 if let Some(segment_before_start) = url_before_segment.rfind('/') {
1290 // Do not remove the root slash
1291 segment_before_start >= path_start
1292 // Or a windows drive letter slash
1293 && !path_starts_with_windows_drive_letter(&serialization[segment_before_start..])
1294 } else {
1295 false
1296 }
1297 }
1298
1299 /// https://url.spec.whatwg.org/#shorten-a-urls-path
shorten_path(&mut self, scheme_type: SchemeType, path_start: usize)1300 fn shorten_path(&mut self, scheme_type: SchemeType, path_start: usize) {
1301 // If path is empty, then return.
1302 if self.serialization.len() == path_start {
1303 return;
1304 }
1305 // If url’s scheme is "file", path’s size is 1, and path[0] is a normalized Windows drive letter, then return.
1306 if scheme_type.is_file()
1307 && is_normalized_windows_drive_letter(&self.serialization[path_start..])
1308 {
1309 return;
1310 }
1311 // Remove path’s last item.
1312 self.pop_path(scheme_type, path_start);
1313 }
1314
1315 /// https://url.spec.whatwg.org/#pop-a-urls-path
pop_path(&mut self, scheme_type: SchemeType, path_start: usize)1316 fn pop_path(&mut self, scheme_type: SchemeType, path_start: usize) {
1317 if self.serialization.len() > path_start {
1318 let slash_position = self.serialization[path_start..].rfind('/').unwrap();
1319 // + 1 since rfind returns the position before the slash.
1320 let segment_start = path_start + slash_position + 1;
1321 // Don’t pop a Windows drive letter
1322 if !(scheme_type.is_file()
1323 && is_normalized_windows_drive_letter(&self.serialization[segment_start..]))
1324 {
1325 self.serialization.truncate(segment_start);
1326 }
1327 }
1328 }
1329
parse_cannot_be_a_base_path<'i>(&mut self, mut input: Input<'i>) -> Input<'i>1330 pub fn parse_cannot_be_a_base_path<'i>(&mut self, mut input: Input<'i>) -> Input<'i> {
1331 loop {
1332 let input_before_c = input.clone();
1333 match input.next_utf8() {
1334 Some(('?', _)) | Some(('#', _)) if self.context == Context::UrlParser => {
1335 return input_before_c
1336 }
1337 Some((c, utf8_c)) => {
1338 self.check_url_code_point(c, &input);
1339 self.serialization
1340 .extend(utf8_percent_encode(utf8_c, CONTROLS));
1341 }
1342 None => return input,
1343 }
1344 }
1345 }
1346
1347 #[allow(clippy::too_many_arguments)]
with_query_and_fragment( mut self, scheme_type: SchemeType, scheme_end: u32, username_end: u32, host_start: u32, host_end: u32, host: HostInternal, port: Option<u16>, path_start: u32, remaining: Input<'_>, ) -> ParseResult<Url>1348 fn with_query_and_fragment(
1349 mut self,
1350 scheme_type: SchemeType,
1351 scheme_end: u32,
1352 username_end: u32,
1353 host_start: u32,
1354 host_end: u32,
1355 host: HostInternal,
1356 port: Option<u16>,
1357 path_start: u32,
1358 remaining: Input<'_>,
1359 ) -> ParseResult<Url> {
1360 let (query_start, fragment_start) =
1361 self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?;
1362 Ok(Url {
1363 serialization: self.serialization,
1364 scheme_end,
1365 username_end,
1366 host_start,
1367 host_end,
1368 host,
1369 port,
1370 path_start,
1371 query_start,
1372 fragment_start,
1373 })
1374 }
1375
1376 /// Return (query_start, fragment_start)
parse_query_and_fragment( &mut self, scheme_type: SchemeType, scheme_end: u32, mut input: Input<'_>, ) -> ParseResult<(Option<u32>, Option<u32>)>1377 fn parse_query_and_fragment(
1378 &mut self,
1379 scheme_type: SchemeType,
1380 scheme_end: u32,
1381 mut input: Input<'_>,
1382 ) -> ParseResult<(Option<u32>, Option<u32>)> {
1383 let mut query_start = None;
1384 match input.next() {
1385 Some('#') => {}
1386 Some('?') => {
1387 query_start = Some(to_u32(self.serialization.len())?);
1388 self.serialization.push('?');
1389 let remaining = self.parse_query(scheme_type, scheme_end, input);
1390 if let Some(remaining) = remaining {
1391 input = remaining
1392 } else {
1393 return Ok((query_start, None));
1394 }
1395 }
1396 None => return Ok((None, None)),
1397 _ => panic!("Programming error. parse_query_and_fragment() called without ? or #"),
1398 }
1399
1400 let fragment_start = to_u32(self.serialization.len())?;
1401 self.serialization.push('#');
1402 self.parse_fragment(input);
1403 Ok((query_start, Some(fragment_start)))
1404 }
1405
parse_query<'i>( &mut self, scheme_type: SchemeType, scheme_end: u32, mut input: Input<'i>, ) -> Option<Input<'i>>1406 pub fn parse_query<'i>(
1407 &mut self,
1408 scheme_type: SchemeType,
1409 scheme_end: u32,
1410 mut input: Input<'i>,
1411 ) -> Option<Input<'i>> {
1412 let len = input.chars.as_str().len();
1413 let mut query = String::with_capacity(len); // FIXME: use a streaming decoder instead
1414 let mut remaining = None;
1415 while let Some(c) = input.next() {
1416 if c == '#' && self.context == Context::UrlParser {
1417 remaining = Some(input);
1418 break;
1419 } else {
1420 self.check_url_code_point(c, &input);
1421 query.push(c);
1422 }
1423 }
1424
1425 let encoding = match &self.serialization[..scheme_end as usize] {
1426 "http" | "https" | "file" | "ftp" => self.query_encoding_override,
1427 _ => None,
1428 };
1429 let query_bytes = if let Some(o) = encoding {
1430 o(&query)
1431 } else {
1432 query.as_bytes().into()
1433 };
1434 let set = if scheme_type.is_special() {
1435 SPECIAL_QUERY
1436 } else {
1437 QUERY
1438 };
1439 self.serialization.extend(percent_encode(&query_bytes, set));
1440 remaining
1441 }
1442
fragment_only(mut self, base_url: &Url, mut input: Input<'_>) -> ParseResult<Url>1443 fn fragment_only(mut self, base_url: &Url, mut input: Input<'_>) -> ParseResult<Url> {
1444 let before_fragment = match base_url.fragment_start {
1445 Some(i) => base_url.slice(..i),
1446 None => &*base_url.serialization,
1447 };
1448 debug_assert!(self.serialization.is_empty());
1449 self.serialization
1450 .reserve(before_fragment.len() + input.chars.as_str().len());
1451 self.serialization.push_str(before_fragment);
1452 self.serialization.push('#');
1453 let next = input.next();
1454 debug_assert!(next == Some('#'));
1455 self.parse_fragment(input);
1456 Ok(Url {
1457 serialization: self.serialization,
1458 fragment_start: Some(to_u32(before_fragment.len())?),
1459 ..*base_url
1460 })
1461 }
1462
parse_fragment(&mut self, mut input: Input<'_>)1463 pub fn parse_fragment(&mut self, mut input: Input<'_>) {
1464 while let Some((c, utf8_c)) = input.next_utf8() {
1465 if c == '\0' {
1466 self.log_violation(SyntaxViolation::NullInFragment)
1467 } else {
1468 self.check_url_code_point(c, &input);
1469 }
1470 self.serialization
1471 .extend(utf8_percent_encode(utf8_c, FRAGMENT));
1472 }
1473 }
1474
check_url_code_point(&self, c: char, input: &Input<'_>)1475 fn check_url_code_point(&self, c: char, input: &Input<'_>) {
1476 if let Some(vfn) = self.violation_fn {
1477 if c == '%' {
1478 let mut input = input.clone();
1479 if !matches!((input.next(), input.next()), (Some(a), Some(b))
1480 if is_ascii_hex_digit(a) && is_ascii_hex_digit(b))
1481 {
1482 vfn(SyntaxViolation::PercentDecode)
1483 }
1484 } else if !is_url_code_point(c) {
1485 vfn(SyntaxViolation::NonUrlCodePoint)
1486 }
1487 }
1488 }
1489 }
1490
/// Returns `true` for `0`-`9`, `a`-`f` and `A`-`F`.
#[inline]
fn is_ascii_hex_digit(c: char) -> bool {
    c.is_ascii_hexdigit()
}
1495
// Non URL code points:
// U+0000 to U+0020 (space)
// " # % < > [ \ ] ^ ` { | }
// U+007F to U+009F
// surrogates
// U+FDD0 to U+FDEF
// Last two of each plane: U+__FFFE to U+__FFFF for __ in 00 to 10 hex
//
// Everything else is a URL code point. In particular U+E0000 to U+E0FFF are
// URL code points: they are not noncharacters, so they must not be excluded.
/// Returns `true` if `c` is a URL code point: an ASCII alphanumeric, one of
/// the ASCII symbols listed below, or a code point in U+00A0..=U+10FFFD that
/// is not a noncharacter.
#[inline]
fn is_url_code_point(c: char) -> bool {
    match c {
        'a'..='z' | 'A'..='Z' | '0'..='9' => true,
        '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | '-' | '.' | '/' | ':' | ';'
        | '=' | '?' | '@' | '_' | '~' => true,
        // A `char` can never be a surrogate, so only the noncharacters need
        // to be filtered out of the non-ASCII range.
        '\u{A0}'..='\u{10FFFD}' => {
            let code = c as u32;
            let in_fdd0_block = code >= 0xFDD0 && code <= 0xFDEF;
            // U+__FFFE and U+__FFFF in every plane.
            let last_two_of_plane = (code & 0xFFFE) == 0xFFFE;
            !in_fdd0_block && !last_two_of_plane
        }
        _ => false,
    }
}
1521
/// https://url.spec.whatwg.org/#c0-controls-and-space
///
/// Returns `true` for every code point from U+0000 through U+0020 (space).
#[inline]
fn c0_control_or_space(ch: char) -> bool {
    matches!(ch, '\u{0}'..='\u{20}')
}
1527
/// https://infra.spec.whatwg.org/#ascii-tab-or-newline
///
/// Returns `true` for tab, line feed and carriage return only.
#[inline]
fn ascii_tab_or_new_line(ch: char) -> bool {
    ch == '\t' || ch == '\n' || ch == '\r'
}
1533
/// https://url.spec.whatwg.org/#ascii-alpha
///
/// Returns `true` for `a`-`z` and `A`-`Z` only.
#[inline]
pub fn ascii_alpha(ch: char) -> bool {
    ch.is_ascii_alphabetic()
}
1539
1540 #[inline]
to_u32(i: usize) -> ParseResult<u32>1541 pub fn to_u32(i: usize) -> ParseResult<u32> {
1542 if i <= ::std::u32::MAX as usize {
1543 Ok(i as u32)
1544 } else {
1545 Err(ParseError::Overflow)
1546 }
1547 }
1548
is_normalized_windows_drive_letter(segment: &str) -> bool1549 fn is_normalized_windows_drive_letter(segment: &str) -> bool {
1550 is_windows_drive_letter(segment) && segment.as_bytes()[1] == b':'
1551 }
1552
1553 /// Whether the scheme is file:, the path has a single segment, and that segment
1554 /// is a Windows drive letter
1555 #[inline]
is_windows_drive_letter(segment: &str) -> bool1556 pub fn is_windows_drive_letter(segment: &str) -> bool {
1557 segment.len() == 2 && starts_with_windows_drive_letter(segment)
1558 }
1559
1560 /// Whether path starts with a root slash
1561 /// and a windows drive letter eg: "/c:" or "/a:/"
path_starts_with_windows_drive_letter(s: &str) -> bool1562 fn path_starts_with_windows_drive_letter(s: &str) -> bool {
1563 if let Some(c) = s.as_bytes().first() {
1564 matches!(c, b'/' | b'\\' | b'?' | b'#') && starts_with_windows_drive_letter(&s[1..])
1565 } else {
1566 false
1567 }
1568 }
1569
starts_with_windows_drive_letter(s: &str) -> bool1570 fn starts_with_windows_drive_letter(s: &str) -> bool {
1571 s.len() >= 2
1572 && ascii_alpha(s.as_bytes()[0] as char)
1573 && matches!(s.as_bytes()[1], b':' | b'|')
1574 && (s.len() == 2 || matches!(s.as_bytes()[2], b'/' | b'\\' | b'?' | b'#'))
1575 }
1576
1577 /// https://url.spec.whatwg.org/#start-with-a-windows-drive-letter
starts_with_windows_drive_letter_segment(input: &Input<'_>) -> bool1578 fn starts_with_windows_drive_letter_segment(input: &Input<'_>) -> bool {
1579 let mut input = input.clone();
1580 match (input.next(), input.next(), input.next()) {
1581 // its first two code points are a Windows drive letter
1582 // its third code point is U+002F (/), U+005C (\), U+003F (?), or U+0023 (#).
1583 (Some(a), Some(b), Some(c))
1584 if ascii_alpha(a) && matches!(b, ':' | '|') && matches!(c, '/' | '\\' | '?' | '#') =>
1585 {
1586 true
1587 }
1588 // its first two code points are a Windows drive letter
1589 // its length is 2
1590 (Some(a), Some(b), None) if ascii_alpha(a) && matches!(b, ':' | '|') => true,
1591 _ => false,
1592 }
1593 }
1594