1 // pest. The Elegant Parser 2 // Copyright (c) 2018 Dragoș Tiselice 3 // 4 // Licensed under the Apache License, Version 2.0 5 // <LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0> or the MIT 6 // license <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your 7 // option. All files in the project carrying such notice may not be copied, 8 // modified, or distributed except according to those terms. 9 10 use core::cmp::Ordering; 11 use core::fmt; 12 use core::hash::{Hash, Hasher}; 13 use core::ops::Range; 14 use core::ptr; 15 use core::str; 16 17 use crate::span; 18 19 /// A cursor position in a `&str` which provides useful methods to manually parse that string. 20 #[derive(Clone, Copy)] 21 pub struct Position<'i> { 22 input: &'i str, 23 /// # Safety: 24 /// 25 /// `input[pos..]` must be a valid codepoint boundary (should not panic when indexing thus). 26 pos: usize, 27 } 28 29 impl<'i> Position<'i> { 30 /// Create a new `Position` without checking invariants. (Checked with `debug_assertions`.) 31 /// 32 /// # Safety: 33 /// 34 /// `input[pos..]` must be a valid codepoint boundary (should not panic when indexing thus). new_unchecked(input: &str, pos: usize) -> Position<'_>35 pub(crate) unsafe fn new_unchecked(input: &str, pos: usize) -> Position<'_> { 36 debug_assert!(input.get(pos..).is_some()); 37 Position { input, pos } 38 } 39 40 /// Attempts to create a new `Position` at the given position. If the specified position is 41 /// an invalid index, or the specified position is not a valid UTF8 boundary, then None is 42 /// returned. 43 /// 44 /// # Examples 45 /// ``` 46 /// # use pest::Position; 47 /// let cheart = ''; 48 /// let heart = ""; 49 /// assert_eq!(Position::new(heart, 1), None); 50 /// assert_ne!(Position::new(heart, cheart.len_utf8()), None); 51 /// ``` new(input: &str, pos: usize) -> Option<Position<'_>>52 pub fn new(input: &str, pos: usize) -> Option<Position<'_>> { 53 input.get(pos..).map(|_| Position { input, pos }) 54 } 55 56 /// Creates a `Position` at the start of a `&str`. 57 /// 58 /// # Examples 59 /// 60 /// ``` 61 /// # use pest::Position; 62 /// let start = Position::from_start(""); 63 /// assert_eq!(start.pos(), 0); 64 /// ``` 65 #[inline] from_start(input: &'i str) -> Position<'i>66 pub fn from_start(input: &'i str) -> Position<'i> { 67 // Position 0 is always safe because it's always a valid UTF-8 border. 68 Position { input, pos: 0 } 69 } 70 71 /// Returns the byte position of this `Position` as a `usize`. 72 /// 73 /// # Examples 74 /// 75 /// ``` 76 /// # use pest::Position; 77 /// let input = "ab"; 78 /// let mut start = Position::from_start(input); 79 /// 80 /// assert_eq!(start.pos(), 0); 81 /// ``` 82 #[inline] pos(&self) -> usize83 pub fn pos(&self) -> usize { 84 self.pos 85 } 86 87 /// Creates a `Span` from two `Position`s. 88 /// 89 /// # Panics 90 /// 91 /// Panics if the positions come from different inputs. 92 /// 93 /// # Examples 94 /// 95 /// ``` 96 /// # use pest::Position; 97 /// let input = "ab"; 98 /// let start = Position::from_start(input); 99 /// let span = start.span(&start.clone()); 100 /// 101 /// assert_eq!(span.start(), 0); 102 /// assert_eq!(span.end(), 0); 103 /// ``` 104 #[inline] span(&self, other: &Position<'i>) -> span::Span<'i>105 pub fn span(&self, other: &Position<'i>) -> span::Span<'i> { 106 if ptr::eq(self.input, other.input) 107 /* && self.input.get(self.pos..other.pos).is_some() */ 108 { 109 // This is safe because the pos field of a Position should always be a valid str index. 110 unsafe { span::Span::new_unchecked(self.input, self.pos, other.pos) } 111 } else { 112 // TODO: maybe a panic if self.pos < other.pos 113 panic!("span created from positions from different inputs") 114 } 115 } 116 117 /// Returns the line and column number of this `Position`. 118 /// 119 /// This is an O(n) operation, where n is the number of chars in the input. 120 /// You better use [`pair.line_col()`](struct.Pair.html#method.line_col) instead. 121 /// 122 /// # Examples 123 /// 124 /// ``` 125 /// # use pest; 126 /// # #[allow(non_camel_case_types)] 127 /// # #[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] 128 /// enum Rule {} 129 /// 130 /// let input = "\na"; 131 /// let mut state: Box<pest::ParserState<'_, Rule>> = pest::ParserState::new(input); 132 /// let mut result = state.match_string("\na"); 133 /// assert!(result.is_ok()); 134 /// assert_eq!(result.unwrap().position().line_col(), (2, 2)); 135 /// ``` 136 #[inline] line_col(&self) -> (usize, usize)137 pub fn line_col(&self) -> (usize, usize) { 138 if self.pos > self.input.len() { 139 panic!("position out of bounds"); 140 } 141 let mut pos = self.pos; 142 let slice = &self.input[..pos]; 143 let mut chars = slice.chars().peekable(); 144 145 let mut line_col = (1, 1); 146 147 while pos != 0 { 148 match chars.next() { 149 Some('\r') => { 150 if let Some(&'\n') = chars.peek() { 151 chars.next(); 152 153 if pos == 1 { 154 pos -= 1; 155 } else { 156 pos -= 2; 157 } 158 159 line_col = (line_col.0 + 1, 1); 160 } else { 161 pos -= 1; 162 line_col = (line_col.0, line_col.1 + 1); 163 } 164 } 165 Some('\n') => { 166 pos -= 1; 167 line_col = (line_col.0 + 1, 1); 168 } 169 Some(c) => { 170 pos -= c.len_utf8(); 171 line_col = (line_col.0, line_col.1 + 1); 172 } 173 None => unreachable!(), 174 } 175 } 176 177 line_col 178 } 179 180 /// Returns the entire line of the input that contains this `Position`. 181 /// 182 /// # Examples 183 /// 184 /// ``` 185 /// # use pest; 186 /// # #[allow(non_camel_case_types)] 187 /// # #[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] 188 /// enum Rule {} 189 /// 190 /// let input = "\na"; 191 /// let mut state: Box<pest::ParserState<'_, Rule>> = pest::ParserState::new(input); 192 /// let mut result = state.match_string("\na"); 193 /// assert!(result.is_ok()); 194 /// assert_eq!(result.unwrap().position().line_of(), "a"); 195 /// ``` 196 #[inline] line_of(&self) -> &'i str197 pub fn line_of(&self) -> &'i str { 198 if self.pos > self.input.len() { 199 panic!("position out of bounds"); 200 }; 201 // Safe since start and end can only be valid UTF-8 borders. 202 &self.input[self.find_line_start()..self.find_line_end()] 203 } 204 find_line_start(&self) -> usize205 pub(crate) fn find_line_start(&self) -> usize { 206 if self.input.is_empty() { 207 return 0; 208 }; 209 // Position's pos is always a UTF-8 border. 210 let start = self 211 .input 212 .char_indices() 213 .rev() 214 .skip_while(|&(i, _)| i >= self.pos) 215 .find(|&(_, c)| c == '\n'); 216 match start { 217 Some((i, _)) => i + 1, 218 None => 0, 219 } 220 } 221 find_line_end(&self) -> usize222 pub(crate) fn find_line_end(&self) -> usize { 223 if self.input.is_empty() { 224 0 225 } else if self.pos == self.input.len() - 1 { 226 self.input.len() 227 } else { 228 // Position's pos is always a UTF-8 border. 229 let end = self 230 .input 231 .char_indices() 232 .skip_while(|&(i, _)| i < self.pos) 233 .find(|&(_, c)| c == '\n'); 234 match end { 235 Some((i, _)) => i + 1, 236 None => self.input.len(), 237 } 238 } 239 } 240 241 /// Returns `true` when the `Position` points to the start of the input `&str`. 242 #[inline] at_start(&self) -> bool243 pub(crate) fn at_start(&self) -> bool { 244 self.pos == 0 245 } 246 247 /// Returns `true` when the `Position` points to the end of the input `&str`. 248 #[inline] at_end(&self) -> bool249 pub(crate) fn at_end(&self) -> bool { 250 self.pos == self.input.len() 251 } 252 253 /// Skips `n` `char`s from the `Position` and returns `true` if the skip was possible or `false` 254 /// otherwise. If the return value is `false`, `pos` will not be updated. 255 #[inline] skip(&mut self, n: usize) -> bool256 pub(crate) fn skip(&mut self, n: usize) -> bool { 257 let skipped = { 258 let mut len = 0; 259 // Position's pos is always a UTF-8 border. 260 let mut chars = self.input[self.pos..].chars(); 261 for _ in 0..n { 262 if let Some(c) = chars.next() { 263 len += c.len_utf8(); 264 } else { 265 return false; 266 } 267 } 268 len 269 }; 270 271 self.pos += skipped; 272 true 273 } 274 275 /// Goes back `n` `char`s from the `Position` and returns `true` if the skip was possible or `false` 276 /// otherwise. If the return value is `false`, `pos` will not be updated. 277 #[inline] skip_back(&mut self, n: usize) -> bool278 pub(crate) fn skip_back(&mut self, n: usize) -> bool { 279 let skipped = { 280 let mut len = 0; 281 // Position's pos is always a UTF-8 border. 282 let mut chars = self.input[..self.pos].chars().rev(); 283 for _ in 0..n { 284 if let Some(c) = chars.next() { 285 len += c.len_utf8(); 286 } else { 287 return false; 288 } 289 } 290 len 291 }; 292 293 self.pos -= skipped; 294 true 295 } 296 297 /// Skips until one of the given `strings` is found. If none of the `strings` can be found, 298 /// this function will return `false` but its `pos` will *still* be updated. 299 #[inline] skip_until(&mut self, strings: &[&str]) -> bool300 pub(crate) fn skip_until(&mut self, strings: &[&str]) -> bool { 301 #[cfg(not(feature = "memchr"))] 302 { 303 self.skip_until_basic(strings) 304 } 305 #[cfg(feature = "memchr")] 306 { 307 match strings { 308 [] => (), 309 [s1] => { 310 if let Some(from) = 311 memchr::memmem::find(&self.input.as_bytes()[self.pos..], s1.as_bytes()) 312 { 313 self.pos += from; 314 return true; 315 } 316 } 317 [s1, s2] if !s1.is_empty() && !s2.is_empty() => { 318 let b1 = s1.as_bytes()[0]; 319 let b2 = s2.as_bytes()[0]; 320 let miter = memchr::memchr2_iter(b1, b2, &self.input.as_bytes()[self.pos..]); 321 for from in miter { 322 let start = &self.input[self.pos + from..]; 323 if start.starts_with(s1) || start.starts_with(s2) { 324 self.pos += from; 325 return true; 326 } 327 } 328 } 329 [s1, s2, s3] if !s1.is_empty() && !s2.is_empty() && s3.is_empty() => { 330 let b1 = s1.as_bytes()[0]; 331 let b2 = s2.as_bytes()[0]; 332 let b3 = s2.as_bytes()[0]; 333 let miter = 334 memchr::memchr3_iter(b1, b2, b3, &self.input.as_bytes()[self.pos..]); 335 for from in miter { 336 let start = &self.input[self.pos + from..]; 337 if start.starts_with(s1) || start.starts_with(s2) || start.starts_with(s3) { 338 self.pos += from; 339 return true; 340 } 341 } 342 } 343 _ => { 344 return self.skip_until_basic(strings); 345 } 346 } 347 self.pos = self.input.len(); 348 false 349 } 350 } 351 352 #[inline] skip_until_basic(&mut self, strings: &[&str]) -> bool353 fn skip_until_basic(&mut self, strings: &[&str]) -> bool { 354 // TODO: optimize with Aho-Corasick, e.g. https://crates.io/crates/daachorse? 355 for from in self.pos..self.input.len() { 356 let bytes = if let Some(string) = self.input.get(from..) { 357 string.as_bytes() 358 } else { 359 continue; 360 }; 361 362 for slice in strings.iter() { 363 let to = slice.len(); 364 if Some(slice.as_bytes()) == bytes.get(0..to) { 365 self.pos = from; 366 return true; 367 } 368 } 369 } 370 371 self.pos = self.input.len(); 372 false 373 } 374 375 /// Matches the char at the `Position` against a specified character and returns `true` if a match 376 /// was made. If no match was made, returns `false`. 377 /// `pos` will not be updated in either case. 378 #[inline] match_char(&self, c: char) -> bool379 pub(crate) fn match_char(&self, c: char) -> bool { 380 matches!(self.input[self.pos..].chars().next(), Some(cc) if c == cc) 381 } 382 383 /// Matches the char at the `Position` against a filter function and returns `true` if a match 384 /// was made. If no match was made, returns `false` and `pos` will not be updated. 385 #[inline] match_char_by<F>(&mut self, f: F) -> bool where F: FnOnce(char) -> bool,386 pub(crate) fn match_char_by<F>(&mut self, f: F) -> bool 387 where 388 F: FnOnce(char) -> bool, 389 { 390 if let Some(c) = self.input[self.pos..].chars().next() { 391 if f(c) { 392 self.pos += c.len_utf8(); 393 true 394 } else { 395 false 396 } 397 } else { 398 false 399 } 400 } 401 402 /// Matches `string` from the `Position` and returns `true` if a match was made or `false` 403 /// otherwise. If no match was made, `pos` will not be updated. 404 #[inline] match_string(&mut self, string: &str) -> bool405 pub(crate) fn match_string(&mut self, string: &str) -> bool { 406 let to = self.pos + string.len(); 407 408 if Some(string.as_bytes()) == self.input.as_bytes().get(self.pos..to) { 409 self.pos = to; 410 true 411 } else { 412 false 413 } 414 } 415 416 /// Case-insensitively matches `string` from the `Position` and returns `true` if a match was 417 /// made or `false` otherwise. If no match was made, `pos` will not be updated. 418 #[inline] match_insensitive(&mut self, string: &str) -> bool419 pub(crate) fn match_insensitive(&mut self, string: &str) -> bool { 420 let matched = { 421 let slice = &self.input[self.pos..]; 422 if let Some(slice) = slice.get(0..string.len()) { 423 slice.eq_ignore_ascii_case(string) 424 } else { 425 false 426 } 427 }; 428 429 if matched { 430 self.pos += string.len(); 431 true 432 } else { 433 false 434 } 435 } 436 437 /// Matches `char` `range` from the `Position` and returns `true` if a match was made or `false` 438 /// otherwise. If no match was made, `pos` will not be updated. 439 #[inline] match_range(&mut self, range: Range<char>) -> bool440 pub(crate) fn match_range(&mut self, range: Range<char>) -> bool { 441 if let Some(c) = self.input[self.pos..].chars().next() { 442 if range.start <= c && c <= range.end { 443 self.pos += c.len_utf8(); 444 return true; 445 } 446 } 447 448 false 449 } 450 } 451 452 impl<'i> fmt::Debug for Position<'i> { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result453 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 454 f.debug_struct("Position").field("pos", &self.pos).finish() 455 } 456 } 457 458 impl<'i> PartialEq for Position<'i> { eq(&self, other: &Position<'i>) -> bool459 fn eq(&self, other: &Position<'i>) -> bool { 460 ptr::eq(self.input, other.input) && self.pos == other.pos 461 } 462 } 463 464 impl<'i> Eq for Position<'i> {} 465 466 impl<'i> PartialOrd for Position<'i> { partial_cmp(&self, other: &Position<'i>) -> Option<Ordering>467 fn partial_cmp(&self, other: &Position<'i>) -> Option<Ordering> { 468 if ptr::eq(self.input, other.input) { 469 self.pos.partial_cmp(&other.pos) 470 } else { 471 None 472 } 473 } 474 } 475 476 impl<'i> Ord for Position<'i> { cmp(&self, other: &Position<'i>) -> Ordering477 fn cmp(&self, other: &Position<'i>) -> Ordering { 478 self.partial_cmp(other) 479 .expect("cannot compare positions from different strs") 480 } 481 } 482 483 impl<'i> Hash for Position<'i> { hash<H: Hasher>(&self, state: &mut H)484 fn hash<H: Hasher>(&self, state: &mut H) { 485 (self.input as *const str).hash(state); 486 self.pos.hash(state); 487 } 488 } 489 490 #[cfg(test)] 491 mod tests { 492 use super::*; 493 494 #[test] empty()495 fn empty() { 496 let input = ""; 497 assert!(Position::new(input, 0).unwrap().match_string("")); 498 assert!(!Position::new(input, 0).unwrap().match_string("a")); 499 } 500 501 #[test] parts()502 fn parts() { 503 let input = "asdasdf"; 504 505 assert!(Position::new(input, 0).unwrap().match_string("asd")); 506 assert!(Position::new(input, 3).unwrap().match_string("asdf")); 507 } 508 509 #[test] line_col()510 fn line_col() { 511 let input = "a\rb\nc\r\nd嗨"; 512 513 assert_eq!(Position::new(input, 0).unwrap().line_col(), (1, 1)); 514 assert_eq!(Position::new(input, 1).unwrap().line_col(), (1, 2)); 515 assert_eq!(Position::new(input, 2).unwrap().line_col(), (1, 3)); 516 assert_eq!(Position::new(input, 3).unwrap().line_col(), (1, 4)); 517 assert_eq!(Position::new(input, 4).unwrap().line_col(), (2, 1)); 518 assert_eq!(Position::new(input, 5).unwrap().line_col(), (2, 2)); 519 assert_eq!(Position::new(input, 6).unwrap().line_col(), (2, 3)); 520 assert_eq!(Position::new(input, 7).unwrap().line_col(), (3, 1)); 521 assert_eq!(Position::new(input, 8).unwrap().line_col(), (3, 2)); 522 assert_eq!(Position::new(input, 11).unwrap().line_col(), (3, 3)); 523 let input = "abcd嗨"; 524 assert_eq!(Position::new(input, 7).unwrap().line_col(), (1, 6)); 525 } 526 527 #[test] line_of()528 fn line_of() { 529 let input = "a\rb\nc\r\nd嗨"; 530 531 assert_eq!(Position::new(input, 0).unwrap().line_of(), "a\rb\n"); 532 assert_eq!(Position::new(input, 1).unwrap().line_of(), "a\rb\n"); 533 assert_eq!(Position::new(input, 2).unwrap().line_of(), "a\rb\n"); 534 assert_eq!(Position::new(input, 3).unwrap().line_of(), "a\rb\n"); 535 assert_eq!(Position::new(input, 4).unwrap().line_of(), "c\r\n"); 536 assert_eq!(Position::new(input, 5).unwrap().line_of(), "c\r\n"); 537 assert_eq!(Position::new(input, 6).unwrap().line_of(), "c\r\n"); 538 assert_eq!(Position::new(input, 7).unwrap().line_of(), "d嗨"); 539 assert_eq!(Position::new(input, 8).unwrap().line_of(), "d嗨"); 540 assert_eq!(Position::new(input, 11).unwrap().line_of(), "d嗨"); 541 } 542 543 #[test] line_of_empty()544 fn line_of_empty() { 545 let input = ""; 546 547 assert_eq!(Position::new(input, 0).unwrap().line_of(), ""); 548 } 549 550 #[test] line_of_new_line()551 fn line_of_new_line() { 552 let input = "\n"; 553 554 assert_eq!(Position::new(input, 0).unwrap().line_of(), "\n"); 555 } 556 557 #[test] line_of_between_new_line()558 fn line_of_between_new_line() { 559 let input = "\n\n"; 560 561 assert_eq!(Position::new(input, 1).unwrap().line_of(), "\n"); 562 } 563 measure_skip(input: &str, pos: usize, n: usize) -> Option<usize>564 fn measure_skip(input: &str, pos: usize, n: usize) -> Option<usize> { 565 let mut p = Position::new(input, pos).unwrap(); 566 if p.skip(n) { 567 Some(p.pos - pos) 568 } else { 569 None 570 } 571 } 572 573 #[test] skip_empty()574 fn skip_empty() { 575 let input = ""; 576 577 assert_eq!(measure_skip(input, 0, 0), Some(0)); 578 assert_eq!(measure_skip(input, 0, 1), None); 579 } 580 581 #[test] skip()582 fn skip() { 583 let input = "d嗨"; 584 585 assert_eq!(measure_skip(input, 0, 0), Some(0)); 586 assert_eq!(measure_skip(input, 0, 1), Some(1)); 587 assert_eq!(measure_skip(input, 1, 1), Some(3)); 588 } 589 590 #[test] skip_until()591 fn skip_until() { 592 let input = "ab ac"; 593 let pos = Position::from_start(input); 594 595 let mut test_pos = pos; 596 test_pos.skip_until(&["a", "b"]); 597 assert_eq!(test_pos.pos(), 0); 598 599 test_pos = pos; 600 test_pos.skip_until(&["b"]); 601 assert_eq!(test_pos.pos(), 1); 602 603 test_pos = pos; 604 test_pos.skip_until(&["ab"]); 605 assert_eq!(test_pos.pos(), 0); 606 607 test_pos = pos; 608 test_pos.skip_until(&["ac", "z"]); 609 assert_eq!(test_pos.pos(), 3); 610 611 test_pos = pos; 612 assert!(!test_pos.skip_until(&["z"])); 613 assert_eq!(test_pos.pos(), 5); 614 } 615 616 #[test] match_range()617 fn match_range() { 618 let input = "b"; 619 620 assert!(Position::new(input, 0).unwrap().match_range('a'..'c')); 621 assert!(Position::new(input, 0).unwrap().match_range('b'..'b')); 622 assert!(!Position::new(input, 0).unwrap().match_range('a'..'a')); 623 assert!(!Position::new(input, 0).unwrap().match_range('c'..'c')); 624 assert!(Position::new(input, 0).unwrap().match_range('a'..'嗨')); 625 } 626 627 #[test] match_insensitive()628 fn match_insensitive() { 629 let input = "AsdASdF"; 630 631 assert!(Position::new(input, 0).unwrap().match_insensitive("asd")); 632 assert!(Position::new(input, 3).unwrap().match_insensitive("asdf")); 633 } 634 635 #[test] cmp()636 fn cmp() { 637 let input = "a"; 638 let start = Position::from_start(input); 639 let mut end = start; 640 641 assert!(end.skip(1)); 642 let result = start.cmp(&end); 643 644 assert_eq!(result, Ordering::Less); 645 } 646 647 #[test] 648 #[should_panic] cmp_panic()649 fn cmp_panic() { 650 let input1 = "a"; 651 let input2 = "b"; 652 let pos1 = Position::from_start(input1); 653 let pos2 = Position::from_start(input2); 654 655 let _ = pos1.cmp(&pos2); 656 } 657 658 #[test] 659 #[cfg(feature = "std")] hash()660 fn hash() { 661 use std::collections::HashSet; 662 663 let input = "a"; 664 let start = Position::from_start(input); 665 let mut positions = HashSet::new(); 666 667 positions.insert(start); 668 } 669 } 670