• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // pest. The Elegant Parser
2 // Copyright (c) 2018 Dragoș Tiselice
3 //
4 // Licensed under the Apache License, Version 2.0
5 // <LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0> or the MIT
6 // license <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. All files in the project carrying such notice may not be copied,
8 // modified, or distributed except according to those terms.
9 
10 use core::cmp::Ordering;
11 use core::fmt;
12 use core::hash::{Hash, Hasher};
13 use core::ops::Range;
14 use core::ptr;
15 use core::str;
16 
17 use crate::span;
18 
19 /// A cursor position in a `&str` which provides useful methods to manually parse that string.
20 #[derive(Clone, Copy)]
21 pub struct Position<'i> {
22     input: &'i str,
23     /// # Safety:
24     ///
25     /// `input[pos..]` must be a valid codepoint boundary (should not panic when indexing thus).
26     pos: usize,
27 }
28 
29 impl<'i> Position<'i> {
30     /// Create a new `Position` without checking invariants. (Checked with `debug_assertions`.)
31     ///
32     /// # Safety:
33     ///
34     /// `input[pos..]` must be a valid codepoint boundary (should not panic when indexing thus).
new_unchecked(input: &str, pos: usize) -> Position<'_>35     pub(crate) unsafe fn new_unchecked(input: &str, pos: usize) -> Position<'_> {
36         debug_assert!(input.get(pos..).is_some());
37         Position { input, pos }
38     }
39 
40     /// Attempts to create a new `Position` at the given position. If the specified position is
41     /// an invalid index, or the specified position is not a valid UTF8 boundary, then None is
42     /// returned.
43     ///
44     /// # Examples
45     /// ```
46     /// # use pest::Position;
47     /// let cheart = '��';
48     /// let heart = "��";
49     /// assert_eq!(Position::new(heart, 1), None);
50     /// assert_ne!(Position::new(heart, cheart.len_utf8()), None);
51     /// ```
new(input: &str, pos: usize) -> Option<Position<'_>>52     pub fn new(input: &str, pos: usize) -> Option<Position<'_>> {
53         input.get(pos..).map(|_| Position { input, pos })
54     }
55 
56     /// Creates a `Position` at the start of a `&str`.
57     ///
58     /// # Examples
59     ///
60     /// ```
61     /// # use pest::Position;
62     /// let start = Position::from_start("");
63     /// assert_eq!(start.pos(), 0);
64     /// ```
65     #[inline]
from_start(input: &'i str) -> Position<'i>66     pub fn from_start(input: &'i str) -> Position<'i> {
67         // Position 0 is always safe because it's always a valid UTF-8 border.
68         Position { input, pos: 0 }
69     }
70 
71     /// Returns the byte position of this `Position` as a `usize`.
72     ///
73     /// # Examples
74     ///
75     /// ```
76     /// # use pest::Position;
77     /// let input = "ab";
78     /// let mut start = Position::from_start(input);
79     ///
80     /// assert_eq!(start.pos(), 0);
81     /// ```
82     #[inline]
pos(&self) -> usize83     pub fn pos(&self) -> usize {
84         self.pos
85     }
86 
87     /// Creates a `Span` from two `Position`s.
88     ///
89     /// # Panics
90     ///
91     /// Panics if the positions come from different inputs.
92     ///
93     /// # Examples
94     ///
95     /// ```
96     /// # use pest::Position;
97     /// let input = "ab";
98     /// let start = Position::from_start(input);
99     /// let span = start.span(&start.clone());
100     ///
101     /// assert_eq!(span.start(), 0);
102     /// assert_eq!(span.end(), 0);
103     /// ```
104     #[inline]
span(&self, other: &Position<'i>) -> span::Span<'i>105     pub fn span(&self, other: &Position<'i>) -> span::Span<'i> {
106         if ptr::eq(self.input, other.input)
107         /* && self.input.get(self.pos..other.pos).is_some() */
108         {
109             // This is safe because the pos field of a Position should always be a valid str index.
110             unsafe { span::Span::new_unchecked(self.input, self.pos, other.pos) }
111         } else {
112             // TODO: maybe a panic if self.pos < other.pos
113             panic!("span created from positions from different inputs")
114         }
115     }
116 
117     /// Returns the line and column number of this `Position`.
118     ///
119     /// This is an O(n) operation, where n is the number of chars in the input.
120     /// You better use [`pair.line_col()`](struct.Pair.html#method.line_col) instead.
121     ///
122     /// # Examples
123     ///
124     /// ```
125     /// # use pest;
126     /// # #[allow(non_camel_case_types)]
127     /// # #[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
128     /// enum Rule {}
129     ///
130     /// let input = "\na";
131     /// let mut state: Box<pest::ParserState<'_, Rule>> = pest::ParserState::new(input);
132     /// let mut result = state.match_string("\na");
133     /// assert!(result.is_ok());
134     /// assert_eq!(result.unwrap().position().line_col(), (2, 2));
135     /// ```
136     #[inline]
line_col(&self) -> (usize, usize)137     pub fn line_col(&self) -> (usize, usize) {
138         if self.pos > self.input.len() {
139             panic!("position out of bounds");
140         }
141         let mut pos = self.pos;
142         let slice = &self.input[..pos];
143         let mut chars = slice.chars().peekable();
144 
145         let mut line_col = (1, 1);
146 
147         while pos != 0 {
148             match chars.next() {
149                 Some('\r') => {
150                     if let Some(&'\n') = chars.peek() {
151                         chars.next();
152 
153                         if pos == 1 {
154                             pos -= 1;
155                         } else {
156                             pos -= 2;
157                         }
158 
159                         line_col = (line_col.0 + 1, 1);
160                     } else {
161                         pos -= 1;
162                         line_col = (line_col.0, line_col.1 + 1);
163                     }
164                 }
165                 Some('\n') => {
166                     pos -= 1;
167                     line_col = (line_col.0 + 1, 1);
168                 }
169                 Some(c) => {
170                     pos -= c.len_utf8();
171                     line_col = (line_col.0, line_col.1 + 1);
172                 }
173                 None => unreachable!(),
174             }
175         }
176 
177         line_col
178     }
179 
180     /// Returns the entire line of the input that contains this `Position`.
181     ///
182     /// # Examples
183     ///
184     /// ```
185     /// # use pest;
186     /// # #[allow(non_camel_case_types)]
187     /// # #[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
188     /// enum Rule {}
189     ///
190     /// let input = "\na";
191     /// let mut state: Box<pest::ParserState<'_, Rule>> = pest::ParserState::new(input);
192     /// let mut result = state.match_string("\na");
193     /// assert!(result.is_ok());
194     /// assert_eq!(result.unwrap().position().line_of(), "a");
195     /// ```
196     #[inline]
line_of(&self) -> &'i str197     pub fn line_of(&self) -> &'i str {
198         if self.pos > self.input.len() {
199             panic!("position out of bounds");
200         };
201         // Safe since start and end can only be valid UTF-8 borders.
202         &self.input[self.find_line_start()..self.find_line_end()]
203     }
204 
find_line_start(&self) -> usize205     pub(crate) fn find_line_start(&self) -> usize {
206         if self.input.is_empty() {
207             return 0;
208         };
209         // Position's pos is always a UTF-8 border.
210         let start = self
211             .input
212             .char_indices()
213             .rev()
214             .skip_while(|&(i, _)| i >= self.pos)
215             .find(|&(_, c)| c == '\n');
216         match start {
217             Some((i, _)) => i + 1,
218             None => 0,
219         }
220     }
221 
find_line_end(&self) -> usize222     pub(crate) fn find_line_end(&self) -> usize {
223         if self.input.is_empty() {
224             0
225         } else if self.pos == self.input.len() - 1 {
226             self.input.len()
227         } else {
228             // Position's pos is always a UTF-8 border.
229             let end = self
230                 .input
231                 .char_indices()
232                 .skip_while(|&(i, _)| i < self.pos)
233                 .find(|&(_, c)| c == '\n');
234             match end {
235                 Some((i, _)) => i + 1,
236                 None => self.input.len(),
237             }
238         }
239     }
240 
241     /// Returns `true` when the `Position` points to the start of the input `&str`.
242     #[inline]
at_start(&self) -> bool243     pub(crate) fn at_start(&self) -> bool {
244         self.pos == 0
245     }
246 
247     /// Returns `true` when the `Position` points to the end of the input `&str`.
248     #[inline]
at_end(&self) -> bool249     pub(crate) fn at_end(&self) -> bool {
250         self.pos == self.input.len()
251     }
252 
253     /// Skips `n` `char`s from the `Position` and returns `true` if the skip was possible or `false`
254     /// otherwise. If the return value is `false`, `pos` will not be updated.
255     #[inline]
skip(&mut self, n: usize) -> bool256     pub(crate) fn skip(&mut self, n: usize) -> bool {
257         let skipped = {
258             let mut len = 0;
259             // Position's pos is always a UTF-8 border.
260             let mut chars = self.input[self.pos..].chars();
261             for _ in 0..n {
262                 if let Some(c) = chars.next() {
263                     len += c.len_utf8();
264                 } else {
265                     return false;
266                 }
267             }
268             len
269         };
270 
271         self.pos += skipped;
272         true
273     }
274 
275     /// Goes back `n` `char`s from the `Position` and returns `true` if the skip was possible or `false`
276     /// otherwise. If the return value is `false`, `pos` will not be updated.
277     #[inline]
skip_back(&mut self, n: usize) -> bool278     pub(crate) fn skip_back(&mut self, n: usize) -> bool {
279         let skipped = {
280             let mut len = 0;
281             // Position's pos is always a UTF-8 border.
282             let mut chars = self.input[..self.pos].chars().rev();
283             for _ in 0..n {
284                 if let Some(c) = chars.next() {
285                     len += c.len_utf8();
286                 } else {
287                     return false;
288                 }
289             }
290             len
291         };
292 
293         self.pos -= skipped;
294         true
295     }
296 
297     /// Skips until one of the given `strings` is found. If none of the `strings` can be found,
298     /// this function will return `false` but its `pos` will *still* be updated.
299     #[inline]
skip_until(&mut self, strings: &[&str]) -> bool300     pub(crate) fn skip_until(&mut self, strings: &[&str]) -> bool {
301         #[cfg(not(feature = "memchr"))]
302         {
303             self.skip_until_basic(strings)
304         }
305         #[cfg(feature = "memchr")]
306         {
307             match strings {
308                 [] => (),
309                 [s1] => {
310                     if let Some(from) =
311                         memchr::memmem::find(&self.input.as_bytes()[self.pos..], s1.as_bytes())
312                     {
313                         self.pos += from;
314                         return true;
315                     }
316                 }
317                 [s1, s2] if !s1.is_empty() && !s2.is_empty() => {
318                     let b1 = s1.as_bytes()[0];
319                     let b2 = s2.as_bytes()[0];
320                     let miter = memchr::memchr2_iter(b1, b2, &self.input.as_bytes()[self.pos..]);
321                     for from in miter {
322                         let start = &self.input[self.pos + from..];
323                         if start.starts_with(s1) || start.starts_with(s2) {
324                             self.pos += from;
325                             return true;
326                         }
327                     }
328                 }
329                 [s1, s2, s3] if !s1.is_empty() && !s2.is_empty() && s3.is_empty() => {
330                     let b1 = s1.as_bytes()[0];
331                     let b2 = s2.as_bytes()[0];
332                     let b3 = s2.as_bytes()[0];
333                     let miter =
334                         memchr::memchr3_iter(b1, b2, b3, &self.input.as_bytes()[self.pos..]);
335                     for from in miter {
336                         let start = &self.input[self.pos + from..];
337                         if start.starts_with(s1) || start.starts_with(s2) || start.starts_with(s3) {
338                             self.pos += from;
339                             return true;
340                         }
341                     }
342                 }
343                 _ => {
344                     return self.skip_until_basic(strings);
345                 }
346             }
347             self.pos = self.input.len();
348             false
349         }
350     }
351 
352     #[inline]
skip_until_basic(&mut self, strings: &[&str]) -> bool353     fn skip_until_basic(&mut self, strings: &[&str]) -> bool {
354         // TODO: optimize with Aho-Corasick, e.g. https://crates.io/crates/daachorse?
355         for from in self.pos..self.input.len() {
356             let bytes = if let Some(string) = self.input.get(from..) {
357                 string.as_bytes()
358             } else {
359                 continue;
360             };
361 
362             for slice in strings.iter() {
363                 let to = slice.len();
364                 if Some(slice.as_bytes()) == bytes.get(0..to) {
365                     self.pos = from;
366                     return true;
367                 }
368             }
369         }
370 
371         self.pos = self.input.len();
372         false
373     }
374 
375     /// Matches the char at the `Position` against a specified character and returns `true` if a match
376     /// was made. If no match was made, returns `false`.
377     /// `pos` will not be updated in either case.
378     #[inline]
match_char(&self, c: char) -> bool379     pub(crate) fn match_char(&self, c: char) -> bool {
380         matches!(self.input[self.pos..].chars().next(), Some(cc) if c == cc)
381     }
382 
383     /// Matches the char at the `Position` against a filter function and returns `true` if a match
384     /// was made. If no match was made, returns `false` and `pos` will not be updated.
385     #[inline]
match_char_by<F>(&mut self, f: F) -> bool where F: FnOnce(char) -> bool,386     pub(crate) fn match_char_by<F>(&mut self, f: F) -> bool
387     where
388         F: FnOnce(char) -> bool,
389     {
390         if let Some(c) = self.input[self.pos..].chars().next() {
391             if f(c) {
392                 self.pos += c.len_utf8();
393                 true
394             } else {
395                 false
396             }
397         } else {
398             false
399         }
400     }
401 
402     /// Matches `string` from the `Position` and returns `true` if a match was made or `false`
403     /// otherwise. If no match was made, `pos` will not be updated.
404     #[inline]
match_string(&mut self, string: &str) -> bool405     pub(crate) fn match_string(&mut self, string: &str) -> bool {
406         let to = self.pos + string.len();
407 
408         if Some(string.as_bytes()) == self.input.as_bytes().get(self.pos..to) {
409             self.pos = to;
410             true
411         } else {
412             false
413         }
414     }
415 
416     /// Case-insensitively matches `string` from the `Position` and returns `true` if a match was
417     /// made or `false` otherwise. If no match was made, `pos` will not be updated.
418     #[inline]
match_insensitive(&mut self, string: &str) -> bool419     pub(crate) fn match_insensitive(&mut self, string: &str) -> bool {
420         let matched = {
421             let slice = &self.input[self.pos..];
422             if let Some(slice) = slice.get(0..string.len()) {
423                 slice.eq_ignore_ascii_case(string)
424             } else {
425                 false
426             }
427         };
428 
429         if matched {
430             self.pos += string.len();
431             true
432         } else {
433             false
434         }
435     }
436 
437     /// Matches `char` `range` from the `Position` and returns `true` if a match was made or `false`
438     /// otherwise. If no match was made, `pos` will not be updated.
439     #[inline]
match_range(&mut self, range: Range<char>) -> bool440     pub(crate) fn match_range(&mut self, range: Range<char>) -> bool {
441         if let Some(c) = self.input[self.pos..].chars().next() {
442             if range.start <= c && c <= range.end {
443                 self.pos += c.len_utf8();
444                 return true;
445             }
446         }
447 
448         false
449     }
450 }
451 
452 impl<'i> fmt::Debug for Position<'i> {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result453     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
454         f.debug_struct("Position").field("pos", &self.pos).finish()
455     }
456 }
457 
458 impl<'i> PartialEq for Position<'i> {
eq(&self, other: &Position<'i>) -> bool459     fn eq(&self, other: &Position<'i>) -> bool {
460         ptr::eq(self.input, other.input) && self.pos == other.pos
461     }
462 }
463 
464 impl<'i> Eq for Position<'i> {}
465 
466 impl<'i> PartialOrd for Position<'i> {
partial_cmp(&self, other: &Position<'i>) -> Option<Ordering>467     fn partial_cmp(&self, other: &Position<'i>) -> Option<Ordering> {
468         if ptr::eq(self.input, other.input) {
469             self.pos.partial_cmp(&other.pos)
470         } else {
471             None
472         }
473     }
474 }
475 
476 impl<'i> Ord for Position<'i> {
cmp(&self, other: &Position<'i>) -> Ordering477     fn cmp(&self, other: &Position<'i>) -> Ordering {
478         self.partial_cmp(other)
479             .expect("cannot compare positions from different strs")
480     }
481 }
482 
483 impl<'i> Hash for Position<'i> {
hash<H: Hasher>(&self, state: &mut H)484     fn hash<H: Hasher>(&self, state: &mut H) {
485         (self.input as *const str).hash(state);
486         self.pos.hash(state);
487     }
488 }
489 
490 #[cfg(test)]
491 mod tests {
492     use super::*;
493 
494     #[test]
empty()495     fn empty() {
496         let input = "";
497         assert!(Position::new(input, 0).unwrap().match_string(""));
498         assert!(!Position::new(input, 0).unwrap().match_string("a"));
499     }
500 
501     #[test]
parts()502     fn parts() {
503         let input = "asdasdf";
504 
505         assert!(Position::new(input, 0).unwrap().match_string("asd"));
506         assert!(Position::new(input, 3).unwrap().match_string("asdf"));
507     }
508 
509     #[test]
line_col()510     fn line_col() {
511         let input = "a\rb\nc\r\nd嗨";
512 
513         assert_eq!(Position::new(input, 0).unwrap().line_col(), (1, 1));
514         assert_eq!(Position::new(input, 1).unwrap().line_col(), (1, 2));
515         assert_eq!(Position::new(input, 2).unwrap().line_col(), (1, 3));
516         assert_eq!(Position::new(input, 3).unwrap().line_col(), (1, 4));
517         assert_eq!(Position::new(input, 4).unwrap().line_col(), (2, 1));
518         assert_eq!(Position::new(input, 5).unwrap().line_col(), (2, 2));
519         assert_eq!(Position::new(input, 6).unwrap().line_col(), (2, 3));
520         assert_eq!(Position::new(input, 7).unwrap().line_col(), (3, 1));
521         assert_eq!(Position::new(input, 8).unwrap().line_col(), (3, 2));
522         assert_eq!(Position::new(input, 11).unwrap().line_col(), (3, 3));
523         let input = "abcd嗨";
524         assert_eq!(Position::new(input, 7).unwrap().line_col(), (1, 6));
525     }
526 
527     #[test]
line_of()528     fn line_of() {
529         let input = "a\rb\nc\r\nd嗨";
530 
531         assert_eq!(Position::new(input, 0).unwrap().line_of(), "a\rb\n");
532         assert_eq!(Position::new(input, 1).unwrap().line_of(), "a\rb\n");
533         assert_eq!(Position::new(input, 2).unwrap().line_of(), "a\rb\n");
534         assert_eq!(Position::new(input, 3).unwrap().line_of(), "a\rb\n");
535         assert_eq!(Position::new(input, 4).unwrap().line_of(), "c\r\n");
536         assert_eq!(Position::new(input, 5).unwrap().line_of(), "c\r\n");
537         assert_eq!(Position::new(input, 6).unwrap().line_of(), "c\r\n");
538         assert_eq!(Position::new(input, 7).unwrap().line_of(), "d嗨");
539         assert_eq!(Position::new(input, 8).unwrap().line_of(), "d嗨");
540         assert_eq!(Position::new(input, 11).unwrap().line_of(), "d嗨");
541     }
542 
543     #[test]
line_of_empty()544     fn line_of_empty() {
545         let input = "";
546 
547         assert_eq!(Position::new(input, 0).unwrap().line_of(), "");
548     }
549 
550     #[test]
line_of_new_line()551     fn line_of_new_line() {
552         let input = "\n";
553 
554         assert_eq!(Position::new(input, 0).unwrap().line_of(), "\n");
555     }
556 
557     #[test]
line_of_between_new_line()558     fn line_of_between_new_line() {
559         let input = "\n\n";
560 
561         assert_eq!(Position::new(input, 1).unwrap().line_of(), "\n");
562     }
563 
measure_skip(input: &str, pos: usize, n: usize) -> Option<usize>564     fn measure_skip(input: &str, pos: usize, n: usize) -> Option<usize> {
565         let mut p = Position::new(input, pos).unwrap();
566         if p.skip(n) {
567             Some(p.pos - pos)
568         } else {
569             None
570         }
571     }
572 
573     #[test]
skip_empty()574     fn skip_empty() {
575         let input = "";
576 
577         assert_eq!(measure_skip(input, 0, 0), Some(0));
578         assert_eq!(measure_skip(input, 0, 1), None);
579     }
580 
581     #[test]
skip()582     fn skip() {
583         let input = "d嗨";
584 
585         assert_eq!(measure_skip(input, 0, 0), Some(0));
586         assert_eq!(measure_skip(input, 0, 1), Some(1));
587         assert_eq!(measure_skip(input, 1, 1), Some(3));
588     }
589 
590     #[test]
skip_until()591     fn skip_until() {
592         let input = "ab ac";
593         let pos = Position::from_start(input);
594 
595         let mut test_pos = pos;
596         test_pos.skip_until(&["a", "b"]);
597         assert_eq!(test_pos.pos(), 0);
598 
599         test_pos = pos;
600         test_pos.skip_until(&["b"]);
601         assert_eq!(test_pos.pos(), 1);
602 
603         test_pos = pos;
604         test_pos.skip_until(&["ab"]);
605         assert_eq!(test_pos.pos(), 0);
606 
607         test_pos = pos;
608         test_pos.skip_until(&["ac", "z"]);
609         assert_eq!(test_pos.pos(), 3);
610 
611         test_pos = pos;
612         assert!(!test_pos.skip_until(&["z"]));
613         assert_eq!(test_pos.pos(), 5);
614     }
615 
616     #[test]
match_range()617     fn match_range() {
618         let input = "b";
619 
620         assert!(Position::new(input, 0).unwrap().match_range('a'..'c'));
621         assert!(Position::new(input, 0).unwrap().match_range('b'..'b'));
622         assert!(!Position::new(input, 0).unwrap().match_range('a'..'a'));
623         assert!(!Position::new(input, 0).unwrap().match_range('c'..'c'));
624         assert!(Position::new(input, 0).unwrap().match_range('a'..'嗨'));
625     }
626 
627     #[test]
match_insensitive()628     fn match_insensitive() {
629         let input = "AsdASdF";
630 
631         assert!(Position::new(input, 0).unwrap().match_insensitive("asd"));
632         assert!(Position::new(input, 3).unwrap().match_insensitive("asdf"));
633     }
634 
635     #[test]
cmp()636     fn cmp() {
637         let input = "a";
638         let start = Position::from_start(input);
639         let mut end = start;
640 
641         assert!(end.skip(1));
642         let result = start.cmp(&end);
643 
644         assert_eq!(result, Ordering::Less);
645     }
646 
647     #[test]
648     #[should_panic]
cmp_panic()649     fn cmp_panic() {
650         let input1 = "a";
651         let input2 = "b";
652         let pos1 = Position::from_start(input1);
653         let pos2 = Position::from_start(input2);
654 
655         let _ = pos1.cmp(&pos2);
656     }
657 
658     #[test]
659     #[cfg(feature = "std")]
hash()660     fn hash() {
661         use std::collections::HashSet;
662 
663         let input = "a";
664         let start = Position::from_start(input);
665         let mut positions = HashSet::new();
666 
667         positions.insert(start);
668     }
669 }
670