• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use std::fmt;
2 use std::iter::FusedIterator;
3 
4 /// Slot is a single saved capture location. Note that there are two slots for
5 /// every capture in a regular expression (one slot each for the start and end
6 /// of the capture).
7 pub type Slot = Option<usize>;
8 
9 /// Locations represents the offsets of each capturing group in a regex for
10 /// a single match.
11 ///
12 /// Unlike `Captures`, a `Locations` value only stores offsets.
13 #[doc(hidden)]
14 #[derive(Clone, Debug)]
15 pub struct Locations(Vec<Slot>);
16 
17 impl Locations {
18     /// Returns the start and end positions of the Nth capture group. Returns
19     /// `None` if `i` is not a valid capture group or if the capture group did
20     /// not match anything. The positions returned are *always* byte indices
21     /// with respect to the original string matched.
pos(&self, i: usize) -> Option<(usize, usize)>22     pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
23         let (s, e) = (i * 2, i * 2 + 1);
24         match (self.0.get(s), self.0.get(e)) {
25             (Some(&Some(s)), Some(&Some(e))) => Some((s, e)),
26             _ => None,
27         }
28     }
29 
30     /// Creates an iterator of all the capture group positions in order of
31     /// appearance in the regular expression. Positions are byte indices
32     /// in terms of the original string matched.
iter(&self) -> SubCapturesPosIter<'_>33     pub fn iter(&self) -> SubCapturesPosIter<'_> {
34         SubCapturesPosIter { idx: 0, locs: self }
35     }
36 
37     /// Returns the total number of capturing groups.
38     ///
39     /// This is always at least `1` since every regex has at least `1`
40     /// capturing group that corresponds to the entire match.
len(&self) -> usize41     pub fn len(&self) -> usize {
42         self.0.len() / 2
43     }
44 
45     /// Return the individual slots as a slice.
as_slots(&mut self) -> &mut [Slot]46     pub(crate) fn as_slots(&mut self) -> &mut [Slot] {
47         &mut self.0
48     }
49 }
50 
51 /// An iterator over capture group positions for a particular match of a
52 /// regular expression.
53 ///
54 /// Positions are byte indices in terms of the original string matched.
55 ///
56 /// `'c` is the lifetime of the captures.
57 #[derive(Clone, Debug)]
58 pub struct SubCapturesPosIter<'c> {
59     idx: usize,
60     locs: &'c Locations,
61 }
62 
63 impl<'c> Iterator for SubCapturesPosIter<'c> {
64     type Item = Option<(usize, usize)>;
65 
next(&mut self) -> Option<Option<(usize, usize)>>66     fn next(&mut self) -> Option<Option<(usize, usize)>> {
67         if self.idx >= self.locs.len() {
68             return None;
69         }
70         let x = match self.locs.pos(self.idx) {
71             None => Some(None),
72             Some((s, e)) => Some(Some((s, e))),
73         };
74         self.idx += 1;
75         x
76     }
77 
size_hint(&self) -> (usize, Option<usize>)78     fn size_hint(&self) -> (usize, Option<usize>) {
79         let len = self.locs.len() - self.idx;
80         (len, Some(len))
81     }
82 
count(self) -> usize83     fn count(self) -> usize {
84         self.len()
85     }
86 }
87 
88 impl<'c> ExactSizeIterator for SubCapturesPosIter<'c> {}
89 
90 impl<'c> FusedIterator for SubCapturesPosIter<'c> {}
91 
92 /// `RegularExpression` describes types that can implement regex searching.
93 ///
94 /// This trait is my attempt at reducing code duplication and to standardize
95 /// the internal API. Specific duplication that is avoided are the `find`
96 /// and `capture` iterators, which are slightly tricky.
97 ///
98 /// It's not clear whether this trait is worth it, and it also isn't
99 /// clear whether it's useful as a public trait or not. Methods like
100 /// `next_after_empty` reak of bad design, but the rest of the methods seem
101 /// somewhat reasonable. One particular thing this trait would expose would be
102 /// the ability to start the search of a regex anywhere in a haystack, which
103 /// isn't possible in the current public API.
104 pub trait RegularExpression: Sized + fmt::Debug {
105     /// The type of the haystack.
106     type Text: ?Sized + fmt::Debug;
107 
108     /// The number of capture slots in the compiled regular expression. This is
109     /// always two times the number of capture groups (two slots per group).
slots_len(&self) -> usize110     fn slots_len(&self) -> usize;
111 
112     /// Allocates fresh space for all capturing groups in this regex.
locations(&self) -> Locations113     fn locations(&self) -> Locations {
114         Locations(vec![None; self.slots_len()])
115     }
116 
117     /// Returns the position of the next character after `i`.
118     ///
119     /// For example, a haystack with type `&[u8]` probably returns `i+1`,
120     /// whereas a haystack with type `&str` probably returns `i` plus the
121     /// length of the next UTF-8 sequence.
next_after_empty(&self, text: &Self::Text, i: usize) -> usize122     fn next_after_empty(&self, text: &Self::Text, i: usize) -> usize;
123 
124     /// Returns the location of the shortest match.
shortest_match_at( &self, text: &Self::Text, start: usize, ) -> Option<usize>125     fn shortest_match_at(
126         &self,
127         text: &Self::Text,
128         start: usize,
129     ) -> Option<usize>;
130 
131     /// Returns whether the regex matches the text given.
is_match_at(&self, text: &Self::Text, start: usize) -> bool132     fn is_match_at(&self, text: &Self::Text, start: usize) -> bool;
133 
134     /// Returns the leftmost-first match location if one exists.
find_at( &self, text: &Self::Text, start: usize, ) -> Option<(usize, usize)>135     fn find_at(
136         &self,
137         text: &Self::Text,
138         start: usize,
139     ) -> Option<(usize, usize)>;
140 
141     /// Returns the leftmost-first match location if one exists, and also
142     /// fills in any matching capture slot locations.
captures_read_at( &self, locs: &mut Locations, text: &Self::Text, start: usize, ) -> Option<(usize, usize)>143     fn captures_read_at(
144         &self,
145         locs: &mut Locations,
146         text: &Self::Text,
147         start: usize,
148     ) -> Option<(usize, usize)>;
149 
150     /// Returns an iterator over all non-overlapping successive leftmost-first
151     /// matches.
find_iter(self, text: &Self::Text) -> Matches<'_, Self>152     fn find_iter(self, text: &Self::Text) -> Matches<'_, Self> {
153         Matches { re: self, text, last_end: 0, last_match: None }
154     }
155 
156     /// Returns an iterator over all non-overlapping successive leftmost-first
157     /// matches with captures.
captures_iter(self, text: &Self::Text) -> CaptureMatches<'_, Self>158     fn captures_iter(self, text: &Self::Text) -> CaptureMatches<'_, Self> {
159         CaptureMatches(self.find_iter(text))
160     }
161 }
162 
163 /// An iterator over all non-overlapping successive leftmost-first matches.
164 #[derive(Debug)]
165 pub struct Matches<'t, R>
166 where
167     R: RegularExpression,
168     R::Text: 't,
169 {
170     re: R,
171     text: &'t R::Text,
172     last_end: usize,
173     last_match: Option<usize>,
174 }
175 
176 impl<'t, R> Matches<'t, R>
177 where
178     R: RegularExpression,
179     R::Text: 't,
180 {
181     /// Return the text being searched.
text(&self) -> &'t R::Text182     pub fn text(&self) -> &'t R::Text {
183         self.text
184     }
185 
186     /// Return the underlying regex.
regex(&self) -> &R187     pub fn regex(&self) -> &R {
188         &self.re
189     }
190 }
191 
192 impl<'t, R> Iterator for Matches<'t, R>
193 where
194     R: RegularExpression,
195     R::Text: 't + AsRef<[u8]>,
196 {
197     type Item = (usize, usize);
198 
next(&mut self) -> Option<(usize, usize)>199     fn next(&mut self) -> Option<(usize, usize)> {
200         if self.last_end > self.text.as_ref().len() {
201             return None;
202         }
203         let (s, e) = match self.re.find_at(self.text, self.last_end) {
204             None => return None,
205             Some((s, e)) => (s, e),
206         };
207         if s == e {
208             // This is an empty match. To ensure we make progress, start
209             // the next search at the smallest possible starting position
210             // of the next match following this one.
211             self.last_end = self.re.next_after_empty(self.text, e);
212             // Don't accept empty matches immediately following a match.
213             // Just move on to the next match.
214             if Some(e) == self.last_match {
215                 return self.next();
216             }
217         } else {
218             self.last_end = e;
219         }
220         self.last_match = Some(e);
221         Some((s, e))
222     }
223 }
224 
225 impl<'t, R> FusedIterator for Matches<'t, R>
226 where
227     R: RegularExpression,
228     R::Text: 't + AsRef<[u8]>,
229 {
230 }
231 
232 /// An iterator over all non-overlapping successive leftmost-first matches with
233 /// captures.
234 #[derive(Debug)]
235 pub struct CaptureMatches<'t, R>(Matches<'t, R>)
236 where
237     R: RegularExpression,
238     R::Text: 't;
239 
240 impl<'t, R> CaptureMatches<'t, R>
241 where
242     R: RegularExpression,
243     R::Text: 't,
244 {
245     /// Return the text being searched.
text(&self) -> &'t R::Text246     pub fn text(&self) -> &'t R::Text {
247         self.0.text()
248     }
249 
250     /// Return the underlying regex.
regex(&self) -> &R251     pub fn regex(&self) -> &R {
252         self.0.regex()
253     }
254 }
255 
256 impl<'t, R> Iterator for CaptureMatches<'t, R>
257 where
258     R: RegularExpression,
259     R::Text: 't + AsRef<[u8]>,
260 {
261     type Item = Locations;
262 
next(&mut self) -> Option<Locations>263     fn next(&mut self) -> Option<Locations> {
264         if self.0.last_end > self.0.text.as_ref().len() {
265             return None;
266         }
267         let mut locs = self.0.re.locations();
268         let (s, e) = match self.0.re.captures_read_at(
269             &mut locs,
270             self.0.text,
271             self.0.last_end,
272         ) {
273             None => return None,
274             Some((s, e)) => (s, e),
275         };
276         if s == e {
277             self.0.last_end = self.0.re.next_after_empty(self.0.text, e);
278             if Some(e) == self.0.last_match {
279                 return self.next();
280             }
281         } else {
282             self.0.last_end = e;
283         }
284         self.0.last_match = Some(e);
285         Some(locs)
286     }
287 }
288 
289 impl<'t, R> FusedIterator for CaptureMatches<'t, R>
290 where
291     R: RegularExpression,
292     R::Text: 't + AsRef<[u8]>,
293 {
294 }
295