• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 macro_rules! define_set {
2     ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr,
3      $(#[$doc_regexset_example:meta])* ) => {
4         pub mod $name {
5             use std::fmt;
6             use std::iter;
7             use std::slice;
8             use std::vec;
9 
10             use crate::error::Error;
11             use crate::exec::Exec;
12             use crate::re_builder::$builder_mod::RegexSetBuilder;
13             use crate::re_trait::RegularExpression;
14 
/// Match multiple (possibly overlapping) regular expressions in a single scan.
///
/// A regex set corresponds to the union of two or more regular expressions.
/// That is, a regex set will match text where at least one of its
/// constituent regular expressions matches. A regex set as it is formulated
/// here provides a touch more power: it will also report *which* regular
/// expressions in the set match. Indeed, this is the key difference between
/// regex sets and a single `Regex` with many alternates, since only one
/// alternate can match at a time.
///
/// For example, consider regular expressions to match email addresses and
/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a
/// regex set is constructed from those regexes, then searching the text
/// `foo@example.com` will report both regexes as matching. Of course, one
/// could accomplish this by compiling each regex on its own and doing two
/// searches over the text. The key advantage of using a regex set is that it
/// will report the matching regexes using a *single pass through the text*.
/// If one has hundreds or thousands of regexes to match repeatedly (like a URL
/// router for a complex web application or a user agent matcher), then a regex
/// set can realize huge performance gains.
///
/// # Example
///
/// This shows how the above two regexes (for matching email addresses and
/// domains) might work:
///
$(#[$doc_regexset_example])*
///
/// Note that it would be possible to adapt the above example to using `Regex`
/// with an expression like:
///
/// ```text
/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net))
/// ```
///
/// After a match, one could then inspect the capture groups to figure out
/// which alternates matched. The problem is that it is hard to make this
/// approach scale when there are many regexes since the overlap between each
/// alternate isn't always obvious to reason about.
///
/// # Limitations
///
/// Regex sets are limited to answering the following two questions:
///
/// 1. Does any regex in the set match?
/// 2. If so, which regexes in the set match?
///
/// As with the main [`Regex`][crate::Regex] type, it is cheaper to ask (1)
/// instead of (2) since the matching engines can stop after the first match
/// is found.
///
/// You cannot directly extract [`Match`][crate::Match] or
/// [`Captures`][crate::Captures] objects from a regex set. If you need these
/// operations, the recommended approach is to compile each pattern in the set
/// independently and scan the exact same input a second time with those
/// independently compiled patterns:
///
/// ```rust
/// use regex::{Regex, RegexSet};
///
/// let patterns = ["foo", "bar"];
/// // Both patterns will match different ranges of this string.
/// let text = "barfoo";
///
/// // Compile a set matching any of our patterns.
/// let set = RegexSet::new(&patterns).unwrap();
/// // Compile each pattern independently.
/// let regexes: Vec<_> = set.patterns().iter()
///     .map(|pat| Regex::new(pat).unwrap())
///     .collect();
///
/// // Match against the whole set first and identify the individual
/// // matching patterns.
/// let matches: Vec<&str> = set.matches(text).into_iter()
///     // Dereference the match index to get the corresponding
///     // compiled pattern.
///     .map(|match_idx| &regexes[match_idx])
///     // To get match locations or any other info, we then have to search
///     // the exact same text again, using our separately-compiled pattern.
///     .map(|pat| pat.find(text).unwrap().as_str())
///     .collect();
///
/// // Matches arrive in the order the constituent patterns were declared,
/// // not the order they appear in the input.
/// assert_eq!(vec!["foo", "bar"], matches);
/// ```
///
/// # Performance
///
/// A `RegexSet` has the same performance characteristics as `Regex`. Namely,
/// search takes `O(mn)` time, where `m` is proportional to the size of the
/// regex set and `n` is proportional to the length of the search text.
#[derive(Clone)]
pub struct RegexSet(Exec);
109 
110 impl RegexSet {
111     /// Create a new regex set with the given regular expressions.
112     ///
113     /// This takes an iterator of `S`, where `S` is something that can produce
114     /// a `&str`. If any of the strings in the iterator are not valid regular
115     /// expressions, then an error is returned.
116     ///
117     /// # Example
118     ///
119     /// Create a new regex set from an iterator of strings:
120     ///
121     /// ```rust
122     /// # use regex::RegexSet;
123     /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
124     /// assert!(set.is_match("foo"));
125     /// ```
126     pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error>
127             where S: AsRef<str>, I: IntoIterator<Item=S> {
128         RegexSetBuilder::new(exprs).build()
129     }
130 
131     /// Create a new empty regex set.
132     ///
133     /// # Example
134     ///
135     /// ```rust
136     /// # use regex::RegexSet;
137     /// let set = RegexSet::empty();
138     /// assert!(set.is_empty());
139     /// ```
140     pub fn empty() -> RegexSet {
141         RegexSetBuilder::new(&[""; 0]).build().unwrap()
142     }
143 
144     /// Returns true if and only if one of the regexes in this set matches
145     /// the text given.
146     ///
147     /// This method should be preferred if you only need to test whether any
148     /// of the regexes in the set should match, but don't care about *which*
149     /// regexes matched. This is because the underlying matching engine will
150     /// quit immediately after seeing the first match instead of continuing to
151     /// find all matches.
152     ///
153     /// Note that as with searches using `Regex`, the expression is unanchored
154     /// by default. That is, if the regex does not start with `^` or `\A`, or
155     /// end with `$` or `\z`, then it is permitted to match anywhere in the
156     /// text.
157     ///
158     /// # Example
159     ///
160     /// Tests whether a set matches some text:
161     ///
162     /// ```rust
163     /// # use regex::RegexSet;
164     /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
165     /// assert!(set.is_match("foo"));
166     /// assert!(!set.is_match("☃"));
167     /// ```
168     pub fn is_match(&self, text: $text_ty) -> bool {
169         self.is_match_at(text, 0)
170     }
171 
172     /// Returns the same as is_match, but starts the search at the given
173     /// offset.
174     ///
175     /// The significance of the starting point is that it takes the surrounding
176     /// context into consideration. For example, the `\A` anchor can only
177     /// match when `start == 0`.
178     #[doc(hidden)]
179     pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool {
180         self.0.searcher().is_match_at($as_bytes(text), start)
181     }
182 
183     /// Returns the set of regular expressions that match in the given text.
184     ///
185     /// The set returned contains the index of each regular expression that
186     /// matches in the given text. The index is in correspondence with the
187     /// order of regular expressions given to `RegexSet`'s constructor.
188     ///
189     /// The set can also be used to iterate over the matched indices.
190     ///
191     /// Note that as with searches using `Regex`, the expression is unanchored
192     /// by default. That is, if the regex does not start with `^` or `\A`, or
193     /// end with `$` or `\z`, then it is permitted to match anywhere in the
194     /// text.
195     ///
196     /// # Example
197     ///
198     /// Tests which regular expressions match the given text:
199     ///
200     /// ```rust
201     /// # use regex::RegexSet;
202     /// let set = RegexSet::new(&[
203     ///     r"\w+",
204     ///     r"\d+",
205     ///     r"\pL+",
206     ///     r"foo",
207     ///     r"bar",
208     ///     r"barfoo",
209     ///     r"foobar",
210     /// ]).unwrap();
211     /// let matches: Vec<_> = set.matches("foobar").into_iter().collect();
212     /// assert_eq!(matches, vec![0, 2, 3, 4, 6]);
213     ///
214     /// // You can also test whether a particular regex matched:
215     /// let matches = set.matches("foobar");
216     /// assert!(!matches.matched(5));
217     /// assert!(matches.matched(6));
218     /// ```
219     pub fn matches(&self, text: $text_ty) -> SetMatches {
220         let mut matches = vec![false; self.0.regex_strings().len()];
221         let any = self.read_matches_at(&mut matches, text, 0);
222         SetMatches {
223             matched_any: any,
224             matches: matches,
225         }
226     }
227 
228     /// Returns the same as matches, but starts the search at the given
229     /// offset and stores the matches into the slice given.
230     ///
231     /// The significance of the starting point is that it takes the surrounding
232     /// context into consideration. For example, the `\A` anchor can only
233     /// match when `start == 0`.
234     ///
235     /// `matches` must have a length that is at least the number of regexes
236     /// in this set.
237     ///
238     /// This method returns true if and only if at least one member of
239     /// `matches` is true after executing the set against `text`.
240     #[doc(hidden)]
241     pub fn read_matches_at(
242         &self,
243         matches: &mut [bool],
244         text: $text_ty,
245         start: usize,
246     ) -> bool {
247         self.0.searcher().many_matches_at(matches, $as_bytes(text), start)
248     }
249 
250     /// Returns the total number of regular expressions in this set.
251     pub fn len(&self) -> usize {
252         self.0.regex_strings().len()
253     }
254 
255     /// Returns `true` if this set contains no regular expressions.
256     pub fn is_empty(&self) -> bool {
257         self.0.regex_strings().is_empty()
258     }
259 
260     /// Returns the patterns that this set will match on.
261     ///
262     /// This function can be used to determine the pattern for a match. The
263     /// slice returned has exactly as many patterns givens to this regex set,
264     /// and the order of the slice is the same as the order of the patterns
265     /// provided to the set.
266     ///
267     /// # Example
268     ///
269     /// ```rust
270     /// # use regex::RegexSet;
271     /// let set = RegexSet::new(&[
272     ///     r"\w+",
273     ///     r"\d+",
274     ///     r"\pL+",
275     ///     r"foo",
276     ///     r"bar",
277     ///     r"barfoo",
278     ///     r"foobar",
279     /// ]).unwrap();
280     /// let matches: Vec<_> = set
281     ///     .matches("foobar")
282     ///     .into_iter()
283     ///     .map(|match_idx| &set.patterns()[match_idx])
284     ///     .collect();
285     /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]);
286     /// ```
287     pub fn patterns(&self) -> &[String] {
288         self.0.regex_strings()
289     }
290 }
291 
292 /// A set of matches returned by a regex set.
293 #[derive(Clone, Debug)]
294 pub struct SetMatches {
295     matched_any: bool,
296     matches: Vec<bool>,
297 }
298 
299 impl SetMatches {
300     /// Whether this set contains any matches.
301     pub fn matched_any(&self) -> bool {
302         self.matched_any
303     }
304 
305     /// Whether the regex at the given index matched.
306     ///
307     /// The index for a regex is determined by its insertion order upon the
308     /// initial construction of a `RegexSet`, starting at `0`.
309     ///
310     /// # Panics
311     ///
312     /// If `regex_index` is greater than or equal to `self.len()`.
313     pub fn matched(&self, regex_index: usize) -> bool {
314         self.matches[regex_index]
315     }
316 
317     /// The total number of regexes in the set that created these matches.
318     pub fn len(&self) -> usize {
319         self.matches.len()
320     }
321 
322     /// Returns an iterator over indexes in the regex that matched.
323     ///
324     /// This will always produces matches in ascending order of index, where
325     /// the index corresponds to the index of the regex that matched with
326     /// respect to its position when initially building the set.
327     pub fn iter(&self) -> SetMatchesIter<'_> {
328         SetMatchesIter((&*self.matches).into_iter().enumerate())
329     }
330 }
331 
332 impl IntoIterator for SetMatches {
333     type IntoIter = SetMatchesIntoIter;
334     type Item = usize;
335 
336     fn into_iter(self) -> Self::IntoIter {
337         SetMatchesIntoIter(self.matches.into_iter().enumerate())
338     }
339 }
340 
341 impl<'a> IntoIterator for &'a SetMatches {
342     type IntoIter = SetMatchesIter<'a>;
343     type Item = usize;
344 
345     fn into_iter(self) -> Self::IntoIter {
346         self.iter()
347     }
348 }
349 
/// An owned iterator over the set of matches from a regex set.
///
/// This will always produce matches in ascending order of index, where the
/// index corresponds to the index of the regex that matched with respect to
/// its position when initially building the set.
#[derive(Debug)]
pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>);

impl Iterator for SetMatchesIntoIter {
    type Item = usize;

    /// Advances to the next flag that is `true` and returns its index, or
    /// `None` once no matching regex remains.
    fn next(&mut self) -> Option<usize> {
        self.0.find(|&(_, matched)| matched).map(|(index, _)| index)
    }

    /// Returns `(0, upper)`, where `upper` is the number of flags not yet
    /// inspected.
    ///
    /// Flags that are `false` are skipped, so anywhere between zero and all
    /// of the remaining flags may be yielded. Forwarding the inner lower
    /// bound would promise more items than this iterator may deliver.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (_, upper) = self.0.size_hint();
        (0, upper)
    }
}

impl DoubleEndedIterator for SetMatchesIntoIter {
    /// Walks backwards to the closest flag that is `true` and returns its
    /// index, or `None` once no matching regex remains.
    fn next_back(&mut self) -> Option<usize> {
        self.0.rfind(|&(_, matched)| matched).map(|(index, _)| index)
    }
}

impl iter::FusedIterator for SetMatchesIntoIter {}
389 
/// A borrowed iterator over the set of matches from a regex set.
///
/// The lifetime `'a` refers to the lifetime of a `SetMatches` value.
///
/// This will always produce matches in ascending order of index, where the
/// index corresponds to the index of the regex that matched with respect to
/// its position when initially building the set.
#[derive(Clone, Debug)]
pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>);

impl<'a> Iterator for SetMatchesIter<'a> {
    type Item = usize;

    /// Advances to the next flag that is `true` and returns its index, or
    /// `None` once no matching regex remains.
    fn next(&mut self) -> Option<usize> {
        self.0.find(|&(_, &matched)| matched).map(|(index, _)| index)
    }

    /// Returns `(0, upper)`, where `upper` is the number of flags not yet
    /// inspected.
    ///
    /// Flags that are `false` are skipped, so anywhere between zero and all
    /// of the remaining flags may be yielded. Forwarding the inner lower
    /// bound would promise more items than this iterator may deliver.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (_, upper) = self.0.size_hint();
        (0, upper)
    }
}

impl<'a> DoubleEndedIterator for SetMatchesIter<'a> {
    /// Walks backwards to the closest flag that is `true` and returns its
    /// index, or `None` once no matching regex remains.
    fn next_back(&mut self) -> Option<usize> {
        self.0.rfind(|&(_, &matched)| matched).map(|(index, _)| index)
    }
}

impl<'a> iter::FusedIterator for SetMatchesIter<'a> {}
431 
432 #[doc(hidden)]
433 impl From<Exec> for RegexSet {
434     fn from(exec: Exec) -> Self {
435         RegexSet(exec)
436     }
437 }
438 
439 impl fmt::Debug for RegexSet {
440     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
441         write!(f, "RegexSet({:?})", self.0.regex_strings())
442     }
443 }
444 
// Views string input as raw bytes so it can be handed to the byte-oriented
// searcher. Each instantiation of the macro names only one of the two
// conversion helpers, hence the `dead_code` allowance.
#[allow(dead_code)]
fn as_bytes_str(text: &str) -> &[u8] {
    str::as_bytes(text)
}
// Identity conversion for input that is already a byte slice. Each
// instantiation of the macro names only one of the two conversion helpers,
// hence the `dead_code` allowance.
#[allow(dead_code)]
fn as_bytes_bytes(text: &[u8]) -> &[u8] {
    text
}
447         }
448     }
449 }
450 
// Generates the `unicode` module: a `RegexSet` whose search methods take
// `&str` text, built through `re_builder::set_unicode::RegexSetBuilder`, with
// `as_bytes_str` converting the text to bytes for the searcher. The trailing
// doc comments are spliced into the struct documentation as its example.
define_set! {
    unicode,
    set_unicode,
    &str,
    as_bytes_str,
/// ```rust
/// # use regex::RegexSet;
/// let set = RegexSet::new(&[
///     r"[a-z]+@[a-z]+\.(com|org|net)",
///     r"[a-z]+\.(com|org|net)",
/// ]).unwrap();
///
/// // Ask whether any regexes in the set match.
/// assert!(set.is_match("foo@example.com"));
///
/// // Identify which regexes in the set match.
/// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect();
/// assert_eq!(vec![0, 1], matches);
///
/// // Try again, but with text that only matches one of the regexes.
/// let matches: Vec<_> = set.matches("example.com").into_iter().collect();
/// assert_eq!(vec![1], matches);
///
/// // Try again, but with text that doesn't match any regex in the set.
/// let matches: Vec<_> = set.matches("example").into_iter().collect();
/// assert!(matches.is_empty());
/// ```
}
479 
// Generates the `bytes` module: a `RegexSet` whose search methods take
// `&[u8]` text, built through `re_builder::set_bytes::RegexSetBuilder`, with
// `as_bytes_bytes` passing the text through unchanged to the searcher. The
// trailing doc comments are spliced into the struct documentation as its
// example.
define_set! {
    bytes,
    set_bytes,
    &[u8],
    as_bytes_bytes,
/// ```rust
/// # use regex::bytes::RegexSet;
/// let set = RegexSet::new(&[
///     r"[a-z]+@[a-z]+\.(com|org|net)",
///     r"[a-z]+\.(com|org|net)",
/// ]).unwrap();
///
/// // Ask whether any regexes in the set match.
/// assert!(set.is_match(b"foo@example.com"));
///
/// // Identify which regexes in the set match.
/// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect();
/// assert_eq!(vec![0, 1], matches);
///
/// // Try again, but with text that only matches one of the regexes.
/// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect();
/// assert_eq!(vec![1], matches);
///
/// // Try again, but with text that doesn't match any regex in the set.
/// let matches: Vec<_> = set.matches(b"example").into_iter().collect();
/// assert!(matches.is_empty());
/// ```
}
508