1 macro_rules! define_set { 2 ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr, 3 $(#[$doc_regexset_example:meta])* ) => { 4 pub mod $name { 5 use std::fmt; 6 use std::iter; 7 use std::slice; 8 use std::vec; 9 10 use crate::error::Error; 11 use crate::exec::Exec; 12 use crate::re_builder::$builder_mod::RegexSetBuilder; 13 use crate::re_trait::RegularExpression; 14 15 /// Match multiple (possibly overlapping) regular expressions in a single scan. 16 /// 17 /// A regex set corresponds to the union of two or more regular expressions. 18 /// That is, a regex set will match text where at least one of its 19 /// constituent regular expressions matches. A regex set as its formulated here 20 /// provides a touch more power: it will also report *which* regular 21 /// expressions in the set match. Indeed, this is the key difference between 22 /// regex sets and a single `Regex` with many alternates, since only one 23 /// alternate can match at a time. 24 /// 25 /// For example, consider regular expressions to match email addresses and 26 /// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a 27 /// regex set is constructed from those regexes, then searching the text 28 /// `foo@example.com` will report both regexes as matching. Of course, one 29 /// could accomplish this by compiling each regex on its own and doing two 30 /// searches over the text. The key advantage of using a regex set is that it 31 /// will report the matching regexes using a *single pass through the text*. 32 /// If one has hundreds or thousands of regexes to match repeatedly (like a URL 33 /// router for a complex web application or a user agent matcher), then a regex 34 /// set can realize huge performance gains. 35 /// 36 /// # Example 37 /// 38 /// This shows how the above two regexes (for matching email addresses and 39 /// domains) might work: 40 /// 41 $(#[$doc_regexset_example])* 42 /// 43 /// Note that it would be possible to adapt the above example to using `Regex` 44 /// with an expression like: 45 /// 46 /// ```text 47 /// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net)) 48 /// ``` 49 /// 50 /// After a match, one could then inspect the capture groups to figure out 51 /// which alternates matched. The problem is that it is hard to make this 52 /// approach scale when there are many regexes since the overlap between each 53 /// alternate isn't always obvious to reason about. 54 /// 55 /// # Limitations 56 /// 57 /// Regex sets are limited to answering the following two questions: 58 /// 59 /// 1. Does any regex in the set match? 60 /// 2. If so, which regexes in the set match? 61 /// 62 /// As with the main [`Regex`][crate::Regex] type, it is cheaper to ask (1) 63 /// instead of (2) since the matching engines can stop after the first match 64 /// is found. 65 /// 66 /// You cannot directly extract [`Match`][crate::Match] or 67 /// [`Captures`][crate::Captures] objects from a regex set. If you need these 68 /// operations, the recommended approach is to compile each pattern in the set 69 /// independently and scan the exact same input a second time with those 70 /// independently compiled patterns: 71 /// 72 /// ```rust 73 /// use regex::{Regex, RegexSet}; 74 /// 75 /// let patterns = ["foo", "bar"]; 76 /// // Both patterns will match different ranges of this string. 77 /// let text = "barfoo"; 78 /// 79 /// // Compile a set matching any of our patterns. 80 /// let set = RegexSet::new(&patterns).unwrap(); 81 /// // Compile each pattern independently. 82 /// let regexes: Vec<_> = set.patterns().iter() 83 /// .map(|pat| Regex::new(pat).unwrap()) 84 /// .collect(); 85 /// 86 /// // Match against the whole set first and identify the individual 87 /// // matching patterns. 88 /// let matches: Vec<&str> = set.matches(text).into_iter() 89 /// // Dereference the match index to get the corresponding 90 /// // compiled pattern. 91 /// .map(|match_idx| ®exes[match_idx]) 92 /// // To get match locations or any other info, we then have to search 93 /// // the exact same text again, using our separately-compiled pattern. 94 /// .map(|pat| pat.find(text).unwrap().as_str()) 95 /// .collect(); 96 /// 97 /// // Matches arrive in the order the constituent patterns were declared, 98 /// // not the order they appear in the input. 99 /// assert_eq!(vec!["foo", "bar"], matches); 100 /// ``` 101 /// 102 /// # Performance 103 /// 104 /// A `RegexSet` has the same performance characteristics as `Regex`. Namely, 105 /// search takes `O(mn)` time, where `m` is proportional to the size of the 106 /// regex set and `n` is proportional to the length of the search text. 107 #[derive(Clone)] 108 pub struct RegexSet(Exec); 109 110 impl RegexSet { 111 /// Create a new regex set with the given regular expressions. 112 /// 113 /// This takes an iterator of `S`, where `S` is something that can produce 114 /// a `&str`. If any of the strings in the iterator are not valid regular 115 /// expressions, then an error is returned. 116 /// 117 /// # Example 118 /// 119 /// Create a new regex set from an iterator of strings: 120 /// 121 /// ```rust 122 /// # use regex::RegexSet; 123 /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); 124 /// assert!(set.is_match("foo")); 125 /// ``` 126 pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error> 127 where S: AsRef<str>, I: IntoIterator<Item=S> { 128 RegexSetBuilder::new(exprs).build() 129 } 130 131 /// Create a new empty regex set. 132 /// 133 /// # Example 134 /// 135 /// ```rust 136 /// # use regex::RegexSet; 137 /// let set = RegexSet::empty(); 138 /// assert!(set.is_empty()); 139 /// ``` 140 pub fn empty() -> RegexSet { 141 RegexSetBuilder::new(&[""; 0]).build().unwrap() 142 } 143 144 /// Returns true if and only if one of the regexes in this set matches 145 /// the text given. 146 /// 147 /// This method should be preferred if you only need to test whether any 148 /// of the regexes in the set should match, but don't care about *which* 149 /// regexes matched. This is because the underlying matching engine will 150 /// quit immediately after seeing the first match instead of continuing to 151 /// find all matches. 152 /// 153 /// Note that as with searches using `Regex`, the expression is unanchored 154 /// by default. That is, if the regex does not start with `^` or `\A`, or 155 /// end with `$` or `\z`, then it is permitted to match anywhere in the 156 /// text. 157 /// 158 /// # Example 159 /// 160 /// Tests whether a set matches some text: 161 /// 162 /// ```rust 163 /// # use regex::RegexSet; 164 /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); 165 /// assert!(set.is_match("foo")); 166 /// assert!(!set.is_match("☃")); 167 /// ``` 168 pub fn is_match(&self, text: $text_ty) -> bool { 169 self.is_match_at(text, 0) 170 } 171 172 /// Returns the same as is_match, but starts the search at the given 173 /// offset. 174 /// 175 /// The significance of the starting point is that it takes the surrounding 176 /// context into consideration. For example, the `\A` anchor can only 177 /// match when `start == 0`. 178 #[doc(hidden)] 179 pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool { 180 self.0.searcher().is_match_at($as_bytes(text), start) 181 } 182 183 /// Returns the set of regular expressions that match in the given text. 184 /// 185 /// The set returned contains the index of each regular expression that 186 /// matches in the given text. The index is in correspondence with the 187 /// order of regular expressions given to `RegexSet`'s constructor. 188 /// 189 /// The set can also be used to iterate over the matched indices. 190 /// 191 /// Note that as with searches using `Regex`, the expression is unanchored 192 /// by default. That is, if the regex does not start with `^` or `\A`, or 193 /// end with `$` or `\z`, then it is permitted to match anywhere in the 194 /// text. 195 /// 196 /// # Example 197 /// 198 /// Tests which regular expressions match the given text: 199 /// 200 /// ```rust 201 /// # use regex::RegexSet; 202 /// let set = RegexSet::new(&[ 203 /// r"\w+", 204 /// r"\d+", 205 /// r"\pL+", 206 /// r"foo", 207 /// r"bar", 208 /// r"barfoo", 209 /// r"foobar", 210 /// ]).unwrap(); 211 /// let matches: Vec<_> = set.matches("foobar").into_iter().collect(); 212 /// assert_eq!(matches, vec![0, 2, 3, 4, 6]); 213 /// 214 /// // You can also test whether a particular regex matched: 215 /// let matches = set.matches("foobar"); 216 /// assert!(!matches.matched(5)); 217 /// assert!(matches.matched(6)); 218 /// ``` 219 pub fn matches(&self, text: $text_ty) -> SetMatches { 220 let mut matches = vec![false; self.0.regex_strings().len()]; 221 let any = self.read_matches_at(&mut matches, text, 0); 222 SetMatches { 223 matched_any: any, 224 matches: matches, 225 } 226 } 227 228 /// Returns the same as matches, but starts the search at the given 229 /// offset and stores the matches into the slice given. 230 /// 231 /// The significance of the starting point is that it takes the surrounding 232 /// context into consideration. For example, the `\A` anchor can only 233 /// match when `start == 0`. 234 /// 235 /// `matches` must have a length that is at least the number of regexes 236 /// in this set. 237 /// 238 /// This method returns true if and only if at least one member of 239 /// `matches` is true after executing the set against `text`. 240 #[doc(hidden)] 241 pub fn read_matches_at( 242 &self, 243 matches: &mut [bool], 244 text: $text_ty, 245 start: usize, 246 ) -> bool { 247 self.0.searcher().many_matches_at(matches, $as_bytes(text), start) 248 } 249 250 /// Returns the total number of regular expressions in this set. 251 pub fn len(&self) -> usize { 252 self.0.regex_strings().len() 253 } 254 255 /// Returns `true` if this set contains no regular expressions. 256 pub fn is_empty(&self) -> bool { 257 self.0.regex_strings().is_empty() 258 } 259 260 /// Returns the patterns that this set will match on. 261 /// 262 /// This function can be used to determine the pattern for a match. The 263 /// slice returned has exactly as many patterns givens to this regex set, 264 /// and the order of the slice is the same as the order of the patterns 265 /// provided to the set. 266 /// 267 /// # Example 268 /// 269 /// ```rust 270 /// # use regex::RegexSet; 271 /// let set = RegexSet::new(&[ 272 /// r"\w+", 273 /// r"\d+", 274 /// r"\pL+", 275 /// r"foo", 276 /// r"bar", 277 /// r"barfoo", 278 /// r"foobar", 279 /// ]).unwrap(); 280 /// let matches: Vec<_> = set 281 /// .matches("foobar") 282 /// .into_iter() 283 /// .map(|match_idx| &set.patterns()[match_idx]) 284 /// .collect(); 285 /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]); 286 /// ``` 287 pub fn patterns(&self) -> &[String] { 288 self.0.regex_strings() 289 } 290 } 291 292 /// A set of matches returned by a regex set. 293 #[derive(Clone, Debug)] 294 pub struct SetMatches { 295 matched_any: bool, 296 matches: Vec<bool>, 297 } 298 299 impl SetMatches { 300 /// Whether this set contains any matches. 301 pub fn matched_any(&self) -> bool { 302 self.matched_any 303 } 304 305 /// Whether the regex at the given index matched. 306 /// 307 /// The index for a regex is determined by its insertion order upon the 308 /// initial construction of a `RegexSet`, starting at `0`. 309 /// 310 /// # Panics 311 /// 312 /// If `regex_index` is greater than or equal to `self.len()`. 313 pub fn matched(&self, regex_index: usize) -> bool { 314 self.matches[regex_index] 315 } 316 317 /// The total number of regexes in the set that created these matches. 318 pub fn len(&self) -> usize { 319 self.matches.len() 320 } 321 322 /// Returns an iterator over indexes in the regex that matched. 323 /// 324 /// This will always produces matches in ascending order of index, where 325 /// the index corresponds to the index of the regex that matched with 326 /// respect to its position when initially building the set. 327 pub fn iter(&self) -> SetMatchesIter<'_> { 328 SetMatchesIter((&*self.matches).into_iter().enumerate()) 329 } 330 } 331 332 impl IntoIterator for SetMatches { 333 type IntoIter = SetMatchesIntoIter; 334 type Item = usize; 335 336 fn into_iter(self) -> Self::IntoIter { 337 SetMatchesIntoIter(self.matches.into_iter().enumerate()) 338 } 339 } 340 341 impl<'a> IntoIterator for &'a SetMatches { 342 type IntoIter = SetMatchesIter<'a>; 343 type Item = usize; 344 345 fn into_iter(self) -> Self::IntoIter { 346 self.iter() 347 } 348 } 349 350 /// An owned iterator over the set of matches from a regex set. 351 /// 352 /// This will always produces matches in ascending order of index, where the 353 /// index corresponds to the index of the regex that matched with respect to 354 /// its position when initially building the set. 355 #[derive(Debug)] 356 pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>); 357 358 impl Iterator for SetMatchesIntoIter { 359 type Item = usize; 360 361 fn next(&mut self) -> Option<usize> { 362 loop { 363 match self.0.next() { 364 None => return None, 365 Some((_, false)) => {} 366 Some((i, true)) => return Some(i), 367 } 368 } 369 } 370 371 fn size_hint(&self) -> (usize, Option<usize>) { 372 self.0.size_hint() 373 } 374 } 375 376 impl DoubleEndedIterator for SetMatchesIntoIter { 377 fn next_back(&mut self) -> Option<usize> { 378 loop { 379 match self.0.next_back() { 380 None => return None, 381 Some((_, false)) => {} 382 Some((i, true)) => return Some(i), 383 } 384 } 385 } 386 } 387 388 impl iter::FusedIterator for SetMatchesIntoIter {} 389 390 /// A borrowed iterator over the set of matches from a regex set. 391 /// 392 /// The lifetime `'a` refers to the lifetime of a `SetMatches` value. 393 /// 394 /// This will always produces matches in ascending order of index, where the 395 /// index corresponds to the index of the regex that matched with respect to 396 /// its position when initially building the set. 397 #[derive(Clone, Debug)] 398 pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>); 399 400 impl<'a> Iterator for SetMatchesIter<'a> { 401 type Item = usize; 402 403 fn next(&mut self) -> Option<usize> { 404 loop { 405 match self.0.next() { 406 None => return None, 407 Some((_, &false)) => {} 408 Some((i, &true)) => return Some(i), 409 } 410 } 411 } 412 413 fn size_hint(&self) -> (usize, Option<usize>) { 414 self.0.size_hint() 415 } 416 } 417 418 impl<'a> DoubleEndedIterator for SetMatchesIter<'a> { 419 fn next_back(&mut self) -> Option<usize> { 420 loop { 421 match self.0.next_back() { 422 None => return None, 423 Some((_, &false)) => {} 424 Some((i, &true)) => return Some(i), 425 } 426 } 427 } 428 } 429 430 impl<'a> iter::FusedIterator for SetMatchesIter<'a> {} 431 432 #[doc(hidden)] 433 impl From<Exec> for RegexSet { 434 fn from(exec: Exec) -> Self { 435 RegexSet(exec) 436 } 437 } 438 439 impl fmt::Debug for RegexSet { 440 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 441 write!(f, "RegexSet({:?})", self.0.regex_strings()) 442 } 443 } 444 445 #[allow(dead_code)] fn as_bytes_str(text: &str) -> &[u8] { text.as_bytes() } 446 #[allow(dead_code)] fn as_bytes_bytes(text: &[u8]) -> &[u8] { text } 447 } 448 } 449 } 450 451 define_set! { 452 unicode, 453 set_unicode, 454 &str, 455 as_bytes_str, 456 /// ```rust 457 /// # use regex::RegexSet; 458 /// let set = RegexSet::new(&[ 459 /// r"[a-z]+@[a-z]+\.(com|org|net)", 460 /// r"[a-z]+\.(com|org|net)", 461 /// ]).unwrap(); 462 /// 463 /// // Ask whether any regexes in the set match. 464 /// assert!(set.is_match("foo@example.com")); 465 /// 466 /// // Identify which regexes in the set match. 467 /// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect(); 468 /// assert_eq!(vec![0, 1], matches); 469 /// 470 /// // Try again, but with text that only matches one of the regexes. 471 /// let matches: Vec<_> = set.matches("example.com").into_iter().collect(); 472 /// assert_eq!(vec![1], matches); 473 /// 474 /// // Try again, but with text that doesn't match any regex in the set. 475 /// let matches: Vec<_> = set.matches("example").into_iter().collect(); 476 /// assert!(matches.is_empty()); 477 /// ``` 478 } 479 480 define_set! { 481 bytes, 482 set_bytes, 483 &[u8], 484 as_bytes_bytes, 485 /// ```rust 486 /// # use regex::bytes::RegexSet; 487 /// let set = RegexSet::new(&[ 488 /// r"[a-z]+@[a-z]+\.(com|org|net)", 489 /// r"[a-z]+\.(com|org|net)", 490 /// ]).unwrap(); 491 /// 492 /// // Ask whether any regexes in the set match. 493 /// assert!(set.is_match(b"foo@example.com")); 494 /// 495 /// // Identify which regexes in the set match. 496 /// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect(); 497 /// assert_eq!(vec![0, 1], matches); 498 /// 499 /// // Try again, but with text that only matches one of the regexes. 500 /// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect(); 501 /// assert_eq!(vec![1], matches); 502 /// 503 /// // Try again, but with text that doesn't match any regex in the set. 504 /// let matches: Vec<_> = set.matches(b"example").into_iter().collect(); 505 /// assert!(matches.is_empty()); 506 /// ``` 507 } 508