1 use std::cell::RefCell;
2 use std::collections::HashMap;
3 use std::panic::AssertUnwindSafe;
4 use std::sync::Arc;
5
6 #[cfg(feature = "perf-literal")]
7 use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
8 use regex_syntax::hir::literal::Literals;
9 use regex_syntax::hir::Hir;
10 use regex_syntax::ParserBuilder;
11
12 use crate::backtrack;
13 use crate::compile::Compiler;
14 #[cfg(feature = "perf-dfa")]
15 use crate::dfa;
16 use crate::error::Error;
17 use crate::input::{ByteInput, CharInput};
18 use crate::literal::LiteralSearcher;
19 use crate::pikevm;
20 use crate::pool::{Pool, PoolGuard};
21 use crate::prog::Program;
22 use crate::re_builder::RegexOptions;
23 use crate::re_bytes;
24 use crate::re_set;
25 use crate::re_trait::{Locations, RegularExpression, Slot};
26 use crate::re_unicode;
27 use crate::utf8::next_utf8;
28
29 /// `Exec` manages the execution of a regular expression.
30 ///
31 /// In particular, this manages the various compiled forms of a single regular
32 /// expression and the choice of which matching engine to use to execute a
33 /// regular expression.
34 #[derive(Debug)]
35 pub struct Exec {
36 /// All read only state.
37 ro: Arc<ExecReadOnly>,
38 /// A pool of reusable values for the various matching engines.
39 ///
40 /// Note that boxing this value is not strictly necessary, but it is an
41 /// easy way to ensure that T does not bloat the stack sized used by a pool
42 /// in the case where T is big. And this turns out to be the case at the
43 /// time of writing for regex's use of this pool. At the time of writing,
44 /// the size of a Regex on the stack is 856 bytes. Boxing this value
45 /// reduces that size to 16 bytes.
46 pool: Box<Pool<ProgramCache>>,
47 }
48
49 /// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
50 /// means it is no longer Sync, but we can now avoid the overhead of
51 /// synchronization to fetch the cache.
52 #[derive(Debug)]
53 pub struct ExecNoSync<'c> {
54 /// All read only state.
55 ro: &'c Arc<ExecReadOnly>,
56 /// Caches for the various matching engines.
57 cache: PoolGuard<'c, ProgramCache>,
58 }
59
60 /// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
61 #[derive(Debug)]
62 pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
63
64 /// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
65 /// state is determined at compile time and never changes during search.
66 #[derive(Debug)]
67 struct ExecReadOnly {
68 /// The original regular expressions given by the caller to compile.
69 res: Vec<String>,
70 /// A compiled program that is used in the NFA simulation and backtracking.
71 /// It can be byte-based or Unicode codepoint based.
72 ///
73 /// N.B. It is not possibly to make this byte-based from the public API.
74 /// It is only used for testing byte based programs in the NFA simulations.
75 nfa: Program,
76 /// A compiled byte based program for DFA execution. This is only used
77 /// if a DFA can be executed. (Currently, only word boundary assertions are
78 /// not supported.) Note that this program contains an embedded `.*?`
79 /// preceding the first capture group, unless the regex is anchored at the
80 /// beginning.
81 dfa: Program,
82 /// The same as above, except the program is reversed (and there is no
83 /// preceding `.*?`). This is used by the DFA to find the starting location
84 /// of matches.
85 dfa_reverse: Program,
86 /// A set of suffix literals extracted from the regex.
87 ///
88 /// Prefix literals are stored on the `Program`, since they are used inside
89 /// the matching engines.
90 suffixes: LiteralSearcher,
91 /// An Aho-Corasick automaton with leftmost-first match semantics.
92 ///
93 /// This is only set when the entire regex is a simple unanchored
94 /// alternation of literals. We could probably use it more circumstances,
95 /// but this is already hacky enough in this architecture.
96 ///
97 /// N.B. We use u32 as a state ID representation under the assumption that
98 /// if we were to exhaust the ID space, we probably would have long
99 /// surpassed the compilation size limit.
100 #[cfg(feature = "perf-literal")]
101 ac: Option<AhoCorasick<u32>>,
102 /// match_type encodes as much upfront knowledge about how we're going to
103 /// execute a search as possible.
104 match_type: MatchType,
105 }
106
107 /// Facilitates the construction of an executor by exposing various knobs
108 /// to control how a regex is executed and what kinds of resources it's
109 /// permitted to use.
110 // `ExecBuilder` is only public via the `internal` module, so avoid deriving
111 // `Debug`.
112 #[allow(missing_debug_implementations)]
113 pub struct ExecBuilder {
114 options: RegexOptions,
115 match_type: Option<MatchType>,
116 bytes: bool,
117 only_utf8: bool,
118 }
119
120 /// Parsed represents a set of parsed regular expressions and their detected
121 /// literals.
122 struct Parsed {
123 exprs: Vec<Hir>,
124 prefixes: Literals,
125 suffixes: Literals,
126 bytes: bool,
127 }
128
129 impl ExecBuilder {
130 /// Create a regex execution builder.
131 ///
132 /// This uses default settings for everything except the regex itself,
133 /// which must be provided. Further knobs can be set by calling methods,
134 /// and then finally, `build` to actually create the executor.
new(re: &str) -> Self135 pub fn new(re: &str) -> Self {
136 Self::new_many(&[re])
137 }
138
139 /// Like new, but compiles the union of the given regular expressions.
140 ///
141 /// Note that when compiling 2 or more regular expressions, capture groups
142 /// are completely unsupported. (This means both `find` and `captures`
143 /// won't work.)
new_many<I, S>(res: I) -> Self where S: AsRef<str>, I: IntoIterator<Item = S>,144 pub fn new_many<I, S>(res: I) -> Self
145 where
146 S: AsRef<str>,
147 I: IntoIterator<Item = S>,
148 {
149 let mut opts = RegexOptions::default();
150 opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect();
151 Self::new_options(opts)
152 }
153
154 /// Create a regex execution builder.
new_options(opts: RegexOptions) -> Self155 pub fn new_options(opts: RegexOptions) -> Self {
156 ExecBuilder {
157 options: opts,
158 match_type: None,
159 bytes: false,
160 only_utf8: true,
161 }
162 }
163
164 /// Set the matching engine to be automatically determined.
165 ///
166 /// This is the default state and will apply whatever optimizations are
167 /// possible, such as running a DFA.
168 ///
169 /// This overrides whatever was previously set via the `nfa` or
170 /// `bounded_backtracking` methods.
automatic(mut self) -> Self171 pub fn automatic(mut self) -> Self {
172 self.match_type = None;
173 self
174 }
175
176 /// Sets the matching engine to use the NFA algorithm no matter what
177 /// optimizations are possible.
178 ///
179 /// This overrides whatever was previously set via the `automatic` or
180 /// `bounded_backtracking` methods.
nfa(mut self) -> Self181 pub fn nfa(mut self) -> Self {
182 self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
183 self
184 }
185
186 /// Sets the matching engine to use a bounded backtracking engine no
187 /// matter what optimizations are possible.
188 ///
189 /// One must use this with care, since the bounded backtracking engine
190 /// uses memory proportion to `len(regex) * len(text)`.
191 ///
192 /// This overrides whatever was previously set via the `automatic` or
193 /// `nfa` methods.
bounded_backtracking(mut self) -> Self194 pub fn bounded_backtracking(mut self) -> Self {
195 self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
196 self
197 }
198
199 /// Compiles byte based programs for use with the NFA matching engines.
200 ///
201 /// By default, the NFA engines match on Unicode scalar values. They can
202 /// be made to use byte based programs instead. In general, the byte based
203 /// programs are slower because of a less efficient encoding of character
204 /// classes.
205 ///
206 /// Note that this does not impact DFA matching engines, which always
207 /// execute on bytes.
bytes(mut self, yes: bool) -> Self208 pub fn bytes(mut self, yes: bool) -> Self {
209 self.bytes = yes;
210 self
211 }
212
213 /// When disabled, the program compiled may match arbitrary bytes.
214 ///
215 /// When enabled (the default), all compiled programs exclusively match
216 /// valid UTF-8 bytes.
only_utf8(mut self, yes: bool) -> Self217 pub fn only_utf8(mut self, yes: bool) -> Self {
218 self.only_utf8 = yes;
219 self
220 }
221
222 /// Set the Unicode flag.
unicode(mut self, yes: bool) -> Self223 pub fn unicode(mut self, yes: bool) -> Self {
224 self.options.unicode = yes;
225 self
226 }
227
228 /// Parse the current set of patterns into their AST and extract literals.
parse(&self) -> Result<Parsed, Error>229 fn parse(&self) -> Result<Parsed, Error> {
230 let mut exprs = Vec::with_capacity(self.options.pats.len());
231 let mut prefixes = Some(Literals::empty());
232 let mut suffixes = Some(Literals::empty());
233 let mut bytes = false;
234 let is_set = self.options.pats.len() > 1;
235 // If we're compiling a regex set and that set has any anchored
236 // expressions, then disable all literal optimizations.
237 for pat in &self.options.pats {
238 let mut parser = ParserBuilder::new()
239 .octal(self.options.octal)
240 .case_insensitive(self.options.case_insensitive)
241 .multi_line(self.options.multi_line)
242 .dot_matches_new_line(self.options.dot_matches_new_line)
243 .swap_greed(self.options.swap_greed)
244 .ignore_whitespace(self.options.ignore_whitespace)
245 .unicode(self.options.unicode)
246 .allow_invalid_utf8(!self.only_utf8)
247 .nest_limit(self.options.nest_limit)
248 .build();
249 let expr =
250 parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?;
251 bytes = bytes || !expr.is_always_utf8();
252
253 if cfg!(feature = "perf-literal") {
254 if !expr.is_anchored_start() && expr.is_any_anchored_start() {
255 // Partial anchors unfortunately make it hard to use
256 // prefixes, so disable them.
257 prefixes = None;
258 } else if is_set && expr.is_anchored_start() {
259 // Regex sets with anchors do not go well with literal
260 // optimizations.
261 prefixes = None;
262 }
263 prefixes = prefixes.and_then(|mut prefixes| {
264 if !prefixes.union_prefixes(&expr) {
265 None
266 } else {
267 Some(prefixes)
268 }
269 });
270
271 if !expr.is_anchored_end() && expr.is_any_anchored_end() {
272 // Partial anchors unfortunately make it hard to use
273 // suffixes, so disable them.
274 suffixes = None;
275 } else if is_set && expr.is_anchored_end() {
276 // Regex sets with anchors do not go well with literal
277 // optimizations.
278 suffixes = None;
279 }
280 suffixes = suffixes.and_then(|mut suffixes| {
281 if !suffixes.union_suffixes(&expr) {
282 None
283 } else {
284 Some(suffixes)
285 }
286 });
287 }
288 exprs.push(expr);
289 }
290 Ok(Parsed {
291 exprs,
292 prefixes: prefixes.unwrap_or_else(Literals::empty),
293 suffixes: suffixes.unwrap_or_else(Literals::empty),
294 bytes,
295 })
296 }
297
298 /// Build an executor that can run a regular expression.
build(self) -> Result<Exec, Error>299 pub fn build(self) -> Result<Exec, Error> {
300 // Special case when we have no patterns to compile.
301 // This can happen when compiling a regex set.
302 if self.options.pats.is_empty() {
303 let ro = Arc::new(ExecReadOnly {
304 res: vec![],
305 nfa: Program::new(),
306 dfa: Program::new(),
307 dfa_reverse: Program::new(),
308 suffixes: LiteralSearcher::empty(),
309 #[cfg(feature = "perf-literal")]
310 ac: None,
311 match_type: MatchType::Nothing,
312 });
313 let pool = ExecReadOnly::new_pool(&ro);
314 return Ok(Exec { ro, pool });
315 }
316 let parsed = self.parse()?;
317 let mut nfa = Compiler::new()
318 .size_limit(self.options.size_limit)
319 .bytes(self.bytes || parsed.bytes)
320 .only_utf8(self.only_utf8)
321 .compile(&parsed.exprs)?;
322 let mut dfa = Compiler::new()
323 .size_limit(self.options.size_limit)
324 .dfa(true)
325 .only_utf8(self.only_utf8)
326 .compile(&parsed.exprs)?;
327 let mut dfa_reverse = Compiler::new()
328 .size_limit(self.options.size_limit)
329 .dfa(true)
330 .only_utf8(self.only_utf8)
331 .reverse(true)
332 .compile(&parsed.exprs)?;
333
334 #[cfg(feature = "perf-literal")]
335 let ac = self.build_aho_corasick(&parsed);
336 nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes);
337 dfa.prefixes = nfa.prefixes.clone();
338 dfa.dfa_size_limit = self.options.dfa_size_limit;
339 dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
340
341 let mut ro = ExecReadOnly {
342 res: self.options.pats,
343 nfa,
344 dfa,
345 dfa_reverse,
346 suffixes: LiteralSearcher::suffixes(parsed.suffixes),
347 #[cfg(feature = "perf-literal")]
348 ac,
349 match_type: MatchType::Nothing,
350 };
351 ro.match_type = ro.choose_match_type(self.match_type);
352
353 let ro = Arc::new(ro);
354 let pool = ExecReadOnly::new_pool(&ro);
355 Ok(Exec { ro, pool })
356 }
357
358 #[cfg(feature = "perf-literal")]
build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>>359 fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>> {
360 if parsed.exprs.len() != 1 {
361 return None;
362 }
363 let lits = match alternation_literals(&parsed.exprs[0]) {
364 None => return None,
365 Some(lits) => lits,
366 };
367 // If we have a small number of literals, then let Teddy handle
368 // things (see literal/mod.rs).
369 if lits.len() <= 32 {
370 return None;
371 }
372 Some(
373 AhoCorasickBuilder::new()
374 .match_kind(MatchKind::LeftmostFirst)
375 .auto_configure(&lits)
376 .build_with_size::<u32, _, _>(&lits)
377 // This should never happen because we'd long exceed the
378 // compilation limit for regexes first.
379 .expect("AC automaton too big"),
380 )
381 }
382 }
383
384 impl<'c> RegularExpression for ExecNoSyncStr<'c> {
385 type Text = str;
386
slots_len(&self) -> usize387 fn slots_len(&self) -> usize {
388 self.0.slots_len()
389 }
390
next_after_empty(&self, text: &str, i: usize) -> usize391 fn next_after_empty(&self, text: &str, i: usize) -> usize {
392 next_utf8(text.as_bytes(), i)
393 }
394
395 #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_match_at(&self, text: &str, start: usize) -> Option<usize>396 fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
397 self.0.shortest_match_at(text.as_bytes(), start)
398 }
399
400 #[cfg_attr(feature = "perf-inline", inline(always))]
is_match_at(&self, text: &str, start: usize) -> bool401 fn is_match_at(&self, text: &str, start: usize) -> bool {
402 self.0.is_match_at(text.as_bytes(), start)
403 }
404
405 #[cfg_attr(feature = "perf-inline", inline(always))]
find_at(&self, text: &str, start: usize) -> Option<(usize, usize)>406 fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
407 self.0.find_at(text.as_bytes(), start)
408 }
409
410 #[cfg_attr(feature = "perf-inline", inline(always))]
captures_read_at( &self, locs: &mut Locations, text: &str, start: usize, ) -> Option<(usize, usize)>411 fn captures_read_at(
412 &self,
413 locs: &mut Locations,
414 text: &str,
415 start: usize,
416 ) -> Option<(usize, usize)> {
417 self.0.captures_read_at(locs, text.as_bytes(), start)
418 }
419 }
420
421 impl<'c> RegularExpression for ExecNoSync<'c> {
422 type Text = [u8];
423
424 /// Returns the number of capture slots in the regular expression. (There
425 /// are two slots for every capture group, corresponding to possibly empty
426 /// start and end locations of the capture.)
slots_len(&self) -> usize427 fn slots_len(&self) -> usize {
428 self.ro.nfa.captures.len() * 2
429 }
430
next_after_empty(&self, _text: &[u8], i: usize) -> usize431 fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
432 i + 1
433 }
434
435 /// Returns the end of a match location, possibly occurring before the
436 /// end location of the correct leftmost-first match.
437 #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize>438 fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
439 if !self.is_anchor_end_match(text) {
440 return None;
441 }
442 match self.ro.match_type {
443 #[cfg(feature = "perf-literal")]
444 MatchType::Literal(ty) => {
445 self.find_literals(ty, text, start).map(|(_, e)| e)
446 }
447 #[cfg(feature = "perf-dfa")]
448 MatchType::Dfa | MatchType::DfaMany => {
449 match self.shortest_dfa(text, start) {
450 dfa::Result::Match(end) => Some(end),
451 dfa::Result::NoMatch(_) => None,
452 dfa::Result::Quit => self.shortest_nfa(text, start),
453 }
454 }
455 #[cfg(feature = "perf-dfa")]
456 MatchType::DfaAnchoredReverse => {
457 match dfa::Fsm::reverse(
458 &self.ro.dfa_reverse,
459 self.cache.value(),
460 true,
461 &text[start..],
462 text.len() - start,
463 ) {
464 dfa::Result::Match(_) => Some(text.len()),
465 dfa::Result::NoMatch(_) => None,
466 dfa::Result::Quit => self.shortest_nfa(text, start),
467 }
468 }
469 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
470 MatchType::DfaSuffix => {
471 match self.shortest_dfa_reverse_suffix(text, start) {
472 dfa::Result::Match(e) => Some(e),
473 dfa::Result::NoMatch(_) => None,
474 dfa::Result::Quit => self.shortest_nfa(text, start),
475 }
476 }
477 MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
478 MatchType::Nothing => None,
479 }
480 }
481
482 /// Returns true if and only if the regex matches text.
483 ///
484 /// For single regular expressions, this is equivalent to calling
485 /// shortest_match(...).is_some().
486 #[cfg_attr(feature = "perf-inline", inline(always))]
is_match_at(&self, text: &[u8], start: usize) -> bool487 fn is_match_at(&self, text: &[u8], start: usize) -> bool {
488 if !self.is_anchor_end_match(text) {
489 return false;
490 }
491 // We need to do this dance because shortest_match relies on the NFA
492 // filling in captures[1], but a RegexSet has no captures. In other
493 // words, a RegexSet can't (currently) use shortest_match. ---AG
494 match self.ro.match_type {
495 #[cfg(feature = "perf-literal")]
496 MatchType::Literal(ty) => {
497 self.find_literals(ty, text, start).is_some()
498 }
499 #[cfg(feature = "perf-dfa")]
500 MatchType::Dfa | MatchType::DfaMany => {
501 match self.shortest_dfa(text, start) {
502 dfa::Result::Match(_) => true,
503 dfa::Result::NoMatch(_) => false,
504 dfa::Result::Quit => self.match_nfa(text, start),
505 }
506 }
507 #[cfg(feature = "perf-dfa")]
508 MatchType::DfaAnchoredReverse => {
509 match dfa::Fsm::reverse(
510 &self.ro.dfa_reverse,
511 self.cache.value(),
512 true,
513 &text[start..],
514 text.len() - start,
515 ) {
516 dfa::Result::Match(_) => true,
517 dfa::Result::NoMatch(_) => false,
518 dfa::Result::Quit => self.match_nfa(text, start),
519 }
520 }
521 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
522 MatchType::DfaSuffix => {
523 match self.shortest_dfa_reverse_suffix(text, start) {
524 dfa::Result::Match(_) => true,
525 dfa::Result::NoMatch(_) => false,
526 dfa::Result::Quit => self.match_nfa(text, start),
527 }
528 }
529 MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
530 MatchType::Nothing => false,
531 }
532 }
533
534 /// Finds the start and end location of the leftmost-first match, starting
535 /// at the given location.
536 #[cfg_attr(feature = "perf-inline", inline(always))]
find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)>537 fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
538 if !self.is_anchor_end_match(text) {
539 return None;
540 }
541 match self.ro.match_type {
542 #[cfg(feature = "perf-literal")]
543 MatchType::Literal(ty) => self.find_literals(ty, text, start),
544 #[cfg(feature = "perf-dfa")]
545 MatchType::Dfa => match self.find_dfa_forward(text, start) {
546 dfa::Result::Match((s, e)) => Some((s, e)),
547 dfa::Result::NoMatch(_) => None,
548 dfa::Result::Quit => {
549 self.find_nfa(MatchNfaType::Auto, text, start)
550 }
551 },
552 #[cfg(feature = "perf-dfa")]
553 MatchType::DfaAnchoredReverse => {
554 match self.find_dfa_anchored_reverse(text, start) {
555 dfa::Result::Match((s, e)) => Some((s, e)),
556 dfa::Result::NoMatch(_) => None,
557 dfa::Result::Quit => {
558 self.find_nfa(MatchNfaType::Auto, text, start)
559 }
560 }
561 }
562 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
563 MatchType::DfaSuffix => {
564 match self.find_dfa_reverse_suffix(text, start) {
565 dfa::Result::Match((s, e)) => Some((s, e)),
566 dfa::Result::NoMatch(_) => None,
567 dfa::Result::Quit => {
568 self.find_nfa(MatchNfaType::Auto, text, start)
569 }
570 }
571 }
572 MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
573 MatchType::Nothing => None,
574 #[cfg(feature = "perf-dfa")]
575 MatchType::DfaMany => {
576 unreachable!("BUG: RegexSet cannot be used with find")
577 }
578 }
579 }
580
581 /// Finds the start and end location of the leftmost-first match and also
582 /// fills in all matching capture groups.
583 ///
584 /// The number of capture slots given should be equal to the total number
585 /// of capture slots in the compiled program.
586 ///
587 /// Note that the first two slots always correspond to the start and end
588 /// locations of the overall match.
captures_read_at( &self, locs: &mut Locations, text: &[u8], start: usize, ) -> Option<(usize, usize)>589 fn captures_read_at(
590 &self,
591 locs: &mut Locations,
592 text: &[u8],
593 start: usize,
594 ) -> Option<(usize, usize)> {
595 let slots = locs.as_slots();
596 for slot in slots.iter_mut() {
597 *slot = None;
598 }
599 // If the caller unnecessarily uses this, then we try to save them
600 // from themselves.
601 match slots.len() {
602 0 => return self.find_at(text, start),
603 2 => {
604 return self.find_at(text, start).map(|(s, e)| {
605 slots[0] = Some(s);
606 slots[1] = Some(e);
607 (s, e)
608 });
609 }
610 _ => {} // fallthrough
611 }
612 if !self.is_anchor_end_match(text) {
613 return None;
614 }
615 match self.ro.match_type {
616 #[cfg(feature = "perf-literal")]
617 MatchType::Literal(ty) => {
618 self.find_literals(ty, text, start).and_then(|(s, e)| {
619 self.captures_nfa_type(
620 MatchNfaType::Auto,
621 slots,
622 text,
623 s,
624 e,
625 )
626 })
627 }
628 #[cfg(feature = "perf-dfa")]
629 MatchType::Dfa => {
630 if self.ro.nfa.is_anchored_start {
631 self.captures_nfa(slots, text, start)
632 } else {
633 match self.find_dfa_forward(text, start) {
634 dfa::Result::Match((s, e)) => self.captures_nfa_type(
635 MatchNfaType::Auto,
636 slots,
637 text,
638 s,
639 e,
640 ),
641 dfa::Result::NoMatch(_) => None,
642 dfa::Result::Quit => {
643 self.captures_nfa(slots, text, start)
644 }
645 }
646 }
647 }
648 #[cfg(feature = "perf-dfa")]
649 MatchType::DfaAnchoredReverse => {
650 match self.find_dfa_anchored_reverse(text, start) {
651 dfa::Result::Match((s, e)) => self.captures_nfa_type(
652 MatchNfaType::Auto,
653 slots,
654 text,
655 s,
656 e,
657 ),
658 dfa::Result::NoMatch(_) => None,
659 dfa::Result::Quit => self.captures_nfa(slots, text, start),
660 }
661 }
662 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
663 MatchType::DfaSuffix => {
664 match self.find_dfa_reverse_suffix(text, start) {
665 dfa::Result::Match((s, e)) => self.captures_nfa_type(
666 MatchNfaType::Auto,
667 slots,
668 text,
669 s,
670 e,
671 ),
672 dfa::Result::NoMatch(_) => None,
673 dfa::Result::Quit => self.captures_nfa(slots, text, start),
674 }
675 }
676 MatchType::Nfa(ty) => {
677 self.captures_nfa_type(ty, slots, text, start, text.len())
678 }
679 MatchType::Nothing => None,
680 #[cfg(feature = "perf-dfa")]
681 MatchType::DfaMany => {
682 unreachable!("BUG: RegexSet cannot be used with captures")
683 }
684 }
685 }
686 }
687
688 impl<'c> ExecNoSync<'c> {
689 /// Finds the leftmost-first match using only literal search.
690 #[cfg(feature = "perf-literal")]
691 #[cfg_attr(feature = "perf-inline", inline(always))]
find_literals( &self, ty: MatchLiteralType, text: &[u8], start: usize, ) -> Option<(usize, usize)>692 fn find_literals(
693 &self,
694 ty: MatchLiteralType,
695 text: &[u8],
696 start: usize,
697 ) -> Option<(usize, usize)> {
698 use self::MatchLiteralType::*;
699 match ty {
700 Unanchored => {
701 let lits = &self.ro.nfa.prefixes;
702 lits.find(&text[start..]).map(|(s, e)| (start + s, start + e))
703 }
704 AnchoredStart => {
705 let lits = &self.ro.nfa.prefixes;
706 if start == 0 || !self.ro.nfa.is_anchored_start {
707 lits.find_start(&text[start..])
708 .map(|(s, e)| (start + s, start + e))
709 } else {
710 None
711 }
712 }
713 AnchoredEnd => {
714 let lits = &self.ro.suffixes;
715 lits.find_end(&text[start..])
716 .map(|(s, e)| (start + s, start + e))
717 }
718 AhoCorasick => self
719 .ro
720 .ac
721 .as_ref()
722 .unwrap()
723 .find(&text[start..])
724 .map(|m| (start + m.start(), start + m.end())),
725 }
726 }
727
728 /// Finds the leftmost-first match (start and end) using only the DFA.
729 ///
730 /// If the result returned indicates that the DFA quit, then another
731 /// matching engine should be used.
732 #[cfg(feature = "perf-dfa")]
733 #[cfg_attr(feature = "perf-inline", inline(always))]
find_dfa_forward( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>734 fn find_dfa_forward(
735 &self,
736 text: &[u8],
737 start: usize,
738 ) -> dfa::Result<(usize, usize)> {
739 use crate::dfa::Result::*;
740 let end = match dfa::Fsm::forward(
741 &self.ro.dfa,
742 self.cache.value(),
743 false,
744 text,
745 start,
746 ) {
747 NoMatch(i) => return NoMatch(i),
748 Quit => return Quit,
749 Match(end) if start == end => return Match((start, start)),
750 Match(end) => end,
751 };
752 // Now run the DFA in reverse to find the start of the match.
753 match dfa::Fsm::reverse(
754 &self.ro.dfa_reverse,
755 self.cache.value(),
756 false,
757 &text[start..],
758 end - start,
759 ) {
760 Match(s) => Match((start + s, end)),
761 NoMatch(i) => NoMatch(i),
762 Quit => Quit,
763 }
764 }
765
766 /// Finds the leftmost-first match (start and end) using only the DFA,
767 /// but assumes the regex is anchored at the end and therefore starts at
768 /// the end of the regex and matches in reverse.
769 ///
770 /// If the result returned indicates that the DFA quit, then another
771 /// matching engine should be used.
772 #[cfg(feature = "perf-dfa")]
773 #[cfg_attr(feature = "perf-inline", inline(always))]
find_dfa_anchored_reverse( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>774 fn find_dfa_anchored_reverse(
775 &self,
776 text: &[u8],
777 start: usize,
778 ) -> dfa::Result<(usize, usize)> {
779 use crate::dfa::Result::*;
780 match dfa::Fsm::reverse(
781 &self.ro.dfa_reverse,
782 self.cache.value(),
783 false,
784 &text[start..],
785 text.len() - start,
786 ) {
787 Match(s) => Match((start + s, text.len())),
788 NoMatch(i) => NoMatch(i),
789 Quit => Quit,
790 }
791 }
792
793 /// Finds the end of the shortest match using only the DFA.
794 #[cfg(feature = "perf-dfa")]
795 #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize>796 fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
797 dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start)
798 }
799
800 /// Finds the end of the shortest match using only the DFA by scanning for
801 /// suffix literals.
802 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
803 #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<usize>804 fn shortest_dfa_reverse_suffix(
805 &self,
806 text: &[u8],
807 start: usize,
808 ) -> dfa::Result<usize> {
809 match self.exec_dfa_reverse_suffix(text, start) {
810 None => self.shortest_dfa(text, start),
811 Some(r) => r.map(|(_, end)| end),
812 }
813 }
814
815 /// Finds the end of the shortest match using only the DFA by scanning for
816 /// suffix literals. It also reports the start of the match.
817 ///
818 /// Note that if None is returned, then the optimization gave up to avoid
819 /// worst case quadratic behavior. A forward scanning DFA should be tried
820 /// next.
821 ///
822 /// If a match is returned and the full leftmost-first match is desired,
823 /// then a forward scan starting from the beginning of the match must be
824 /// done.
825 ///
826 /// If the result returned indicates that the DFA quit, then another
827 /// matching engine should be used.
828 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
829 #[cfg_attr(feature = "perf-inline", inline(always))]
exec_dfa_reverse_suffix( &self, text: &[u8], original_start: usize, ) -> Option<dfa::Result<(usize, usize)>>830 fn exec_dfa_reverse_suffix(
831 &self,
832 text: &[u8],
833 original_start: usize,
834 ) -> Option<dfa::Result<(usize, usize)>> {
835 use crate::dfa::Result::*;
836
837 let lcs = self.ro.suffixes.lcs();
838 debug_assert!(lcs.len() >= 1);
839 let mut start = original_start;
840 let mut end = start;
841 let mut last_literal = start;
842 while end <= text.len() {
843 last_literal += match lcs.find(&text[last_literal..]) {
844 None => return Some(NoMatch(text.len())),
845 Some(i) => i,
846 };
847 end = last_literal + lcs.len();
848 match dfa::Fsm::reverse(
849 &self.ro.dfa_reverse,
850 self.cache.value(),
851 false,
852 &text[start..end],
853 end - start,
854 ) {
855 Match(0) | NoMatch(0) => return None,
856 Match(i) => return Some(Match((start + i, end))),
857 NoMatch(i) => {
858 start += i;
859 last_literal += 1;
860 continue;
861 }
862 Quit => return Some(Quit),
863 };
864 }
865 Some(NoMatch(text.len()))
866 }
867
868 /// Finds the leftmost-first match (start and end) using only the DFA
869 /// by scanning for suffix literals.
870 ///
871 /// If the result returned indicates that the DFA quit, then another
872 /// matching engine should be used.
873 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
874 #[cfg_attr(feature = "perf-inline", inline(always))]
find_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>875 fn find_dfa_reverse_suffix(
876 &self,
877 text: &[u8],
878 start: usize,
879 ) -> dfa::Result<(usize, usize)> {
880 use crate::dfa::Result::*;
881
882 let match_start = match self.exec_dfa_reverse_suffix(text, start) {
883 None => return self.find_dfa_forward(text, start),
884 Some(Match((start, _))) => start,
885 Some(r) => return r,
886 };
887 // At this point, we've found a match. The only way to quit now
888 // without a match is if the DFA gives up (seems unlikely).
889 //
890 // Now run the DFA forwards to find the proper end of the match.
891 // (The suffix literal match can only indicate the earliest
892 // possible end location, which may appear before the end of the
893 // leftmost-first match.)
894 match dfa::Fsm::forward(
895 &self.ro.dfa,
896 self.cache.value(),
897 false,
898 text,
899 match_start,
900 ) {
901 NoMatch(_) => panic!("BUG: reverse match implies forward match"),
902 Quit => Quit,
903 Match(e) => Match((match_start, e)),
904 }
905 }
906
907 /// Executes the NFA engine to return whether there is a match or not.
908 ///
909 /// Ideally, we could use shortest_nfa(...).is_some() and get the same
910 /// performance characteristics, but regex sets don't have captures, which
911 /// shortest_nfa depends on.
912 #[cfg(feature = "perf-dfa")]
match_nfa(&self, text: &[u8], start: usize) -> bool913 fn match_nfa(&self, text: &[u8], start: usize) -> bool {
914 self.match_nfa_type(MatchNfaType::Auto, text, start)
915 }
916
917 /// Like match_nfa, but allows specification of the type of NFA engine.
match_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> bool918 fn match_nfa_type(
919 &self,
920 ty: MatchNfaType,
921 text: &[u8],
922 start: usize,
923 ) -> bool {
924 self.exec_nfa(
925 ty,
926 &mut [false],
927 &mut [],
928 true,
929 false,
930 text,
931 start,
932 text.len(),
933 )
934 }
935
936 /// Finds the shortest match using an NFA.
937 #[cfg(feature = "perf-dfa")]
shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize>938 fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
939 self.shortest_nfa_type(MatchNfaType::Auto, text, start)
940 }
941
942 /// Like shortest_nfa, but allows specification of the type of NFA engine.
shortest_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<usize>943 fn shortest_nfa_type(
944 &self,
945 ty: MatchNfaType,
946 text: &[u8],
947 start: usize,
948 ) -> Option<usize> {
949 let mut slots = [None, None];
950 if self.exec_nfa(
951 ty,
952 &mut [false],
953 &mut slots,
954 true,
955 true,
956 text,
957 start,
958 text.len(),
959 ) {
960 slots[1]
961 } else {
962 None
963 }
964 }
965
966 /// Like find, but executes an NFA engine.
find_nfa( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<(usize, usize)>967 fn find_nfa(
968 &self,
969 ty: MatchNfaType,
970 text: &[u8],
971 start: usize,
972 ) -> Option<(usize, usize)> {
973 let mut slots = [None, None];
974 if self.exec_nfa(
975 ty,
976 &mut [false],
977 &mut slots,
978 false,
979 false,
980 text,
981 start,
982 text.len(),
983 ) {
984 match (slots[0], slots[1]) {
985 (Some(s), Some(e)) => Some((s, e)),
986 _ => None,
987 }
988 } else {
989 None
990 }
991 }
992
993 /// Like find_nfa, but fills in captures.
994 ///
995 /// `slots` should have length equal to `2 * nfa.captures.len()`.
996 #[cfg(feature = "perf-dfa")]
captures_nfa( &self, slots: &mut [Slot], text: &[u8], start: usize, ) -> Option<(usize, usize)>997 fn captures_nfa(
998 &self,
999 slots: &mut [Slot],
1000 text: &[u8],
1001 start: usize,
1002 ) -> Option<(usize, usize)> {
1003 self.captures_nfa_type(
1004 MatchNfaType::Auto,
1005 slots,
1006 text,
1007 start,
1008 text.len(),
1009 )
1010 }
1011
1012 /// Like captures_nfa, but allows specification of type of NFA engine.
captures_nfa_type( &self, ty: MatchNfaType, slots: &mut [Slot], text: &[u8], start: usize, end: usize, ) -> Option<(usize, usize)>1013 fn captures_nfa_type(
1014 &self,
1015 ty: MatchNfaType,
1016 slots: &mut [Slot],
1017 text: &[u8],
1018 start: usize,
1019 end: usize,
1020 ) -> Option<(usize, usize)> {
1021 if self.exec_nfa(
1022 ty,
1023 &mut [false],
1024 slots,
1025 false,
1026 false,
1027 text,
1028 start,
1029 end,
1030 ) {
1031 match (slots[0], slots[1]) {
1032 (Some(s), Some(e)) => Some((s, e)),
1033 _ => None,
1034 }
1035 } else {
1036 None
1037 }
1038 }
1039
exec_nfa( &self, mut ty: MatchNfaType, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, quit_after_match_with_pos: bool, text: &[u8], start: usize, end: usize, ) -> bool1040 fn exec_nfa(
1041 &self,
1042 mut ty: MatchNfaType,
1043 matches: &mut [bool],
1044 slots: &mut [Slot],
1045 quit_after_match: bool,
1046 quit_after_match_with_pos: bool,
1047 text: &[u8],
1048 start: usize,
1049 end: usize,
1050 ) -> bool {
1051 use self::MatchNfaType::*;
1052 if let Auto = ty {
1053 if backtrack::should_exec(self.ro.nfa.len(), text.len()) {
1054 ty = Backtrack;
1055 } else {
1056 ty = PikeVM;
1057 }
1058 }
1059 // The backtracker can't return the shortest match position as it is
1060 // implemented today. So if someone calls `shortest_match` and we need
1061 // to run an NFA, then use the PikeVM.
1062 if quit_after_match_with_pos || ty == PikeVM {
1063 self.exec_pikevm(
1064 matches,
1065 slots,
1066 quit_after_match,
1067 text,
1068 start,
1069 end,
1070 )
1071 } else {
1072 self.exec_backtrack(matches, slots, text, start, end)
1073 }
1074 }
1075
1076 /// Always run the NFA algorithm.
exec_pikevm( &self, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, text: &[u8], start: usize, end: usize, ) -> bool1077 fn exec_pikevm(
1078 &self,
1079 matches: &mut [bool],
1080 slots: &mut [Slot],
1081 quit_after_match: bool,
1082 text: &[u8],
1083 start: usize,
1084 end: usize,
1085 ) -> bool {
1086 if self.ro.nfa.uses_bytes() {
1087 pikevm::Fsm::exec(
1088 &self.ro.nfa,
1089 self.cache.value(),
1090 matches,
1091 slots,
1092 quit_after_match,
1093 ByteInput::new(text, self.ro.nfa.only_utf8),
1094 start,
1095 end,
1096 )
1097 } else {
1098 pikevm::Fsm::exec(
1099 &self.ro.nfa,
1100 self.cache.value(),
1101 matches,
1102 slots,
1103 quit_after_match,
1104 CharInput::new(text),
1105 start,
1106 end,
1107 )
1108 }
1109 }
1110
1111 /// Always runs the NFA using bounded backtracking.
exec_backtrack( &self, matches: &mut [bool], slots: &mut [Slot], text: &[u8], start: usize, end: usize, ) -> bool1112 fn exec_backtrack(
1113 &self,
1114 matches: &mut [bool],
1115 slots: &mut [Slot],
1116 text: &[u8],
1117 start: usize,
1118 end: usize,
1119 ) -> bool {
1120 if self.ro.nfa.uses_bytes() {
1121 backtrack::Bounded::exec(
1122 &self.ro.nfa,
1123 self.cache.value(),
1124 matches,
1125 slots,
1126 ByteInput::new(text, self.ro.nfa.only_utf8),
1127 start,
1128 end,
1129 )
1130 } else {
1131 backtrack::Bounded::exec(
1132 &self.ro.nfa,
1133 self.cache.value(),
1134 matches,
1135 slots,
1136 CharInput::new(text),
1137 start,
1138 end,
1139 )
1140 }
1141 }
1142
1143 /// Finds which regular expressions match the given text.
1144 ///
1145 /// `matches` should have length equal to the number of regexes being
1146 /// searched.
1147 ///
1148 /// This is only useful when one wants to know which regexes in a set
1149 /// match some text.
many_matches_at( &self, matches: &mut [bool], text: &[u8], start: usize, ) -> bool1150 pub fn many_matches_at(
1151 &self,
1152 matches: &mut [bool],
1153 text: &[u8],
1154 start: usize,
1155 ) -> bool {
1156 use self::MatchType::*;
1157 if !self.is_anchor_end_match(text) {
1158 return false;
1159 }
1160 match self.ro.match_type {
1161 #[cfg(feature = "perf-literal")]
1162 Literal(ty) => {
1163 debug_assert_eq!(matches.len(), 1);
1164 matches[0] = self.find_literals(ty, text, start).is_some();
1165 matches[0]
1166 }
1167 #[cfg(feature = "perf-dfa")]
1168 Dfa | DfaAnchoredReverse | DfaMany => {
1169 match dfa::Fsm::forward_many(
1170 &self.ro.dfa,
1171 self.cache.value(),
1172 matches,
1173 text,
1174 start,
1175 ) {
1176 dfa::Result::Match(_) => true,
1177 dfa::Result::NoMatch(_) => false,
1178 dfa::Result::Quit => self.exec_nfa(
1179 MatchNfaType::Auto,
1180 matches,
1181 &mut [],
1182 false,
1183 false,
1184 text,
1185 start,
1186 text.len(),
1187 ),
1188 }
1189 }
1190 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1191 DfaSuffix => {
1192 match dfa::Fsm::forward_many(
1193 &self.ro.dfa,
1194 self.cache.value(),
1195 matches,
1196 text,
1197 start,
1198 ) {
1199 dfa::Result::Match(_) => true,
1200 dfa::Result::NoMatch(_) => false,
1201 dfa::Result::Quit => self.exec_nfa(
1202 MatchNfaType::Auto,
1203 matches,
1204 &mut [],
1205 false,
1206 false,
1207 text,
1208 start,
1209 text.len(),
1210 ),
1211 }
1212 }
1213 Nfa(ty) => self.exec_nfa(
1214 ty,
1215 matches,
1216 &mut [],
1217 false,
1218 false,
1219 text,
1220 start,
1221 text.len(),
1222 ),
1223 Nothing => false,
1224 }
1225 }
1226
1227 #[cfg_attr(feature = "perf-inline", inline(always))]
is_anchor_end_match(&self, text: &[u8]) -> bool1228 fn is_anchor_end_match(&self, text: &[u8]) -> bool {
1229 #[cfg(not(feature = "perf-literal"))]
1230 fn imp(_: &ExecReadOnly, _: &[u8]) -> bool {
1231 true
1232 }
1233
1234 #[cfg(feature = "perf-literal")]
1235 fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool {
1236 // Only do this check if the haystack is big (>1MB).
1237 if text.len() > (1 << 20) && ro.nfa.is_anchored_end {
1238 let lcs = ro.suffixes.lcs();
1239 if lcs.len() >= 1 && !lcs.is_suffix(text) {
1240 return false;
1241 }
1242 }
1243 true
1244 }
1245
1246 imp(&self.ro, text)
1247 }
1248
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1249 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1250 &self.ro.nfa.capture_name_idx
1251 }
1252 }
1253
1254 impl<'c> ExecNoSyncStr<'c> {
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1255 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1256 self.0.capture_name_idx()
1257 }
1258 }
1259
1260 impl Exec {
1261 /// Get a searcher that isn't Sync.
1262 #[cfg_attr(feature = "perf-inline", inline(always))]
searcher(&self) -> ExecNoSync<'_>1263 pub fn searcher(&self) -> ExecNoSync<'_> {
1264 ExecNoSync {
1265 ro: &self.ro, // a clone is too expensive here! (and not needed)
1266 cache: self.pool.get(),
1267 }
1268 }
1269
1270 /// Get a searcher that isn't Sync and can match on &str.
1271 #[cfg_attr(feature = "perf-inline", inline(always))]
searcher_str(&self) -> ExecNoSyncStr<'_>1272 pub fn searcher_str(&self) -> ExecNoSyncStr<'_> {
1273 ExecNoSyncStr(self.searcher())
1274 }
1275
1276 /// Build a Regex from this executor.
into_regex(self) -> re_unicode::Regex1277 pub fn into_regex(self) -> re_unicode::Regex {
1278 re_unicode::Regex::from(self)
1279 }
1280
1281 /// Build a RegexSet from this executor.
into_regex_set(self) -> re_set::unicode::RegexSet1282 pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
1283 re_set::unicode::RegexSet::from(self)
1284 }
1285
1286 /// Build a Regex from this executor that can match arbitrary bytes.
into_byte_regex(self) -> re_bytes::Regex1287 pub fn into_byte_regex(self) -> re_bytes::Regex {
1288 re_bytes::Regex::from(self)
1289 }
1290
1291 /// Build a RegexSet from this executor that can match arbitrary bytes.
into_byte_regex_set(self) -> re_set::bytes::RegexSet1292 pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
1293 re_set::bytes::RegexSet::from(self)
1294 }
1295
1296 /// The original regular expressions given by the caller that were
1297 /// compiled.
regex_strings(&self) -> &[String]1298 pub fn regex_strings(&self) -> &[String] {
1299 &self.ro.res
1300 }
1301
1302 /// Return a slice of capture names.
1303 ///
1304 /// Any capture that isn't named is None.
capture_names(&self) -> &[Option<String>]1305 pub fn capture_names(&self) -> &[Option<String>] {
1306 &self.ro.nfa.captures
1307 }
1308
1309 /// Return a reference to named groups mapping (from group name to
1310 /// group position).
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1311 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1312 &self.ro.nfa.capture_name_idx
1313 }
1314 }
1315
1316 impl Clone for Exec {
clone(&self) -> Exec1317 fn clone(&self) -> Exec {
1318 let pool = ExecReadOnly::new_pool(&self.ro);
1319 Exec { ro: self.ro.clone(), pool }
1320 }
1321 }
1322
1323 impl ExecReadOnly {
choose_match_type(&self, hint: Option<MatchType>) -> MatchType1324 fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
1325 if let Some(MatchType::Nfa(_)) = hint {
1326 return hint.unwrap();
1327 }
1328 // If the NFA is empty, then we'll never match anything.
1329 if self.nfa.insts.is_empty() {
1330 return MatchType::Nothing;
1331 }
1332 if let Some(literalty) = self.choose_literal_match_type() {
1333 return literalty;
1334 }
1335 if let Some(dfaty) = self.choose_dfa_match_type() {
1336 return dfaty;
1337 }
1338 // We're so totally hosed.
1339 MatchType::Nfa(MatchNfaType::Auto)
1340 }
1341
1342 /// If a plain literal scan can be used, then a corresponding literal
1343 /// search type is returned.
choose_literal_match_type(&self) -> Option<MatchType>1344 fn choose_literal_match_type(&self) -> Option<MatchType> {
1345 #[cfg(not(feature = "perf-literal"))]
1346 fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1347 None
1348 }
1349
1350 #[cfg(feature = "perf-literal")]
1351 fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1352 // If our set of prefixes is complete, then we can use it to find
1353 // a match in lieu of a regex engine. This doesn't quite work well
1354 // in the presence of multiple regexes, so only do it when there's
1355 // one.
1356 //
1357 // TODO(burntsushi): Also, don't try to match literals if the regex
1358 // is partially anchored. We could technically do it, but we'd need
1359 // to create two sets of literals: all of them and then the subset
1360 // that aren't anchored. We would then only search for all of them
1361 // when at the beginning of the input and use the subset in all
1362 // other cases.
1363 if ro.res.len() != 1 {
1364 return None;
1365 }
1366 if ro.ac.is_some() {
1367 return Some(MatchType::Literal(
1368 MatchLiteralType::AhoCorasick,
1369 ));
1370 }
1371 if ro.nfa.prefixes.complete() {
1372 return if ro.nfa.is_anchored_start {
1373 Some(MatchType::Literal(MatchLiteralType::AnchoredStart))
1374 } else {
1375 Some(MatchType::Literal(MatchLiteralType::Unanchored))
1376 };
1377 }
1378 if ro.suffixes.complete() {
1379 return if ro.nfa.is_anchored_end {
1380 Some(MatchType::Literal(MatchLiteralType::AnchoredEnd))
1381 } else {
1382 // This case shouldn't happen. When the regex isn't
1383 // anchored, then complete prefixes should imply complete
1384 // suffixes.
1385 Some(MatchType::Literal(MatchLiteralType::Unanchored))
1386 };
1387 }
1388 None
1389 }
1390
1391 imp(self)
1392 }
1393
1394 /// If a DFA scan can be used, then choose the appropriate DFA strategy.
choose_dfa_match_type(&self) -> Option<MatchType>1395 fn choose_dfa_match_type(&self) -> Option<MatchType> {
1396 #[cfg(not(feature = "perf-dfa"))]
1397 fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1398 None
1399 }
1400
1401 #[cfg(feature = "perf-dfa")]
1402 fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1403 if !dfa::can_exec(&ro.dfa) {
1404 return None;
1405 }
1406 // Regex sets require a slightly specialized path.
1407 if ro.res.len() >= 2 {
1408 return Some(MatchType::DfaMany);
1409 }
1410 // If the regex is anchored at the end but not the start, then
1411 // just match in reverse from the end of the haystack.
1412 if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end {
1413 return Some(MatchType::DfaAnchoredReverse);
1414 }
1415 #[cfg(feature = "perf-literal")]
1416 {
1417 // If there's a longish suffix literal, then it might be faster
1418 // to look for that first.
1419 if ro.should_suffix_scan() {
1420 return Some(MatchType::DfaSuffix);
1421 }
1422 }
1423 // Fall back to your garden variety forward searching lazy DFA.
1424 Some(MatchType::Dfa)
1425 }
1426
1427 imp(self)
1428 }
1429
1430 /// Returns true if the program is amenable to suffix scanning.
1431 ///
1432 /// When this is true, as a heuristic, we assume it is OK to quickly scan
1433 /// for suffix literals and then do a *reverse* DFA match from any matches
1434 /// produced by the literal scan. (And then followed by a forward DFA
1435 /// search, since the previously found suffix literal maybe not actually be
1436 /// the end of a match.)
1437 ///
1438 /// This is a bit of a specialized optimization, but can result in pretty
1439 /// big performance wins if 1) there are no prefix literals and 2) the
1440 /// suffix literals are pretty rare in the text. (1) is obviously easy to
1441 /// account for but (2) is harder. As a proxy, we assume that longer
1442 /// strings are generally rarer, so we only enable this optimization when
1443 /// we have a meaty suffix.
1444 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
should_suffix_scan(&self) -> bool1445 fn should_suffix_scan(&self) -> bool {
1446 if self.suffixes.is_empty() {
1447 return false;
1448 }
1449 let lcs_len = self.suffixes.lcs().char_len();
1450 lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
1451 }
1452
new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>>1453 fn new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>> {
1454 let ro = ro.clone();
1455 Box::new(Pool::new(Box::new(move || {
1456 AssertUnwindSafe(RefCell::new(ProgramCacheInner::new(&ro)))
1457 })))
1458 }
1459 }
1460
1461 #[derive(Clone, Copy, Debug)]
1462 enum MatchType {
1463 /// A single or multiple literal search. This is only used when the regex
1464 /// can be decomposed into a literal search.
1465 #[cfg(feature = "perf-literal")]
1466 Literal(MatchLiteralType),
1467 /// A normal DFA search.
1468 #[cfg(feature = "perf-dfa")]
1469 Dfa,
1470 /// A reverse DFA search starting from the end of a haystack.
1471 #[cfg(feature = "perf-dfa")]
1472 DfaAnchoredReverse,
1473 /// A reverse DFA search with suffix literal scanning.
1474 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1475 DfaSuffix,
1476 /// Use the DFA on two or more regular expressions.
1477 #[cfg(feature = "perf-dfa")]
1478 DfaMany,
1479 /// An NFA variant.
1480 Nfa(MatchNfaType),
1481 /// No match is ever possible, so don't ever try to search.
1482 Nothing,
1483 }
1484
1485 #[derive(Clone, Copy, Debug)]
1486 #[cfg(feature = "perf-literal")]
1487 enum MatchLiteralType {
1488 /// Match literals anywhere in text.
1489 Unanchored,
1490 /// Match literals only at the start of text.
1491 AnchoredStart,
1492 /// Match literals only at the end of text.
1493 AnchoredEnd,
1494 /// Use an Aho-Corasick automaton. This requires `ac` to be Some on
1495 /// ExecReadOnly.
1496 AhoCorasick,
1497 }
1498
1499 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
1500 enum MatchNfaType {
1501 /// Choose between Backtrack and PikeVM.
1502 Auto,
1503 /// NFA bounded backtracking.
1504 ///
1505 /// (This is only set by tests, since it never makes sense to always want
1506 /// backtracking.)
1507 Backtrack,
1508 /// The Pike VM.
1509 ///
1510 /// (This is only set by tests, since it never makes sense to always want
1511 /// the Pike VM.)
1512 PikeVM,
1513 }
1514
1515 /// `ProgramCache` maintains reusable allocations for each matching engine
1516 /// available to a particular program.
1517 ///
1518 /// We declare this as unwind safe since it's a cache that's only used for
1519 /// performance purposes. If a panic occurs, it is (or should be) always safe
1520 /// to continue using the same regex object.
1521 pub type ProgramCache = AssertUnwindSafe<RefCell<ProgramCacheInner>>;
1522
1523 #[derive(Debug)]
1524 pub struct ProgramCacheInner {
1525 pub pikevm: pikevm::Cache,
1526 pub backtrack: backtrack::Cache,
1527 #[cfg(feature = "perf-dfa")]
1528 pub dfa: dfa::Cache,
1529 #[cfg(feature = "perf-dfa")]
1530 pub dfa_reverse: dfa::Cache,
1531 }
1532
1533 impl ProgramCacheInner {
new(ro: &ExecReadOnly) -> Self1534 fn new(ro: &ExecReadOnly) -> Self {
1535 ProgramCacheInner {
1536 pikevm: pikevm::Cache::new(&ro.nfa),
1537 backtrack: backtrack::Cache::new(&ro.nfa),
1538 #[cfg(feature = "perf-dfa")]
1539 dfa: dfa::Cache::new(&ro.dfa),
1540 #[cfg(feature = "perf-dfa")]
1541 dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
1542 }
1543 }
1544 }
1545
1546 /// Alternation literals checks if the given HIR is a simple alternation of
1547 /// literals, and if so, returns them. Otherwise, this returns None.
1548 #[cfg(feature = "perf-literal")]
alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>>1549 fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
1550 use regex_syntax::hir::{HirKind, Literal};
1551
1552 // This is pretty hacky, but basically, if `is_alternation_literal` is
1553 // true, then we can make several assumptions about the structure of our
1554 // HIR. This is what justifies the `unreachable!` statements below.
1555 //
1556 // This code should be refactored once we overhaul this crate's
1557 // optimization pipeline, because this is a terribly inflexible way to go
1558 // about things.
1559
1560 if !expr.is_alternation_literal() {
1561 return None;
1562 }
1563 let alts = match *expr.kind() {
1564 HirKind::Alternation(ref alts) => alts,
1565 _ => return None, // one literal isn't worth it
1566 };
1567
1568 let extendlit = |lit: &Literal, dst: &mut Vec<u8>| match *lit {
1569 Literal::Unicode(c) => {
1570 let mut buf = [0; 4];
1571 dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
1572 }
1573 Literal::Byte(b) => {
1574 dst.push(b);
1575 }
1576 };
1577
1578 let mut lits = vec![];
1579 for alt in alts {
1580 let mut lit = vec![];
1581 match *alt.kind() {
1582 HirKind::Literal(ref x) => extendlit(x, &mut lit),
1583 HirKind::Concat(ref exprs) => {
1584 for e in exprs {
1585 match *e.kind() {
1586 HirKind::Literal(ref x) => extendlit(x, &mut lit),
1587 _ => unreachable!("expected literal, got {:?}", e),
1588 }
1589 }
1590 }
1591 _ => unreachable!("expected literal or concat, got {:?}", alt),
1592 }
1593 lits.push(lit);
1594 }
1595 Some(lits)
1596 }
1597
1598 #[cfg(test)]
1599 mod test {
1600 #[test]
uppercut_s_backtracking_bytes_default_bytes_mismatch()1601 fn uppercut_s_backtracking_bytes_default_bytes_mismatch() {
1602 use crate::internal::ExecBuilder;
1603
1604 let backtrack_bytes_re = ExecBuilder::new("^S")
1605 .bounded_backtracking()
1606 .only_utf8(false)
1607 .build()
1608 .map(|exec| exec.into_byte_regex())
1609 .map_err(|err| format!("{}", err))
1610 .unwrap();
1611
1612 let default_bytes_re = ExecBuilder::new("^S")
1613 .only_utf8(false)
1614 .build()
1615 .map(|exec| exec.into_byte_regex())
1616 .map_err(|err| format!("{}", err))
1617 .unwrap();
1618
1619 let input = vec![83, 83];
1620
1621 let s1 = backtrack_bytes_re.split(&input);
1622 let s2 = default_bytes_re.split(&input);
1623 for (chunk1, chunk2) in s1.zip(s2) {
1624 assert_eq!(chunk1, chunk2);
1625 }
1626 }
1627
1628 #[test]
unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch()1629 fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() {
1630 use crate::internal::ExecBuilder;
1631
1632 let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1633 .bounded_backtracking()
1634 .bytes(true)
1635 .build()
1636 .map(|exec| exec.into_regex())
1637 .map_err(|err| format!("{}", err))
1638 .unwrap();
1639
1640 let default_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1641 .bytes(true)
1642 .build()
1643 .map(|exec| exec.into_regex())
1644 .map_err(|err| format!("{}", err))
1645 .unwrap();
1646
1647 let input = "**";
1648
1649 let s1 = backtrack_bytes_re.split(input);
1650 let s2 = default_bytes_re.split(input);
1651 for (chunk1, chunk2) in s1.zip(s2) {
1652 assert_eq!(chunk1, chunk2);
1653 }
1654 }
1655 }
1656