• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use std::cell::RefCell;
2 use std::collections::HashMap;
3 use std::panic::AssertUnwindSafe;
4 use std::sync::Arc;
5 
6 #[cfg(feature = "perf-literal")]
7 use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
8 use syntax::hir::literal::Literals;
9 use syntax::hir::Hir;
10 use syntax::ParserBuilder;
11 
12 use backtrack;
13 use compile::Compiler;
14 #[cfg(feature = "perf-dfa")]
15 use dfa;
16 use error::Error;
17 use input::{ByteInput, CharInput};
18 use literal::LiteralSearcher;
19 use pikevm;
20 use pool::{Pool, PoolGuard};
21 use prog::Program;
22 use re_builder::RegexOptions;
23 use re_bytes;
24 use re_set;
25 use re_trait::{Locations, RegularExpression, Slot};
26 use re_unicode;
27 use utf8::next_utf8;
28 
29 /// `Exec` manages the execution of a regular expression.
30 ///
31 /// In particular, this manages the various compiled forms of a single regular
32 /// expression and the choice of which matching engine to use to execute a
33 /// regular expression.
34 #[derive(Debug)]
35 pub struct Exec {
36     /// All read only state.
37     ro: Arc<ExecReadOnly>,
38     /// A pool of reusable values for the various matching engines.
39     ///
40     /// Note that boxing this value is not strictly necessary, but it is an
41     /// easy way to ensure that T does not bloat the stack sized used by a pool
42     /// in the case where T is big. And this turns out to be the case at the
43     /// time of writing for regex's use of this pool. At the time of writing,
44     /// the size of a Regex on the stack is 856 bytes. Boxing this value
45     /// reduces that size to 16 bytes.
46     pool: Box<Pool<ProgramCache>>,
47 }
48 
49 /// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
50 /// means it is no longer Sync, but we can now avoid the overhead of
51 /// synchronization to fetch the cache.
52 #[derive(Debug)]
53 pub struct ExecNoSync<'c> {
54     /// All read only state.
55     ro: &'c Arc<ExecReadOnly>,
56     /// Caches for the various matching engines.
57     cache: PoolGuard<'c, ProgramCache>,
58 }
59 
60 /// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
61 #[derive(Debug)]
62 pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
63 
64 /// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
65 /// state is determined at compile time and never changes during search.
66 #[derive(Debug)]
67 struct ExecReadOnly {
68     /// The original regular expressions given by the caller to compile.
69     res: Vec<String>,
70     /// A compiled program that is used in the NFA simulation and backtracking.
71     /// It can be byte-based or Unicode codepoint based.
72     ///
73     /// N.B. It is not possibly to make this byte-based from the public API.
74     /// It is only used for testing byte based programs in the NFA simulations.
75     nfa: Program,
76     /// A compiled byte based program for DFA execution. This is only used
77     /// if a DFA can be executed. (Currently, only word boundary assertions are
78     /// not supported.) Note that this program contains an embedded `.*?`
79     /// preceding the first capture group, unless the regex is anchored at the
80     /// beginning.
81     dfa: Program,
82     /// The same as above, except the program is reversed (and there is no
83     /// preceding `.*?`). This is used by the DFA to find the starting location
84     /// of matches.
85     dfa_reverse: Program,
86     /// A set of suffix literals extracted from the regex.
87     ///
88     /// Prefix literals are stored on the `Program`, since they are used inside
89     /// the matching engines.
90     suffixes: LiteralSearcher,
91     /// An Aho-Corasick automaton with leftmost-first match semantics.
92     ///
93     /// This is only set when the entire regex is a simple unanchored
94     /// alternation of literals. We could probably use it more circumstances,
95     /// but this is already hacky enough in this architecture.
96     ///
97     /// N.B. We use u32 as a state ID representation under the assumption that
98     /// if we were to exhaust the ID space, we probably would have long
99     /// surpassed the compilation size limit.
100     #[cfg(feature = "perf-literal")]
101     ac: Option<AhoCorasick<u32>>,
102     /// match_type encodes as much upfront knowledge about how we're going to
103     /// execute a search as possible.
104     match_type: MatchType,
105 }
106 
107 /// Facilitates the construction of an executor by exposing various knobs
108 /// to control how a regex is executed and what kinds of resources it's
109 /// permitted to use.
110 // `ExecBuilder` is only public via the `internal` module, so avoid deriving
111 // `Debug`.
112 #[allow(missing_debug_implementations)]
113 pub struct ExecBuilder {
114     options: RegexOptions,
115     match_type: Option<MatchType>,
116     bytes: bool,
117     only_utf8: bool,
118 }
119 
120 /// Parsed represents a set of parsed regular expressions and their detected
121 /// literals.
122 struct Parsed {
123     exprs: Vec<Hir>,
124     prefixes: Literals,
125     suffixes: Literals,
126     bytes: bool,
127 }
128 
129 impl ExecBuilder {
130     /// Create a regex execution builder.
131     ///
132     /// This uses default settings for everything except the regex itself,
133     /// which must be provided. Further knobs can be set by calling methods,
134     /// and then finally, `build` to actually create the executor.
new(re: &str) -> Self135     pub fn new(re: &str) -> Self {
136         Self::new_many(&[re])
137     }
138 
139     /// Like new, but compiles the union of the given regular expressions.
140     ///
141     /// Note that when compiling 2 or more regular expressions, capture groups
142     /// are completely unsupported. (This means both `find` and `captures`
143     /// wont work.)
new_many<I, S>(res: I) -> Self where S: AsRef<str>, I: IntoIterator<Item = S>,144     pub fn new_many<I, S>(res: I) -> Self
145     where
146         S: AsRef<str>,
147         I: IntoIterator<Item = S>,
148     {
149         let mut opts = RegexOptions::default();
150         opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect();
151         Self::new_options(opts)
152     }
153 
154     /// Create a regex execution builder.
new_options(opts: RegexOptions) -> Self155     pub fn new_options(opts: RegexOptions) -> Self {
156         ExecBuilder {
157             options: opts,
158             match_type: None,
159             bytes: false,
160             only_utf8: true,
161         }
162     }
163 
164     /// Set the matching engine to be automatically determined.
165     ///
166     /// This is the default state and will apply whatever optimizations are
167     /// possible, such as running a DFA.
168     ///
169     /// This overrides whatever was previously set via the `nfa` or
170     /// `bounded_backtracking` methods.
automatic(mut self) -> Self171     pub fn automatic(mut self) -> Self {
172         self.match_type = None;
173         self
174     }
175 
176     /// Sets the matching engine to use the NFA algorithm no matter what
177     /// optimizations are possible.
178     ///
179     /// This overrides whatever was previously set via the `automatic` or
180     /// `bounded_backtracking` methods.
nfa(mut self) -> Self181     pub fn nfa(mut self) -> Self {
182         self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
183         self
184     }
185 
186     /// Sets the matching engine to use a bounded backtracking engine no
187     /// matter what optimizations are possible.
188     ///
189     /// One must use this with care, since the bounded backtracking engine
190     /// uses memory proportion to `len(regex) * len(text)`.
191     ///
192     /// This overrides whatever was previously set via the `automatic` or
193     /// `nfa` methods.
bounded_backtracking(mut self) -> Self194     pub fn bounded_backtracking(mut self) -> Self {
195         self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
196         self
197     }
198 
199     /// Compiles byte based programs for use with the NFA matching engines.
200     ///
201     /// By default, the NFA engines match on Unicode scalar values. They can
202     /// be made to use byte based programs instead. In general, the byte based
203     /// programs are slower because of a less efficient encoding of character
204     /// classes.
205     ///
206     /// Note that this does not impact DFA matching engines, which always
207     /// execute on bytes.
bytes(mut self, yes: bool) -> Self208     pub fn bytes(mut self, yes: bool) -> Self {
209         self.bytes = yes;
210         self
211     }
212 
213     /// When disabled, the program compiled may match arbitrary bytes.
214     ///
215     /// When enabled (the default), all compiled programs exclusively match
216     /// valid UTF-8 bytes.
only_utf8(mut self, yes: bool) -> Self217     pub fn only_utf8(mut self, yes: bool) -> Self {
218         self.only_utf8 = yes;
219         self
220     }
221 
222     /// Set the Unicode flag.
unicode(mut self, yes: bool) -> Self223     pub fn unicode(mut self, yes: bool) -> Self {
224         self.options.unicode = yes;
225         self
226     }
227 
228     /// Parse the current set of patterns into their AST and extract literals.
parse(&self) -> Result<Parsed, Error>229     fn parse(&self) -> Result<Parsed, Error> {
230         let mut exprs = Vec::with_capacity(self.options.pats.len());
231         let mut prefixes = Some(Literals::empty());
232         let mut suffixes = Some(Literals::empty());
233         let mut bytes = false;
234         let is_set = self.options.pats.len() > 1;
235         // If we're compiling a regex set and that set has any anchored
236         // expressions, then disable all literal optimizations.
237         for pat in &self.options.pats {
238             let mut parser = ParserBuilder::new()
239                 .octal(self.options.octal)
240                 .case_insensitive(self.options.case_insensitive)
241                 .multi_line(self.options.multi_line)
242                 .dot_matches_new_line(self.options.dot_matches_new_line)
243                 .swap_greed(self.options.swap_greed)
244                 .ignore_whitespace(self.options.ignore_whitespace)
245                 .unicode(self.options.unicode)
246                 .allow_invalid_utf8(!self.only_utf8)
247                 .nest_limit(self.options.nest_limit)
248                 .build();
249             let expr =
250                 parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?;
251             bytes = bytes || !expr.is_always_utf8();
252 
253             if cfg!(feature = "perf-literal") {
254                 if !expr.is_anchored_start() && expr.is_any_anchored_start() {
255                     // Partial anchors unfortunately make it hard to use
256                     // prefixes, so disable them.
257                     prefixes = None;
258                 } else if is_set && expr.is_anchored_start() {
259                     // Regex sets with anchors do not go well with literal
260                     // optimizations.
261                     prefixes = None;
262                 }
263                 prefixes = prefixes.and_then(|mut prefixes| {
264                     if !prefixes.union_prefixes(&expr) {
265                         None
266                     } else {
267                         Some(prefixes)
268                     }
269                 });
270 
271                 if !expr.is_anchored_end() && expr.is_any_anchored_end() {
272                     // Partial anchors unfortunately make it hard to use
273                     // suffixes, so disable them.
274                     suffixes = None;
275                 } else if is_set && expr.is_anchored_end() {
276                     // Regex sets with anchors do not go well with literal
277                     // optimizations.
278                     suffixes = None;
279                 }
280                 suffixes = suffixes.and_then(|mut suffixes| {
281                     if !suffixes.union_suffixes(&expr) {
282                         None
283                     } else {
284                         Some(suffixes)
285                     }
286                 });
287             }
288             exprs.push(expr);
289         }
290         Ok(Parsed {
291             exprs: exprs,
292             prefixes: prefixes.unwrap_or_else(Literals::empty),
293             suffixes: suffixes.unwrap_or_else(Literals::empty),
294             bytes: bytes,
295         })
296     }
297 
298     /// Build an executor that can run a regular expression.
build(self) -> Result<Exec, Error>299     pub fn build(self) -> Result<Exec, Error> {
300         // Special case when we have no patterns to compile.
301         // This can happen when compiling a regex set.
302         if self.options.pats.is_empty() {
303             let ro = Arc::new(ExecReadOnly {
304                 res: vec![],
305                 nfa: Program::new(),
306                 dfa: Program::new(),
307                 dfa_reverse: Program::new(),
308                 suffixes: LiteralSearcher::empty(),
309                 #[cfg(feature = "perf-literal")]
310                 ac: None,
311                 match_type: MatchType::Nothing,
312             });
313             let pool = ExecReadOnly::new_pool(&ro);
314             return Ok(Exec { ro: ro, pool });
315         }
316         let parsed = self.parse()?;
317         let mut nfa = Compiler::new()
318             .size_limit(self.options.size_limit)
319             .bytes(self.bytes || parsed.bytes)
320             .only_utf8(self.only_utf8)
321             .compile(&parsed.exprs)?;
322         let mut dfa = Compiler::new()
323             .size_limit(self.options.size_limit)
324             .dfa(true)
325             .only_utf8(self.only_utf8)
326             .compile(&parsed.exprs)?;
327         let mut dfa_reverse = Compiler::new()
328             .size_limit(self.options.size_limit)
329             .dfa(true)
330             .only_utf8(self.only_utf8)
331             .reverse(true)
332             .compile(&parsed.exprs)?;
333 
334         #[cfg(feature = "perf-literal")]
335         let ac = self.build_aho_corasick(&parsed);
336         nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes);
337         dfa.prefixes = nfa.prefixes.clone();
338         dfa.dfa_size_limit = self.options.dfa_size_limit;
339         dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
340 
341         let mut ro = ExecReadOnly {
342             res: self.options.pats,
343             nfa: nfa,
344             dfa: dfa,
345             dfa_reverse: dfa_reverse,
346             suffixes: LiteralSearcher::suffixes(parsed.suffixes),
347             #[cfg(feature = "perf-literal")]
348             ac: ac,
349             match_type: MatchType::Nothing,
350         };
351         ro.match_type = ro.choose_match_type(self.match_type);
352 
353         let ro = Arc::new(ro);
354         let pool = ExecReadOnly::new_pool(&ro);
355         Ok(Exec { ro, pool })
356     }
357 
358     #[cfg(feature = "perf-literal")]
build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>>359     fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>> {
360         if parsed.exprs.len() != 1 {
361             return None;
362         }
363         let lits = match alternation_literals(&parsed.exprs[0]) {
364             None => return None,
365             Some(lits) => lits,
366         };
367         // If we have a small number of literals, then let Teddy handle
368         // things (see literal/mod.rs).
369         if lits.len() <= 32 {
370             return None;
371         }
372         Some(
373             AhoCorasickBuilder::new()
374                 .match_kind(MatchKind::LeftmostFirst)
375                 .auto_configure(&lits)
376                 // We always want this to reduce size, regardless
377                 // of what auto-configure does.
378                 .byte_classes(true)
379                 .build_with_size::<u32, _, _>(&lits)
380                 // This should never happen because we'd long exceed the
381                 // compilation limit for regexes first.
382                 .expect("AC automaton too big"),
383         )
384     }
385 }
386 
387 impl<'c> RegularExpression for ExecNoSyncStr<'c> {
388     type Text = str;
389 
slots_len(&self) -> usize390     fn slots_len(&self) -> usize {
391         self.0.slots_len()
392     }
393 
next_after_empty(&self, text: &str, i: usize) -> usize394     fn next_after_empty(&self, text: &str, i: usize) -> usize {
395         next_utf8(text.as_bytes(), i)
396     }
397 
398     #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_match_at(&self, text: &str, start: usize) -> Option<usize>399     fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
400         self.0.shortest_match_at(text.as_bytes(), start)
401     }
402 
403     #[cfg_attr(feature = "perf-inline", inline(always))]
is_match_at(&self, text: &str, start: usize) -> bool404     fn is_match_at(&self, text: &str, start: usize) -> bool {
405         self.0.is_match_at(text.as_bytes(), start)
406     }
407 
408     #[cfg_attr(feature = "perf-inline", inline(always))]
find_at(&self, text: &str, start: usize) -> Option<(usize, usize)>409     fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
410         self.0.find_at(text.as_bytes(), start)
411     }
412 
413     #[cfg_attr(feature = "perf-inline", inline(always))]
captures_read_at( &self, locs: &mut Locations, text: &str, start: usize, ) -> Option<(usize, usize)>414     fn captures_read_at(
415         &self,
416         locs: &mut Locations,
417         text: &str,
418         start: usize,
419     ) -> Option<(usize, usize)> {
420         self.0.captures_read_at(locs, text.as_bytes(), start)
421     }
422 }
423 
424 impl<'c> RegularExpression for ExecNoSync<'c> {
425     type Text = [u8];
426 
427     /// Returns the number of capture slots in the regular expression. (There
428     /// are two slots for every capture group, corresponding to possibly empty
429     /// start and end locations of the capture.)
slots_len(&self) -> usize430     fn slots_len(&self) -> usize {
431         self.ro.nfa.captures.len() * 2
432     }
433 
next_after_empty(&self, _text: &[u8], i: usize) -> usize434     fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
435         i + 1
436     }
437 
438     /// Returns the end of a match location, possibly occurring before the
439     /// end location of the correct leftmost-first match.
440     #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize>441     fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
442         if !self.is_anchor_end_match(text) {
443             return None;
444         }
445         match self.ro.match_type {
446             #[cfg(feature = "perf-literal")]
447             MatchType::Literal(ty) => {
448                 self.find_literals(ty, text, start).map(|(_, e)| e)
449             }
450             #[cfg(feature = "perf-dfa")]
451             MatchType::Dfa | MatchType::DfaMany => {
452                 match self.shortest_dfa(text, start) {
453                     dfa::Result::Match(end) => Some(end),
454                     dfa::Result::NoMatch(_) => None,
455                     dfa::Result::Quit => self.shortest_nfa(text, start),
456                 }
457             }
458             #[cfg(feature = "perf-dfa")]
459             MatchType::DfaAnchoredReverse => {
460                 match dfa::Fsm::reverse(
461                     &self.ro.dfa_reverse,
462                     self.cache.value(),
463                     true,
464                     &text[start..],
465                     text.len(),
466                 ) {
467                     dfa::Result::Match(_) => Some(text.len()),
468                     dfa::Result::NoMatch(_) => None,
469                     dfa::Result::Quit => self.shortest_nfa(text, start),
470                 }
471             }
472             #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
473             MatchType::DfaSuffix => {
474                 match self.shortest_dfa_reverse_suffix(text, start) {
475                     dfa::Result::Match(e) => Some(e),
476                     dfa::Result::NoMatch(_) => None,
477                     dfa::Result::Quit => self.shortest_nfa(text, start),
478                 }
479             }
480             MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
481             MatchType::Nothing => None,
482         }
483     }
484 
485     /// Returns true if and only if the regex matches text.
486     ///
487     /// For single regular expressions, this is equivalent to calling
488     /// shortest_match(...).is_some().
489     #[cfg_attr(feature = "perf-inline", inline(always))]
is_match_at(&self, text: &[u8], start: usize) -> bool490     fn is_match_at(&self, text: &[u8], start: usize) -> bool {
491         if !self.is_anchor_end_match(text) {
492             return false;
493         }
494         // We need to do this dance because shortest_match relies on the NFA
495         // filling in captures[1], but a RegexSet has no captures. In other
496         // words, a RegexSet can't (currently) use shortest_match. ---AG
497         match self.ro.match_type {
498             #[cfg(feature = "perf-literal")]
499             MatchType::Literal(ty) => {
500                 self.find_literals(ty, text, start).is_some()
501             }
502             #[cfg(feature = "perf-dfa")]
503             MatchType::Dfa | MatchType::DfaMany => {
504                 match self.shortest_dfa(text, start) {
505                     dfa::Result::Match(_) => true,
506                     dfa::Result::NoMatch(_) => false,
507                     dfa::Result::Quit => self.match_nfa(text, start),
508                 }
509             }
510             #[cfg(feature = "perf-dfa")]
511             MatchType::DfaAnchoredReverse => {
512                 match dfa::Fsm::reverse(
513                     &self.ro.dfa_reverse,
514                     self.cache.value(),
515                     true,
516                     &text[start..],
517                     text.len(),
518                 ) {
519                     dfa::Result::Match(_) => true,
520                     dfa::Result::NoMatch(_) => false,
521                     dfa::Result::Quit => self.match_nfa(text, start),
522                 }
523             }
524             #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
525             MatchType::DfaSuffix => {
526                 match self.shortest_dfa_reverse_suffix(text, start) {
527                     dfa::Result::Match(_) => true,
528                     dfa::Result::NoMatch(_) => false,
529                     dfa::Result::Quit => self.match_nfa(text, start),
530                 }
531             }
532             MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
533             MatchType::Nothing => false,
534         }
535     }
536 
537     /// Finds the start and end location of the leftmost-first match, starting
538     /// at the given location.
539     #[cfg_attr(feature = "perf-inline", inline(always))]
find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)>540     fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
541         if !self.is_anchor_end_match(text) {
542             return None;
543         }
544         match self.ro.match_type {
545             #[cfg(feature = "perf-literal")]
546             MatchType::Literal(ty) => self.find_literals(ty, text, start),
547             #[cfg(feature = "perf-dfa")]
548             MatchType::Dfa => match self.find_dfa_forward(text, start) {
549                 dfa::Result::Match((s, e)) => Some((s, e)),
550                 dfa::Result::NoMatch(_) => None,
551                 dfa::Result::Quit => {
552                     self.find_nfa(MatchNfaType::Auto, text, start)
553                 }
554             },
555             #[cfg(feature = "perf-dfa")]
556             MatchType::DfaAnchoredReverse => {
557                 match self.find_dfa_anchored_reverse(text, start) {
558                     dfa::Result::Match((s, e)) => Some((s, e)),
559                     dfa::Result::NoMatch(_) => None,
560                     dfa::Result::Quit => {
561                         self.find_nfa(MatchNfaType::Auto, text, start)
562                     }
563                 }
564             }
565             #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
566             MatchType::DfaSuffix => {
567                 match self.find_dfa_reverse_suffix(text, start) {
568                     dfa::Result::Match((s, e)) => Some((s, e)),
569                     dfa::Result::NoMatch(_) => None,
570                     dfa::Result::Quit => {
571                         self.find_nfa(MatchNfaType::Auto, text, start)
572                     }
573                 }
574             }
575             MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
576             MatchType::Nothing => None,
577             #[cfg(feature = "perf-dfa")]
578             MatchType::DfaMany => {
579                 unreachable!("BUG: RegexSet cannot be used with find")
580             }
581         }
582     }
583 
584     /// Finds the start and end location of the leftmost-first match and also
585     /// fills in all matching capture groups.
586     ///
587     /// The number of capture slots given should be equal to the total number
588     /// of capture slots in the compiled program.
589     ///
590     /// Note that the first two slots always correspond to the start and end
591     /// locations of the overall match.
captures_read_at( &self, locs: &mut Locations, text: &[u8], start: usize, ) -> Option<(usize, usize)>592     fn captures_read_at(
593         &self,
594         locs: &mut Locations,
595         text: &[u8],
596         start: usize,
597     ) -> Option<(usize, usize)> {
598         let slots = locs.as_slots();
599         for slot in slots.iter_mut() {
600             *slot = None;
601         }
602         // If the caller unnecessarily uses this, then we try to save them
603         // from themselves.
604         match slots.len() {
605             0 => return self.find_at(text, start),
606             2 => {
607                 return self.find_at(text, start).map(|(s, e)| {
608                     slots[0] = Some(s);
609                     slots[1] = Some(e);
610                     (s, e)
611                 });
612             }
613             _ => {} // fallthrough
614         }
615         if !self.is_anchor_end_match(text) {
616             return None;
617         }
618         match self.ro.match_type {
619             #[cfg(feature = "perf-literal")]
620             MatchType::Literal(ty) => {
621                 self.find_literals(ty, text, start).and_then(|(s, e)| {
622                     self.captures_nfa_type(
623                         MatchNfaType::Auto,
624                         slots,
625                         text,
626                         s,
627                         e,
628                     )
629                 })
630             }
631             #[cfg(feature = "perf-dfa")]
632             MatchType::Dfa => {
633                 if self.ro.nfa.is_anchored_start {
634                     self.captures_nfa(slots, text, start)
635                 } else {
636                     match self.find_dfa_forward(text, start) {
637                         dfa::Result::Match((s, e)) => self.captures_nfa_type(
638                             MatchNfaType::Auto,
639                             slots,
640                             text,
641                             s,
642                             e,
643                         ),
644                         dfa::Result::NoMatch(_) => None,
645                         dfa::Result::Quit => {
646                             self.captures_nfa(slots, text, start)
647                         }
648                     }
649                 }
650             }
651             #[cfg(feature = "perf-dfa")]
652             MatchType::DfaAnchoredReverse => {
653                 match self.find_dfa_anchored_reverse(text, start) {
654                     dfa::Result::Match((s, e)) => self.captures_nfa_type(
655                         MatchNfaType::Auto,
656                         slots,
657                         text,
658                         s,
659                         e,
660                     ),
661                     dfa::Result::NoMatch(_) => None,
662                     dfa::Result::Quit => self.captures_nfa(slots, text, start),
663                 }
664             }
665             #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
666             MatchType::DfaSuffix => {
667                 match self.find_dfa_reverse_suffix(text, start) {
668                     dfa::Result::Match((s, e)) => self.captures_nfa_type(
669                         MatchNfaType::Auto,
670                         slots,
671                         text,
672                         s,
673                         e,
674                     ),
675                     dfa::Result::NoMatch(_) => None,
676                     dfa::Result::Quit => self.captures_nfa(slots, text, start),
677                 }
678             }
679             MatchType::Nfa(ty) => {
680                 self.captures_nfa_type(ty, slots, text, start, text.len())
681             }
682             MatchType::Nothing => None,
683             #[cfg(feature = "perf-dfa")]
684             MatchType::DfaMany => {
685                 unreachable!("BUG: RegexSet cannot be used with captures")
686             }
687         }
688     }
689 }
690 
691 impl<'c> ExecNoSync<'c> {
692     /// Finds the leftmost-first match using only literal search.
693     #[cfg(feature = "perf-literal")]
694     #[cfg_attr(feature = "perf-inline", inline(always))]
find_literals( &self, ty: MatchLiteralType, text: &[u8], start: usize, ) -> Option<(usize, usize)>695     fn find_literals(
696         &self,
697         ty: MatchLiteralType,
698         text: &[u8],
699         start: usize,
700     ) -> Option<(usize, usize)> {
701         use self::MatchLiteralType::*;
702         match ty {
703             Unanchored => {
704                 let lits = &self.ro.nfa.prefixes;
705                 lits.find(&text[start..]).map(|(s, e)| (start + s, start + e))
706             }
707             AnchoredStart => {
708                 let lits = &self.ro.nfa.prefixes;
709                 if start == 0 || !self.ro.nfa.is_anchored_start {
710                     lits.find_start(&text[start..])
711                         .map(|(s, e)| (start + s, start + e))
712                 } else {
713                     None
714                 }
715             }
716             AnchoredEnd => {
717                 let lits = &self.ro.suffixes;
718                 lits.find_end(&text[start..])
719                     .map(|(s, e)| (start + s, start + e))
720             }
721             AhoCorasick => self
722                 .ro
723                 .ac
724                 .as_ref()
725                 .unwrap()
726                 .find(&text[start..])
727                 .map(|m| (start + m.start(), start + m.end())),
728         }
729     }
730 
731     /// Finds the leftmost-first match (start and end) using only the DFA.
732     ///
733     /// If the result returned indicates that the DFA quit, then another
734     /// matching engine should be used.
735     #[cfg(feature = "perf-dfa")]
736     #[cfg_attr(feature = "perf-inline", inline(always))]
find_dfa_forward( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>737     fn find_dfa_forward(
738         &self,
739         text: &[u8],
740         start: usize,
741     ) -> dfa::Result<(usize, usize)> {
742         use dfa::Result::*;
743         let end = match dfa::Fsm::forward(
744             &self.ro.dfa,
745             self.cache.value(),
746             false,
747             text,
748             start,
749         ) {
750             NoMatch(i) => return NoMatch(i),
751             Quit => return Quit,
752             Match(end) if start == end => return Match((start, start)),
753             Match(end) => end,
754         };
755         // Now run the DFA in reverse to find the start of the match.
756         match dfa::Fsm::reverse(
757             &self.ro.dfa_reverse,
758             self.cache.value(),
759             false,
760             &text[start..],
761             end - start,
762         ) {
763             Match(s) => Match((start + s, end)),
764             NoMatch(i) => NoMatch(i),
765             Quit => Quit,
766         }
767     }
768 
769     /// Finds the leftmost-first match (start and end) using only the DFA,
770     /// but assumes the regex is anchored at the end and therefore starts at
771     /// the end of the regex and matches in reverse.
772     ///
773     /// If the result returned indicates that the DFA quit, then another
774     /// matching engine should be used.
775     #[cfg(feature = "perf-dfa")]
776     #[cfg_attr(feature = "perf-inline", inline(always))]
find_dfa_anchored_reverse( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>777     fn find_dfa_anchored_reverse(
778         &self,
779         text: &[u8],
780         start: usize,
781     ) -> dfa::Result<(usize, usize)> {
782         use dfa::Result::*;
783         match dfa::Fsm::reverse(
784             &self.ro.dfa_reverse,
785             self.cache.value(),
786             false,
787             &text[start..],
788             text.len() - start,
789         ) {
790             Match(s) => Match((start + s, text.len())),
791             NoMatch(i) => NoMatch(i),
792             Quit => Quit,
793         }
794     }
795 
796     /// Finds the end of the shortest match using only the DFA.
797     #[cfg(feature = "perf-dfa")]
798     #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize>799     fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
800         dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start)
801     }
802 
803     /// Finds the end of the shortest match using only the DFA by scanning for
804     /// suffix literals.
805     #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
806     #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<usize>807     fn shortest_dfa_reverse_suffix(
808         &self,
809         text: &[u8],
810         start: usize,
811     ) -> dfa::Result<usize> {
812         match self.exec_dfa_reverse_suffix(text, start) {
813             None => self.shortest_dfa(text, start),
814             Some(r) => r.map(|(_, end)| end),
815         }
816     }
817 
818     /// Finds the end of the shortest match using only the DFA by scanning for
819     /// suffix literals. It also reports the start of the match.
820     ///
821     /// Note that if None is returned, then the optimization gave up to avoid
822     /// worst case quadratic behavior. A forward scanning DFA should be tried
823     /// next.
824     ///
825     /// If a match is returned and the full leftmost-first match is desired,
826     /// then a forward scan starting from the beginning of the match must be
827     /// done.
828     ///
829     /// If the result returned indicates that the DFA quit, then another
830     /// matching engine should be used.
831     #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
832     #[cfg_attr(feature = "perf-inline", inline(always))]
exec_dfa_reverse_suffix( &self, text: &[u8], original_start: usize, ) -> Option<dfa::Result<(usize, usize)>>833     fn exec_dfa_reverse_suffix(
834         &self,
835         text: &[u8],
836         original_start: usize,
837     ) -> Option<dfa::Result<(usize, usize)>> {
838         use dfa::Result::*;
839 
840         let lcs = self.ro.suffixes.lcs();
841         debug_assert!(lcs.len() >= 1);
842         let mut start = original_start;
843         let mut end = start;
844         let mut last_literal = start;
845         while end <= text.len() {
846             last_literal += match lcs.find(&text[last_literal..]) {
847                 None => return Some(NoMatch(text.len())),
848                 Some(i) => i,
849             };
850             end = last_literal + lcs.len();
851             match dfa::Fsm::reverse(
852                 &self.ro.dfa_reverse,
853                 self.cache.value(),
854                 false,
855                 &text[start..end],
856                 end - start,
857             ) {
858                 Match(0) | NoMatch(0) => return None,
859                 Match(i) => return Some(Match((start + i, end))),
860                 NoMatch(i) => {
861                     start += i;
862                     last_literal += 1;
863                     continue;
864                 }
865                 Quit => return Some(Quit),
866             };
867         }
868         Some(NoMatch(text.len()))
869     }
870 
871     /// Finds the leftmost-first match (start and end) using only the DFA
872     /// by scanning for suffix literals.
873     ///
874     /// If the result returned indicates that the DFA quit, then another
875     /// matching engine should be used.
876     #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
877     #[cfg_attr(feature = "perf-inline", inline(always))]
find_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>878     fn find_dfa_reverse_suffix(
879         &self,
880         text: &[u8],
881         start: usize,
882     ) -> dfa::Result<(usize, usize)> {
883         use dfa::Result::*;
884 
885         let match_start = match self.exec_dfa_reverse_suffix(text, start) {
886             None => return self.find_dfa_forward(text, start),
887             Some(Match((start, _))) => start,
888             Some(r) => return r,
889         };
890         // At this point, we've found a match. The only way to quit now
891         // without a match is if the DFA gives up (seems unlikely).
892         //
893         // Now run the DFA forwards to find the proper end of the match.
894         // (The suffix literal match can only indicate the earliest
895         // possible end location, which may appear before the end of the
896         // leftmost-first match.)
897         match dfa::Fsm::forward(
898             &self.ro.dfa,
899             self.cache.value(),
900             false,
901             text,
902             match_start,
903         ) {
904             NoMatch(_) => panic!("BUG: reverse match implies forward match"),
905             Quit => Quit,
906             Match(e) => Match((match_start, e)),
907         }
908     }
909 
910     /// Executes the NFA engine to return whether there is a match or not.
911     ///
912     /// Ideally, we could use shortest_nfa(...).is_some() and get the same
913     /// performance characteristics, but regex sets don't have captures, which
914     /// shortest_nfa depends on.
915     #[cfg(feature = "perf-dfa")]
match_nfa(&self, text: &[u8], start: usize) -> bool916     fn match_nfa(&self, text: &[u8], start: usize) -> bool {
917         self.match_nfa_type(MatchNfaType::Auto, text, start)
918     }
919 
920     /// Like match_nfa, but allows specification of the type of NFA engine.
match_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> bool921     fn match_nfa_type(
922         &self,
923         ty: MatchNfaType,
924         text: &[u8],
925         start: usize,
926     ) -> bool {
927         self.exec_nfa(
928             ty,
929             &mut [false],
930             &mut [],
931             true,
932             false,
933             text,
934             start,
935             text.len(),
936         )
937     }
938 
939     /// Finds the shortest match using an NFA.
940     #[cfg(feature = "perf-dfa")]
shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize>941     fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
942         self.shortest_nfa_type(MatchNfaType::Auto, text, start)
943     }
944 
945     /// Like shortest_nfa, but allows specification of the type of NFA engine.
shortest_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<usize>946     fn shortest_nfa_type(
947         &self,
948         ty: MatchNfaType,
949         text: &[u8],
950         start: usize,
951     ) -> Option<usize> {
952         let mut slots = [None, None];
953         if self.exec_nfa(
954             ty,
955             &mut [false],
956             &mut slots,
957             true,
958             true,
959             text,
960             start,
961             text.len(),
962         ) {
963             slots[1]
964         } else {
965             None
966         }
967     }
968 
969     /// Like find, but executes an NFA engine.
find_nfa( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<(usize, usize)>970     fn find_nfa(
971         &self,
972         ty: MatchNfaType,
973         text: &[u8],
974         start: usize,
975     ) -> Option<(usize, usize)> {
976         let mut slots = [None, None];
977         if self.exec_nfa(
978             ty,
979             &mut [false],
980             &mut slots,
981             false,
982             false,
983             text,
984             start,
985             text.len(),
986         ) {
987             match (slots[0], slots[1]) {
988                 (Some(s), Some(e)) => Some((s, e)),
989                 _ => None,
990             }
991         } else {
992             None
993         }
994     }
995 
996     /// Like find_nfa, but fills in captures.
997     ///
998     /// `slots` should have length equal to `2 * nfa.captures.len()`.
999     #[cfg(feature = "perf-dfa")]
captures_nfa( &self, slots: &mut [Slot], text: &[u8], start: usize, ) -> Option<(usize, usize)>1000     fn captures_nfa(
1001         &self,
1002         slots: &mut [Slot],
1003         text: &[u8],
1004         start: usize,
1005     ) -> Option<(usize, usize)> {
1006         self.captures_nfa_type(
1007             MatchNfaType::Auto,
1008             slots,
1009             text,
1010             start,
1011             text.len(),
1012         )
1013     }
1014 
1015     /// Like captures_nfa, but allows specification of type of NFA engine.
captures_nfa_type( &self, ty: MatchNfaType, slots: &mut [Slot], text: &[u8], start: usize, end: usize, ) -> Option<(usize, usize)>1016     fn captures_nfa_type(
1017         &self,
1018         ty: MatchNfaType,
1019         slots: &mut [Slot],
1020         text: &[u8],
1021         start: usize,
1022         end: usize,
1023     ) -> Option<(usize, usize)> {
1024         if self.exec_nfa(
1025             ty,
1026             &mut [false],
1027             slots,
1028             false,
1029             false,
1030             text,
1031             start,
1032             end,
1033         ) {
1034             match (slots[0], slots[1]) {
1035                 (Some(s), Some(e)) => Some((s, e)),
1036                 _ => None,
1037             }
1038         } else {
1039             None
1040         }
1041     }
1042 
exec_nfa( &self, mut ty: MatchNfaType, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, quit_after_match_with_pos: bool, text: &[u8], start: usize, end: usize, ) -> bool1043     fn exec_nfa(
1044         &self,
1045         mut ty: MatchNfaType,
1046         matches: &mut [bool],
1047         slots: &mut [Slot],
1048         quit_after_match: bool,
1049         quit_after_match_with_pos: bool,
1050         text: &[u8],
1051         start: usize,
1052         end: usize,
1053     ) -> bool {
1054         use self::MatchNfaType::*;
1055         if let Auto = ty {
1056             if backtrack::should_exec(self.ro.nfa.len(), text.len()) {
1057                 ty = Backtrack;
1058             } else {
1059                 ty = PikeVM;
1060             }
1061         }
1062         // The backtracker can't return the shortest match position as it is
1063         // implemented today. So if someone calls `shortest_match` and we need
1064         // to run an NFA, then use the PikeVM.
1065         if quit_after_match_with_pos || ty == PikeVM {
1066             self.exec_pikevm(
1067                 matches,
1068                 slots,
1069                 quit_after_match,
1070                 text,
1071                 start,
1072                 end,
1073             )
1074         } else {
1075             self.exec_backtrack(matches, slots, text, start, end)
1076         }
1077     }
1078 
1079     /// Always run the NFA algorithm.
exec_pikevm( &self, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, text: &[u8], start: usize, end: usize, ) -> bool1080     fn exec_pikevm(
1081         &self,
1082         matches: &mut [bool],
1083         slots: &mut [Slot],
1084         quit_after_match: bool,
1085         text: &[u8],
1086         start: usize,
1087         end: usize,
1088     ) -> bool {
1089         if self.ro.nfa.uses_bytes() {
1090             pikevm::Fsm::exec(
1091                 &self.ro.nfa,
1092                 self.cache.value(),
1093                 matches,
1094                 slots,
1095                 quit_after_match,
1096                 ByteInput::new(text, self.ro.nfa.only_utf8),
1097                 start,
1098                 end,
1099             )
1100         } else {
1101             pikevm::Fsm::exec(
1102                 &self.ro.nfa,
1103                 self.cache.value(),
1104                 matches,
1105                 slots,
1106                 quit_after_match,
1107                 CharInput::new(text),
1108                 start,
1109                 end,
1110             )
1111         }
1112     }
1113 
1114     /// Always runs the NFA using bounded backtracking.
exec_backtrack( &self, matches: &mut [bool], slots: &mut [Slot], text: &[u8], start: usize, end: usize, ) -> bool1115     fn exec_backtrack(
1116         &self,
1117         matches: &mut [bool],
1118         slots: &mut [Slot],
1119         text: &[u8],
1120         start: usize,
1121         end: usize,
1122     ) -> bool {
1123         if self.ro.nfa.uses_bytes() {
1124             backtrack::Bounded::exec(
1125                 &self.ro.nfa,
1126                 self.cache.value(),
1127                 matches,
1128                 slots,
1129                 ByteInput::new(text, self.ro.nfa.only_utf8),
1130                 start,
1131                 end,
1132             )
1133         } else {
1134             backtrack::Bounded::exec(
1135                 &self.ro.nfa,
1136                 self.cache.value(),
1137                 matches,
1138                 slots,
1139                 CharInput::new(text),
1140                 start,
1141                 end,
1142             )
1143         }
1144     }
1145 
1146     /// Finds which regular expressions match the given text.
1147     ///
1148     /// `matches` should have length equal to the number of regexes being
1149     /// searched.
1150     ///
1151     /// This is only useful when one wants to know which regexes in a set
1152     /// match some text.
many_matches_at( &self, matches: &mut [bool], text: &[u8], start: usize, ) -> bool1153     pub fn many_matches_at(
1154         &self,
1155         matches: &mut [bool],
1156         text: &[u8],
1157         start: usize,
1158     ) -> bool {
1159         use self::MatchType::*;
1160         if !self.is_anchor_end_match(text) {
1161             return false;
1162         }
1163         match self.ro.match_type {
1164             #[cfg(feature = "perf-literal")]
1165             Literal(ty) => {
1166                 debug_assert_eq!(matches.len(), 1);
1167                 matches[0] = self.find_literals(ty, text, start).is_some();
1168                 matches[0]
1169             }
1170             #[cfg(feature = "perf-dfa")]
1171             Dfa | DfaAnchoredReverse | DfaMany => {
1172                 match dfa::Fsm::forward_many(
1173                     &self.ro.dfa,
1174                     self.cache.value(),
1175                     matches,
1176                     text,
1177                     start,
1178                 ) {
1179                     dfa::Result::Match(_) => true,
1180                     dfa::Result::NoMatch(_) => false,
1181                     dfa::Result::Quit => self.exec_nfa(
1182                         MatchNfaType::Auto,
1183                         matches,
1184                         &mut [],
1185                         false,
1186                         false,
1187                         text,
1188                         start,
1189                         text.len(),
1190                     ),
1191                 }
1192             }
1193             #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1194             DfaSuffix => {
1195                 match dfa::Fsm::forward_many(
1196                     &self.ro.dfa,
1197                     self.cache.value(),
1198                     matches,
1199                     text,
1200                     start,
1201                 ) {
1202                     dfa::Result::Match(_) => true,
1203                     dfa::Result::NoMatch(_) => false,
1204                     dfa::Result::Quit => self.exec_nfa(
1205                         MatchNfaType::Auto,
1206                         matches,
1207                         &mut [],
1208                         false,
1209                         false,
1210                         text,
1211                         start,
1212                         text.len(),
1213                     ),
1214                 }
1215             }
1216             Nfa(ty) => self.exec_nfa(
1217                 ty,
1218                 matches,
1219                 &mut [],
1220                 false,
1221                 false,
1222                 text,
1223                 start,
1224                 text.len(),
1225             ),
1226             Nothing => false,
1227         }
1228     }
1229 
1230     #[cfg_attr(feature = "perf-inline", inline(always))]
is_anchor_end_match(&self, text: &[u8]) -> bool1231     fn is_anchor_end_match(&self, text: &[u8]) -> bool {
1232         #[cfg(not(feature = "perf-literal"))]
1233         fn imp(_: &ExecReadOnly, _: &[u8]) -> bool {
1234             true
1235         }
1236 
1237         #[cfg(feature = "perf-literal")]
1238         fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool {
1239             // Only do this check if the haystack is big (>1MB).
1240             if text.len() > (1 << 20) && ro.nfa.is_anchored_end {
1241                 let lcs = ro.suffixes.lcs();
1242                 if lcs.len() >= 1 && !lcs.is_suffix(text) {
1243                     return false;
1244                 }
1245             }
1246             true
1247         }
1248 
1249         imp(&self.ro, text)
1250     }
1251 
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1252     pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1253         &self.ro.nfa.capture_name_idx
1254     }
1255 }
1256 
1257 impl<'c> ExecNoSyncStr<'c> {
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1258     pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1259         self.0.capture_name_idx()
1260     }
1261 }
1262 
1263 impl Exec {
1264     /// Get a searcher that isn't Sync.
1265     #[cfg_attr(feature = "perf-inline", inline(always))]
searcher(&self) -> ExecNoSync1266     pub fn searcher(&self) -> ExecNoSync {
1267         ExecNoSync {
1268             ro: &self.ro, // a clone is too expensive here! (and not needed)
1269             cache: self.pool.get(),
1270         }
1271     }
1272 
1273     /// Get a searcher that isn't Sync and can match on &str.
1274     #[cfg_attr(feature = "perf-inline", inline(always))]
searcher_str(&self) -> ExecNoSyncStr1275     pub fn searcher_str(&self) -> ExecNoSyncStr {
1276         ExecNoSyncStr(self.searcher())
1277     }
1278 
1279     /// Build a Regex from this executor.
into_regex(self) -> re_unicode::Regex1280     pub fn into_regex(self) -> re_unicode::Regex {
1281         re_unicode::Regex::from(self)
1282     }
1283 
1284     /// Build a RegexSet from this executor.
into_regex_set(self) -> re_set::unicode::RegexSet1285     pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
1286         re_set::unicode::RegexSet::from(self)
1287     }
1288 
1289     /// Build a Regex from this executor that can match arbitrary bytes.
into_byte_regex(self) -> re_bytes::Regex1290     pub fn into_byte_regex(self) -> re_bytes::Regex {
1291         re_bytes::Regex::from(self)
1292     }
1293 
1294     /// Build a RegexSet from this executor that can match arbitrary bytes.
into_byte_regex_set(self) -> re_set::bytes::RegexSet1295     pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
1296         re_set::bytes::RegexSet::from(self)
1297     }
1298 
1299     /// The original regular expressions given by the caller that were
1300     /// compiled.
regex_strings(&self) -> &[String]1301     pub fn regex_strings(&self) -> &[String] {
1302         &self.ro.res
1303     }
1304 
1305     /// Return a slice of capture names.
1306     ///
1307     /// Any capture that isn't named is None.
capture_names(&self) -> &[Option<String>]1308     pub fn capture_names(&self) -> &[Option<String>] {
1309         &self.ro.nfa.captures
1310     }
1311 
1312     /// Return a reference to named groups mapping (from group name to
1313     /// group position).
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1314     pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1315         &self.ro.nfa.capture_name_idx
1316     }
1317 }
1318 
1319 impl Clone for Exec {
clone(&self) -> Exec1320     fn clone(&self) -> Exec {
1321         let pool = ExecReadOnly::new_pool(&self.ro);
1322         Exec { ro: self.ro.clone(), pool }
1323     }
1324 }
1325 
1326 impl ExecReadOnly {
choose_match_type(&self, hint: Option<MatchType>) -> MatchType1327     fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
1328         if let Some(MatchType::Nfa(_)) = hint {
1329             return hint.unwrap();
1330         }
1331         // If the NFA is empty, then we'll never match anything.
1332         if self.nfa.insts.is_empty() {
1333             return MatchType::Nothing;
1334         }
1335         if let Some(literalty) = self.choose_literal_match_type() {
1336             return literalty;
1337         }
1338         if let Some(dfaty) = self.choose_dfa_match_type() {
1339             return dfaty;
1340         }
1341         // We're so totally hosed.
1342         MatchType::Nfa(MatchNfaType::Auto)
1343     }
1344 
1345     /// If a plain literal scan can be used, then a corresponding literal
1346     /// search type is returned.
choose_literal_match_type(&self) -> Option<MatchType>1347     fn choose_literal_match_type(&self) -> Option<MatchType> {
1348         #[cfg(not(feature = "perf-literal"))]
1349         fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1350             None
1351         }
1352 
1353         #[cfg(feature = "perf-literal")]
1354         fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1355             // If our set of prefixes is complete, then we can use it to find
1356             // a match in lieu of a regex engine. This doesn't quite work well
1357             // in the presence of multiple regexes, so only do it when there's
1358             // one.
1359             //
1360             // TODO(burntsushi): Also, don't try to match literals if the regex
1361             // is partially anchored. We could technically do it, but we'd need
1362             // to create two sets of literals: all of them and then the subset
1363             // that aren't anchored. We would then only search for all of them
1364             // when at the beginning of the input and use the subset in all
1365             // other cases.
1366             if ro.res.len() != 1 {
1367                 return None;
1368             }
1369             if ro.ac.is_some() {
1370                 return Some(MatchType::Literal(
1371                     MatchLiteralType::AhoCorasick,
1372                 ));
1373             }
1374             if ro.nfa.prefixes.complete() {
1375                 return if ro.nfa.is_anchored_start {
1376                     Some(MatchType::Literal(MatchLiteralType::AnchoredStart))
1377                 } else {
1378                     Some(MatchType::Literal(MatchLiteralType::Unanchored))
1379                 };
1380             }
1381             if ro.suffixes.complete() {
1382                 return if ro.nfa.is_anchored_end {
1383                     Some(MatchType::Literal(MatchLiteralType::AnchoredEnd))
1384                 } else {
1385                     // This case shouldn't happen. When the regex isn't
1386                     // anchored, then complete prefixes should imply complete
1387                     // suffixes.
1388                     Some(MatchType::Literal(MatchLiteralType::Unanchored))
1389                 };
1390             }
1391             None
1392         }
1393 
1394         imp(self)
1395     }
1396 
1397     /// If a DFA scan can be used, then choose the appropriate DFA strategy.
choose_dfa_match_type(&self) -> Option<MatchType>1398     fn choose_dfa_match_type(&self) -> Option<MatchType> {
1399         #[cfg(not(feature = "perf-dfa"))]
1400         fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1401             None
1402         }
1403 
1404         #[cfg(feature = "perf-dfa")]
1405         fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1406             if !dfa::can_exec(&ro.dfa) {
1407                 return None;
1408             }
1409             // Regex sets require a slightly specialized path.
1410             if ro.res.len() >= 2 {
1411                 return Some(MatchType::DfaMany);
1412             }
1413             // If the regex is anchored at the end but not the start, then
1414             // just match in reverse from the end of the haystack.
1415             if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end {
1416                 return Some(MatchType::DfaAnchoredReverse);
1417             }
1418             #[cfg(feature = "perf-literal")]
1419             {
1420                 // If there's a longish suffix literal, then it might be faster
1421                 // to look for that first.
1422                 if ro.should_suffix_scan() {
1423                     return Some(MatchType::DfaSuffix);
1424                 }
1425             }
1426             // Fall back to your garden variety forward searching lazy DFA.
1427             Some(MatchType::Dfa)
1428         }
1429 
1430         imp(self)
1431     }
1432 
1433     /// Returns true if the program is amenable to suffix scanning.
1434     ///
1435     /// When this is true, as a heuristic, we assume it is OK to quickly scan
1436     /// for suffix literals and then do a *reverse* DFA match from any matches
1437     /// produced by the literal scan. (And then followed by a forward DFA
1438     /// search, since the previously found suffix literal maybe not actually be
1439     /// the end of a match.)
1440     ///
1441     /// This is a bit of a specialized optimization, but can result in pretty
1442     /// big performance wins if 1) there are no prefix literals and 2) the
1443     /// suffix literals are pretty rare in the text. (1) is obviously easy to
1444     /// account for but (2) is harder. As a proxy, we assume that longer
1445     /// strings are generally rarer, so we only enable this optimization when
1446     /// we have a meaty suffix.
1447     #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
should_suffix_scan(&self) -> bool1448     fn should_suffix_scan(&self) -> bool {
1449         if self.suffixes.is_empty() {
1450             return false;
1451         }
1452         let lcs_len = self.suffixes.lcs().char_len();
1453         lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
1454     }
1455 
new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>>1456     fn new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>> {
1457         let ro = ro.clone();
1458         Box::new(Pool::new(Box::new(move || {
1459             AssertUnwindSafe(RefCell::new(ProgramCacheInner::new(&ro)))
1460         })))
1461     }
1462 }
1463 
1464 #[derive(Clone, Copy, Debug)]
1465 enum MatchType {
1466     /// A single or multiple literal search. This is only used when the regex
1467     /// can be decomposed into a literal search.
1468     #[cfg(feature = "perf-literal")]
1469     Literal(MatchLiteralType),
1470     /// A normal DFA search.
1471     #[cfg(feature = "perf-dfa")]
1472     Dfa,
1473     /// A reverse DFA search starting from the end of a haystack.
1474     #[cfg(feature = "perf-dfa")]
1475     DfaAnchoredReverse,
1476     /// A reverse DFA search with suffix literal scanning.
1477     #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1478     DfaSuffix,
1479     /// Use the DFA on two or more regular expressions.
1480     #[cfg(feature = "perf-dfa")]
1481     DfaMany,
1482     /// An NFA variant.
1483     Nfa(MatchNfaType),
1484     /// No match is ever possible, so don't ever try to search.
1485     Nothing,
1486 }
1487 
1488 #[derive(Clone, Copy, Debug)]
1489 #[cfg(feature = "perf-literal")]
1490 enum MatchLiteralType {
1491     /// Match literals anywhere in text.
1492     Unanchored,
1493     /// Match literals only at the start of text.
1494     AnchoredStart,
1495     /// Match literals only at the end of text.
1496     AnchoredEnd,
1497     /// Use an Aho-Corasick automaton. This requires `ac` to be Some on
1498     /// ExecReadOnly.
1499     AhoCorasick,
1500 }
1501 
1502 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
1503 enum MatchNfaType {
1504     /// Choose between Backtrack and PikeVM.
1505     Auto,
1506     /// NFA bounded backtracking.
1507     ///
1508     /// (This is only set by tests, since it never makes sense to always want
1509     /// backtracking.)
1510     Backtrack,
1511     /// The Pike VM.
1512     ///
1513     /// (This is only set by tests, since it never makes sense to always want
1514     /// the Pike VM.)
1515     PikeVM,
1516 }
1517 
1518 /// `ProgramCache` maintains reusable allocations for each matching engine
1519 /// available to a particular program.
1520 ///
1521 /// We declare this as unwind safe since it's a cache that's only used for
1522 /// performance purposes. If a panic occurs, it is (or should be) always safe
1523 /// to continue using the same regex object.
1524 pub type ProgramCache = AssertUnwindSafe<RefCell<ProgramCacheInner>>;
1525 
1526 #[derive(Debug)]
1527 pub struct ProgramCacheInner {
1528     pub pikevm: pikevm::Cache,
1529     pub backtrack: backtrack::Cache,
1530     #[cfg(feature = "perf-dfa")]
1531     pub dfa: dfa::Cache,
1532     #[cfg(feature = "perf-dfa")]
1533     pub dfa_reverse: dfa::Cache,
1534 }
1535 
1536 impl ProgramCacheInner {
new(ro: &ExecReadOnly) -> Self1537     fn new(ro: &ExecReadOnly) -> Self {
1538         ProgramCacheInner {
1539             pikevm: pikevm::Cache::new(&ro.nfa),
1540             backtrack: backtrack::Cache::new(&ro.nfa),
1541             #[cfg(feature = "perf-dfa")]
1542             dfa: dfa::Cache::new(&ro.dfa),
1543             #[cfg(feature = "perf-dfa")]
1544             dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
1545         }
1546     }
1547 }
1548 
1549 /// Alternation literals checks if the given HIR is a simple alternation of
1550 /// literals, and if so, returns them. Otherwise, this returns None.
1551 #[cfg(feature = "perf-literal")]
alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>>1552 fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
1553     use syntax::hir::{HirKind, Literal};
1554 
1555     // This is pretty hacky, but basically, if `is_alternation_literal` is
1556     // true, then we can make several assumptions about the structure of our
1557     // HIR. This is what justifies the `unreachable!` statements below.
1558     //
1559     // This code should be refactored once we overhaul this crate's
1560     // optimization pipeline, because this is a terribly inflexible way to go
1561     // about things.
1562 
1563     if !expr.is_alternation_literal() {
1564         return None;
1565     }
1566     let alts = match *expr.kind() {
1567         HirKind::Alternation(ref alts) => alts,
1568         _ => return None, // one literal isn't worth it
1569     };
1570 
1571     let extendlit = |lit: &Literal, dst: &mut Vec<u8>| match *lit {
1572         Literal::Unicode(c) => {
1573             let mut buf = [0; 4];
1574             dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
1575         }
1576         Literal::Byte(b) => {
1577             dst.push(b);
1578         }
1579     };
1580 
1581     let mut lits = vec![];
1582     for alt in alts {
1583         let mut lit = vec![];
1584         match *alt.kind() {
1585             HirKind::Literal(ref x) => extendlit(x, &mut lit),
1586             HirKind::Concat(ref exprs) => {
1587                 for e in exprs {
1588                     match *e.kind() {
1589                         HirKind::Literal(ref x) => extendlit(x, &mut lit),
1590                         _ => unreachable!("expected literal, got {:?}", e),
1591                     }
1592                 }
1593             }
1594             _ => unreachable!("expected literal or concat, got {:?}", alt),
1595         }
1596         lits.push(lit);
1597     }
1598     Some(lits)
1599 }
1600 
1601 #[cfg(test)]
1602 mod test {
1603     #[test]
uppercut_s_backtracking_bytes_default_bytes_mismatch()1604     fn uppercut_s_backtracking_bytes_default_bytes_mismatch() {
1605         use internal::ExecBuilder;
1606 
1607         let backtrack_bytes_re = ExecBuilder::new("^S")
1608             .bounded_backtracking()
1609             .only_utf8(false)
1610             .build()
1611             .map(|exec| exec.into_byte_regex())
1612             .map_err(|err| format!("{}", err))
1613             .unwrap();
1614 
1615         let default_bytes_re = ExecBuilder::new("^S")
1616             .only_utf8(false)
1617             .build()
1618             .map(|exec| exec.into_byte_regex())
1619             .map_err(|err| format!("{}", err))
1620             .unwrap();
1621 
1622         let input = vec![83, 83];
1623 
1624         let s1 = backtrack_bytes_re.split(&input);
1625         let s2 = default_bytes_re.split(&input);
1626         for (chunk1, chunk2) in s1.zip(s2) {
1627             assert_eq!(chunk1, chunk2);
1628         }
1629     }
1630 
1631     #[test]
unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch()1632     fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() {
1633         use internal::ExecBuilder;
1634 
1635         let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1636             .bounded_backtracking()
1637             .bytes(true)
1638             .build()
1639             .map(|exec| exec.into_regex())
1640             .map_err(|err| format!("{}", err))
1641             .unwrap();
1642 
1643         let default_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1644             .bytes(true)
1645             .build()
1646             .map(|exec| exec.into_regex())
1647             .map_err(|err| format!("{}", err))
1648             .unwrap();
1649 
1650         let input = "**";
1651 
1652         let s1 = backtrack_bytes_re.split(input);
1653         let s2 = default_bytes_re.split(input);
1654         for (chunk1, chunk2) in s1.zip(s2) {
1655             assert_eq!(chunk1, chunk2);
1656         }
1657     }
1658 }
1659