1 use std::cell::RefCell;
2 use std::collections::HashMap;
3 use std::panic::AssertUnwindSafe;
4 use std::sync::Arc;
5
6 #[cfg(feature = "perf-literal")]
7 use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
8 use syntax::hir::literal::Literals;
9 use syntax::hir::Hir;
10 use syntax::ParserBuilder;
11
12 use backtrack;
13 use compile::Compiler;
14 #[cfg(feature = "perf-dfa")]
15 use dfa;
16 use error::Error;
17 use input::{ByteInput, CharInput};
18 use literal::LiteralSearcher;
19 use pikevm;
20 use pool::{Pool, PoolGuard};
21 use prog::Program;
22 use re_builder::RegexOptions;
23 use re_bytes;
24 use re_set;
25 use re_trait::{Locations, RegularExpression, Slot};
26 use re_unicode;
27 use utf8::next_utf8;
28
29 /// `Exec` manages the execution of a regular expression.
30 ///
31 /// In particular, this manages the various compiled forms of a single regular
32 /// expression and the choice of which matching engine to use to execute a
33 /// regular expression.
34 #[derive(Debug)]
35 pub struct Exec {
36 /// All read only state.
37 ro: Arc<ExecReadOnly>,
38 /// A pool of reusable values for the various matching engines.
39 ///
40 /// Note that boxing this value is not strictly necessary, but it is an
41 /// easy way to ensure that T does not bloat the stack sized used by a pool
42 /// in the case where T is big. And this turns out to be the case at the
43 /// time of writing for regex's use of this pool. At the time of writing,
44 /// the size of a Regex on the stack is 856 bytes. Boxing this value
45 /// reduces that size to 16 bytes.
46 pool: Box<Pool<ProgramCache>>,
47 }
48
49 /// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
50 /// means it is no longer Sync, but we can now avoid the overhead of
51 /// synchronization to fetch the cache.
52 #[derive(Debug)]
53 pub struct ExecNoSync<'c> {
54 /// All read only state.
55 ro: &'c Arc<ExecReadOnly>,
56 /// Caches for the various matching engines.
57 cache: PoolGuard<'c, ProgramCache>,
58 }
59
60 /// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
61 #[derive(Debug)]
62 pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
63
64 /// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
65 /// state is determined at compile time and never changes during search.
66 #[derive(Debug)]
67 struct ExecReadOnly {
68 /// The original regular expressions given by the caller to compile.
69 res: Vec<String>,
70 /// A compiled program that is used in the NFA simulation and backtracking.
71 /// It can be byte-based or Unicode codepoint based.
72 ///
73 /// N.B. It is not possibly to make this byte-based from the public API.
74 /// It is only used for testing byte based programs in the NFA simulations.
75 nfa: Program,
76 /// A compiled byte based program for DFA execution. This is only used
77 /// if a DFA can be executed. (Currently, only word boundary assertions are
78 /// not supported.) Note that this program contains an embedded `.*?`
79 /// preceding the first capture group, unless the regex is anchored at the
80 /// beginning.
81 dfa: Program,
82 /// The same as above, except the program is reversed (and there is no
83 /// preceding `.*?`). This is used by the DFA to find the starting location
84 /// of matches.
85 dfa_reverse: Program,
86 /// A set of suffix literals extracted from the regex.
87 ///
88 /// Prefix literals are stored on the `Program`, since they are used inside
89 /// the matching engines.
90 suffixes: LiteralSearcher,
91 /// An Aho-Corasick automaton with leftmost-first match semantics.
92 ///
93 /// This is only set when the entire regex is a simple unanchored
94 /// alternation of literals. We could probably use it more circumstances,
95 /// but this is already hacky enough in this architecture.
96 ///
97 /// N.B. We use u32 as a state ID representation under the assumption that
98 /// if we were to exhaust the ID space, we probably would have long
99 /// surpassed the compilation size limit.
100 #[cfg(feature = "perf-literal")]
101 ac: Option<AhoCorasick<u32>>,
102 /// match_type encodes as much upfront knowledge about how we're going to
103 /// execute a search as possible.
104 match_type: MatchType,
105 }
106
107 /// Facilitates the construction of an executor by exposing various knobs
108 /// to control how a regex is executed and what kinds of resources it's
109 /// permitted to use.
110 // `ExecBuilder` is only public via the `internal` module, so avoid deriving
111 // `Debug`.
112 #[allow(missing_debug_implementations)]
113 pub struct ExecBuilder {
114 options: RegexOptions,
115 match_type: Option<MatchType>,
116 bytes: bool,
117 only_utf8: bool,
118 }
119
120 /// Parsed represents a set of parsed regular expressions and their detected
121 /// literals.
122 struct Parsed {
123 exprs: Vec<Hir>,
124 prefixes: Literals,
125 suffixes: Literals,
126 bytes: bool,
127 }
128
129 impl ExecBuilder {
130 /// Create a regex execution builder.
131 ///
132 /// This uses default settings for everything except the regex itself,
133 /// which must be provided. Further knobs can be set by calling methods,
134 /// and then finally, `build` to actually create the executor.
new(re: &str) -> Self135 pub fn new(re: &str) -> Self {
136 Self::new_many(&[re])
137 }
138
139 /// Like new, but compiles the union of the given regular expressions.
140 ///
141 /// Note that when compiling 2 or more regular expressions, capture groups
142 /// are completely unsupported. (This means both `find` and `captures`
143 /// wont work.)
new_many<I, S>(res: I) -> Self where S: AsRef<str>, I: IntoIterator<Item = S>,144 pub fn new_many<I, S>(res: I) -> Self
145 where
146 S: AsRef<str>,
147 I: IntoIterator<Item = S>,
148 {
149 let mut opts = RegexOptions::default();
150 opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect();
151 Self::new_options(opts)
152 }
153
154 /// Create a regex execution builder.
new_options(opts: RegexOptions) -> Self155 pub fn new_options(opts: RegexOptions) -> Self {
156 ExecBuilder {
157 options: opts,
158 match_type: None,
159 bytes: false,
160 only_utf8: true,
161 }
162 }
163
164 /// Set the matching engine to be automatically determined.
165 ///
166 /// This is the default state and will apply whatever optimizations are
167 /// possible, such as running a DFA.
168 ///
169 /// This overrides whatever was previously set via the `nfa` or
170 /// `bounded_backtracking` methods.
automatic(mut self) -> Self171 pub fn automatic(mut self) -> Self {
172 self.match_type = None;
173 self
174 }
175
176 /// Sets the matching engine to use the NFA algorithm no matter what
177 /// optimizations are possible.
178 ///
179 /// This overrides whatever was previously set via the `automatic` or
180 /// `bounded_backtracking` methods.
nfa(mut self) -> Self181 pub fn nfa(mut self) -> Self {
182 self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
183 self
184 }
185
186 /// Sets the matching engine to use a bounded backtracking engine no
187 /// matter what optimizations are possible.
188 ///
189 /// One must use this with care, since the bounded backtracking engine
190 /// uses memory proportion to `len(regex) * len(text)`.
191 ///
192 /// This overrides whatever was previously set via the `automatic` or
193 /// `nfa` methods.
bounded_backtracking(mut self) -> Self194 pub fn bounded_backtracking(mut self) -> Self {
195 self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
196 self
197 }
198
199 /// Compiles byte based programs for use with the NFA matching engines.
200 ///
201 /// By default, the NFA engines match on Unicode scalar values. They can
202 /// be made to use byte based programs instead. In general, the byte based
203 /// programs are slower because of a less efficient encoding of character
204 /// classes.
205 ///
206 /// Note that this does not impact DFA matching engines, which always
207 /// execute on bytes.
bytes(mut self, yes: bool) -> Self208 pub fn bytes(mut self, yes: bool) -> Self {
209 self.bytes = yes;
210 self
211 }
212
213 /// When disabled, the program compiled may match arbitrary bytes.
214 ///
215 /// When enabled (the default), all compiled programs exclusively match
216 /// valid UTF-8 bytes.
only_utf8(mut self, yes: bool) -> Self217 pub fn only_utf8(mut self, yes: bool) -> Self {
218 self.only_utf8 = yes;
219 self
220 }
221
222 /// Set the Unicode flag.
unicode(mut self, yes: bool) -> Self223 pub fn unicode(mut self, yes: bool) -> Self {
224 self.options.unicode = yes;
225 self
226 }
227
228 /// Parse the current set of patterns into their AST and extract literals.
parse(&self) -> Result<Parsed, Error>229 fn parse(&self) -> Result<Parsed, Error> {
230 let mut exprs = Vec::with_capacity(self.options.pats.len());
231 let mut prefixes = Some(Literals::empty());
232 let mut suffixes = Some(Literals::empty());
233 let mut bytes = false;
234 let is_set = self.options.pats.len() > 1;
235 // If we're compiling a regex set and that set has any anchored
236 // expressions, then disable all literal optimizations.
237 for pat in &self.options.pats {
238 let mut parser = ParserBuilder::new()
239 .octal(self.options.octal)
240 .case_insensitive(self.options.case_insensitive)
241 .multi_line(self.options.multi_line)
242 .dot_matches_new_line(self.options.dot_matches_new_line)
243 .swap_greed(self.options.swap_greed)
244 .ignore_whitespace(self.options.ignore_whitespace)
245 .unicode(self.options.unicode)
246 .allow_invalid_utf8(!self.only_utf8)
247 .nest_limit(self.options.nest_limit)
248 .build();
249 let expr =
250 parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?;
251 bytes = bytes || !expr.is_always_utf8();
252
253 if cfg!(feature = "perf-literal") {
254 if !expr.is_anchored_start() && expr.is_any_anchored_start() {
255 // Partial anchors unfortunately make it hard to use
256 // prefixes, so disable them.
257 prefixes = None;
258 } else if is_set && expr.is_anchored_start() {
259 // Regex sets with anchors do not go well with literal
260 // optimizations.
261 prefixes = None;
262 }
263 prefixes = prefixes.and_then(|mut prefixes| {
264 if !prefixes.union_prefixes(&expr) {
265 None
266 } else {
267 Some(prefixes)
268 }
269 });
270
271 if !expr.is_anchored_end() && expr.is_any_anchored_end() {
272 // Partial anchors unfortunately make it hard to use
273 // suffixes, so disable them.
274 suffixes = None;
275 } else if is_set && expr.is_anchored_end() {
276 // Regex sets with anchors do not go well with literal
277 // optimizations.
278 suffixes = None;
279 }
280 suffixes = suffixes.and_then(|mut suffixes| {
281 if !suffixes.union_suffixes(&expr) {
282 None
283 } else {
284 Some(suffixes)
285 }
286 });
287 }
288 exprs.push(expr);
289 }
290 Ok(Parsed {
291 exprs: exprs,
292 prefixes: prefixes.unwrap_or_else(Literals::empty),
293 suffixes: suffixes.unwrap_or_else(Literals::empty),
294 bytes: bytes,
295 })
296 }
297
298 /// Build an executor that can run a regular expression.
build(self) -> Result<Exec, Error>299 pub fn build(self) -> Result<Exec, Error> {
300 // Special case when we have no patterns to compile.
301 // This can happen when compiling a regex set.
302 if self.options.pats.is_empty() {
303 let ro = Arc::new(ExecReadOnly {
304 res: vec![],
305 nfa: Program::new(),
306 dfa: Program::new(),
307 dfa_reverse: Program::new(),
308 suffixes: LiteralSearcher::empty(),
309 #[cfg(feature = "perf-literal")]
310 ac: None,
311 match_type: MatchType::Nothing,
312 });
313 let pool = ExecReadOnly::new_pool(&ro);
314 return Ok(Exec { ro: ro, pool });
315 }
316 let parsed = self.parse()?;
317 let mut nfa = Compiler::new()
318 .size_limit(self.options.size_limit)
319 .bytes(self.bytes || parsed.bytes)
320 .only_utf8(self.only_utf8)
321 .compile(&parsed.exprs)?;
322 let mut dfa = Compiler::new()
323 .size_limit(self.options.size_limit)
324 .dfa(true)
325 .only_utf8(self.only_utf8)
326 .compile(&parsed.exprs)?;
327 let mut dfa_reverse = Compiler::new()
328 .size_limit(self.options.size_limit)
329 .dfa(true)
330 .only_utf8(self.only_utf8)
331 .reverse(true)
332 .compile(&parsed.exprs)?;
333
334 #[cfg(feature = "perf-literal")]
335 let ac = self.build_aho_corasick(&parsed);
336 nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes);
337 dfa.prefixes = nfa.prefixes.clone();
338 dfa.dfa_size_limit = self.options.dfa_size_limit;
339 dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
340
341 let mut ro = ExecReadOnly {
342 res: self.options.pats,
343 nfa: nfa,
344 dfa: dfa,
345 dfa_reverse: dfa_reverse,
346 suffixes: LiteralSearcher::suffixes(parsed.suffixes),
347 #[cfg(feature = "perf-literal")]
348 ac: ac,
349 match_type: MatchType::Nothing,
350 };
351 ro.match_type = ro.choose_match_type(self.match_type);
352
353 let ro = Arc::new(ro);
354 let pool = ExecReadOnly::new_pool(&ro);
355 Ok(Exec { ro, pool })
356 }
357
358 #[cfg(feature = "perf-literal")]
build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>>359 fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>> {
360 if parsed.exprs.len() != 1 {
361 return None;
362 }
363 let lits = match alternation_literals(&parsed.exprs[0]) {
364 None => return None,
365 Some(lits) => lits,
366 };
367 // If we have a small number of literals, then let Teddy handle
368 // things (see literal/mod.rs).
369 if lits.len() <= 32 {
370 return None;
371 }
372 Some(
373 AhoCorasickBuilder::new()
374 .match_kind(MatchKind::LeftmostFirst)
375 .auto_configure(&lits)
376 // We always want this to reduce size, regardless
377 // of what auto-configure does.
378 .byte_classes(true)
379 .build_with_size::<u32, _, _>(&lits)
380 // This should never happen because we'd long exceed the
381 // compilation limit for regexes first.
382 .expect("AC automaton too big"),
383 )
384 }
385 }
386
387 impl<'c> RegularExpression for ExecNoSyncStr<'c> {
388 type Text = str;
389
slots_len(&self) -> usize390 fn slots_len(&self) -> usize {
391 self.0.slots_len()
392 }
393
next_after_empty(&self, text: &str, i: usize) -> usize394 fn next_after_empty(&self, text: &str, i: usize) -> usize {
395 next_utf8(text.as_bytes(), i)
396 }
397
398 #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_match_at(&self, text: &str, start: usize) -> Option<usize>399 fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
400 self.0.shortest_match_at(text.as_bytes(), start)
401 }
402
403 #[cfg_attr(feature = "perf-inline", inline(always))]
is_match_at(&self, text: &str, start: usize) -> bool404 fn is_match_at(&self, text: &str, start: usize) -> bool {
405 self.0.is_match_at(text.as_bytes(), start)
406 }
407
408 #[cfg_attr(feature = "perf-inline", inline(always))]
find_at(&self, text: &str, start: usize) -> Option<(usize, usize)>409 fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
410 self.0.find_at(text.as_bytes(), start)
411 }
412
413 #[cfg_attr(feature = "perf-inline", inline(always))]
captures_read_at( &self, locs: &mut Locations, text: &str, start: usize, ) -> Option<(usize, usize)>414 fn captures_read_at(
415 &self,
416 locs: &mut Locations,
417 text: &str,
418 start: usize,
419 ) -> Option<(usize, usize)> {
420 self.0.captures_read_at(locs, text.as_bytes(), start)
421 }
422 }
423
424 impl<'c> RegularExpression for ExecNoSync<'c> {
425 type Text = [u8];
426
427 /// Returns the number of capture slots in the regular expression. (There
428 /// are two slots for every capture group, corresponding to possibly empty
429 /// start and end locations of the capture.)
slots_len(&self) -> usize430 fn slots_len(&self) -> usize {
431 self.ro.nfa.captures.len() * 2
432 }
433
next_after_empty(&self, _text: &[u8], i: usize) -> usize434 fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
435 i + 1
436 }
437
438 /// Returns the end of a match location, possibly occurring before the
439 /// end location of the correct leftmost-first match.
440 #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize>441 fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
442 if !self.is_anchor_end_match(text) {
443 return None;
444 }
445 match self.ro.match_type {
446 #[cfg(feature = "perf-literal")]
447 MatchType::Literal(ty) => {
448 self.find_literals(ty, text, start).map(|(_, e)| e)
449 }
450 #[cfg(feature = "perf-dfa")]
451 MatchType::Dfa | MatchType::DfaMany => {
452 match self.shortest_dfa(text, start) {
453 dfa::Result::Match(end) => Some(end),
454 dfa::Result::NoMatch(_) => None,
455 dfa::Result::Quit => self.shortest_nfa(text, start),
456 }
457 }
458 #[cfg(feature = "perf-dfa")]
459 MatchType::DfaAnchoredReverse => {
460 match dfa::Fsm::reverse(
461 &self.ro.dfa_reverse,
462 self.cache.value(),
463 true,
464 &text[start..],
465 text.len(),
466 ) {
467 dfa::Result::Match(_) => Some(text.len()),
468 dfa::Result::NoMatch(_) => None,
469 dfa::Result::Quit => self.shortest_nfa(text, start),
470 }
471 }
472 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
473 MatchType::DfaSuffix => {
474 match self.shortest_dfa_reverse_suffix(text, start) {
475 dfa::Result::Match(e) => Some(e),
476 dfa::Result::NoMatch(_) => None,
477 dfa::Result::Quit => self.shortest_nfa(text, start),
478 }
479 }
480 MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
481 MatchType::Nothing => None,
482 }
483 }
484
485 /// Returns true if and only if the regex matches text.
486 ///
487 /// For single regular expressions, this is equivalent to calling
488 /// shortest_match(...).is_some().
489 #[cfg_attr(feature = "perf-inline", inline(always))]
is_match_at(&self, text: &[u8], start: usize) -> bool490 fn is_match_at(&self, text: &[u8], start: usize) -> bool {
491 if !self.is_anchor_end_match(text) {
492 return false;
493 }
494 // We need to do this dance because shortest_match relies on the NFA
495 // filling in captures[1], but a RegexSet has no captures. In other
496 // words, a RegexSet can't (currently) use shortest_match. ---AG
497 match self.ro.match_type {
498 #[cfg(feature = "perf-literal")]
499 MatchType::Literal(ty) => {
500 self.find_literals(ty, text, start).is_some()
501 }
502 #[cfg(feature = "perf-dfa")]
503 MatchType::Dfa | MatchType::DfaMany => {
504 match self.shortest_dfa(text, start) {
505 dfa::Result::Match(_) => true,
506 dfa::Result::NoMatch(_) => false,
507 dfa::Result::Quit => self.match_nfa(text, start),
508 }
509 }
510 #[cfg(feature = "perf-dfa")]
511 MatchType::DfaAnchoredReverse => {
512 match dfa::Fsm::reverse(
513 &self.ro.dfa_reverse,
514 self.cache.value(),
515 true,
516 &text[start..],
517 text.len(),
518 ) {
519 dfa::Result::Match(_) => true,
520 dfa::Result::NoMatch(_) => false,
521 dfa::Result::Quit => self.match_nfa(text, start),
522 }
523 }
524 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
525 MatchType::DfaSuffix => {
526 match self.shortest_dfa_reverse_suffix(text, start) {
527 dfa::Result::Match(_) => true,
528 dfa::Result::NoMatch(_) => false,
529 dfa::Result::Quit => self.match_nfa(text, start),
530 }
531 }
532 MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
533 MatchType::Nothing => false,
534 }
535 }
536
537 /// Finds the start and end location of the leftmost-first match, starting
538 /// at the given location.
539 #[cfg_attr(feature = "perf-inline", inline(always))]
find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)>540 fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
541 if !self.is_anchor_end_match(text) {
542 return None;
543 }
544 match self.ro.match_type {
545 #[cfg(feature = "perf-literal")]
546 MatchType::Literal(ty) => self.find_literals(ty, text, start),
547 #[cfg(feature = "perf-dfa")]
548 MatchType::Dfa => match self.find_dfa_forward(text, start) {
549 dfa::Result::Match((s, e)) => Some((s, e)),
550 dfa::Result::NoMatch(_) => None,
551 dfa::Result::Quit => {
552 self.find_nfa(MatchNfaType::Auto, text, start)
553 }
554 },
555 #[cfg(feature = "perf-dfa")]
556 MatchType::DfaAnchoredReverse => {
557 match self.find_dfa_anchored_reverse(text, start) {
558 dfa::Result::Match((s, e)) => Some((s, e)),
559 dfa::Result::NoMatch(_) => None,
560 dfa::Result::Quit => {
561 self.find_nfa(MatchNfaType::Auto, text, start)
562 }
563 }
564 }
565 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
566 MatchType::DfaSuffix => {
567 match self.find_dfa_reverse_suffix(text, start) {
568 dfa::Result::Match((s, e)) => Some((s, e)),
569 dfa::Result::NoMatch(_) => None,
570 dfa::Result::Quit => {
571 self.find_nfa(MatchNfaType::Auto, text, start)
572 }
573 }
574 }
575 MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
576 MatchType::Nothing => None,
577 #[cfg(feature = "perf-dfa")]
578 MatchType::DfaMany => {
579 unreachable!("BUG: RegexSet cannot be used with find")
580 }
581 }
582 }
583
584 /// Finds the start and end location of the leftmost-first match and also
585 /// fills in all matching capture groups.
586 ///
587 /// The number of capture slots given should be equal to the total number
588 /// of capture slots in the compiled program.
589 ///
590 /// Note that the first two slots always correspond to the start and end
591 /// locations of the overall match.
captures_read_at( &self, locs: &mut Locations, text: &[u8], start: usize, ) -> Option<(usize, usize)>592 fn captures_read_at(
593 &self,
594 locs: &mut Locations,
595 text: &[u8],
596 start: usize,
597 ) -> Option<(usize, usize)> {
598 let slots = locs.as_slots();
599 for slot in slots.iter_mut() {
600 *slot = None;
601 }
602 // If the caller unnecessarily uses this, then we try to save them
603 // from themselves.
604 match slots.len() {
605 0 => return self.find_at(text, start),
606 2 => {
607 return self.find_at(text, start).map(|(s, e)| {
608 slots[0] = Some(s);
609 slots[1] = Some(e);
610 (s, e)
611 });
612 }
613 _ => {} // fallthrough
614 }
615 if !self.is_anchor_end_match(text) {
616 return None;
617 }
618 match self.ro.match_type {
619 #[cfg(feature = "perf-literal")]
620 MatchType::Literal(ty) => {
621 self.find_literals(ty, text, start).and_then(|(s, e)| {
622 self.captures_nfa_type(
623 MatchNfaType::Auto,
624 slots,
625 text,
626 s,
627 e,
628 )
629 })
630 }
631 #[cfg(feature = "perf-dfa")]
632 MatchType::Dfa => {
633 if self.ro.nfa.is_anchored_start {
634 self.captures_nfa(slots, text, start)
635 } else {
636 match self.find_dfa_forward(text, start) {
637 dfa::Result::Match((s, e)) => self.captures_nfa_type(
638 MatchNfaType::Auto,
639 slots,
640 text,
641 s,
642 e,
643 ),
644 dfa::Result::NoMatch(_) => None,
645 dfa::Result::Quit => {
646 self.captures_nfa(slots, text, start)
647 }
648 }
649 }
650 }
651 #[cfg(feature = "perf-dfa")]
652 MatchType::DfaAnchoredReverse => {
653 match self.find_dfa_anchored_reverse(text, start) {
654 dfa::Result::Match((s, e)) => self.captures_nfa_type(
655 MatchNfaType::Auto,
656 slots,
657 text,
658 s,
659 e,
660 ),
661 dfa::Result::NoMatch(_) => None,
662 dfa::Result::Quit => self.captures_nfa(slots, text, start),
663 }
664 }
665 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
666 MatchType::DfaSuffix => {
667 match self.find_dfa_reverse_suffix(text, start) {
668 dfa::Result::Match((s, e)) => self.captures_nfa_type(
669 MatchNfaType::Auto,
670 slots,
671 text,
672 s,
673 e,
674 ),
675 dfa::Result::NoMatch(_) => None,
676 dfa::Result::Quit => self.captures_nfa(slots, text, start),
677 }
678 }
679 MatchType::Nfa(ty) => {
680 self.captures_nfa_type(ty, slots, text, start, text.len())
681 }
682 MatchType::Nothing => None,
683 #[cfg(feature = "perf-dfa")]
684 MatchType::DfaMany => {
685 unreachable!("BUG: RegexSet cannot be used with captures")
686 }
687 }
688 }
689 }
690
691 impl<'c> ExecNoSync<'c> {
692 /// Finds the leftmost-first match using only literal search.
693 #[cfg(feature = "perf-literal")]
694 #[cfg_attr(feature = "perf-inline", inline(always))]
find_literals( &self, ty: MatchLiteralType, text: &[u8], start: usize, ) -> Option<(usize, usize)>695 fn find_literals(
696 &self,
697 ty: MatchLiteralType,
698 text: &[u8],
699 start: usize,
700 ) -> Option<(usize, usize)> {
701 use self::MatchLiteralType::*;
702 match ty {
703 Unanchored => {
704 let lits = &self.ro.nfa.prefixes;
705 lits.find(&text[start..]).map(|(s, e)| (start + s, start + e))
706 }
707 AnchoredStart => {
708 let lits = &self.ro.nfa.prefixes;
709 if start == 0 || !self.ro.nfa.is_anchored_start {
710 lits.find_start(&text[start..])
711 .map(|(s, e)| (start + s, start + e))
712 } else {
713 None
714 }
715 }
716 AnchoredEnd => {
717 let lits = &self.ro.suffixes;
718 lits.find_end(&text[start..])
719 .map(|(s, e)| (start + s, start + e))
720 }
721 AhoCorasick => self
722 .ro
723 .ac
724 .as_ref()
725 .unwrap()
726 .find(&text[start..])
727 .map(|m| (start + m.start(), start + m.end())),
728 }
729 }
730
731 /// Finds the leftmost-first match (start and end) using only the DFA.
732 ///
733 /// If the result returned indicates that the DFA quit, then another
734 /// matching engine should be used.
735 #[cfg(feature = "perf-dfa")]
736 #[cfg_attr(feature = "perf-inline", inline(always))]
find_dfa_forward( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>737 fn find_dfa_forward(
738 &self,
739 text: &[u8],
740 start: usize,
741 ) -> dfa::Result<(usize, usize)> {
742 use dfa::Result::*;
743 let end = match dfa::Fsm::forward(
744 &self.ro.dfa,
745 self.cache.value(),
746 false,
747 text,
748 start,
749 ) {
750 NoMatch(i) => return NoMatch(i),
751 Quit => return Quit,
752 Match(end) if start == end => return Match((start, start)),
753 Match(end) => end,
754 };
755 // Now run the DFA in reverse to find the start of the match.
756 match dfa::Fsm::reverse(
757 &self.ro.dfa_reverse,
758 self.cache.value(),
759 false,
760 &text[start..],
761 end - start,
762 ) {
763 Match(s) => Match((start + s, end)),
764 NoMatch(i) => NoMatch(i),
765 Quit => Quit,
766 }
767 }
768
769 /// Finds the leftmost-first match (start and end) using only the DFA,
770 /// but assumes the regex is anchored at the end and therefore starts at
771 /// the end of the regex and matches in reverse.
772 ///
773 /// If the result returned indicates that the DFA quit, then another
774 /// matching engine should be used.
775 #[cfg(feature = "perf-dfa")]
776 #[cfg_attr(feature = "perf-inline", inline(always))]
find_dfa_anchored_reverse( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>777 fn find_dfa_anchored_reverse(
778 &self,
779 text: &[u8],
780 start: usize,
781 ) -> dfa::Result<(usize, usize)> {
782 use dfa::Result::*;
783 match dfa::Fsm::reverse(
784 &self.ro.dfa_reverse,
785 self.cache.value(),
786 false,
787 &text[start..],
788 text.len() - start,
789 ) {
790 Match(s) => Match((start + s, text.len())),
791 NoMatch(i) => NoMatch(i),
792 Quit => Quit,
793 }
794 }
795
796 /// Finds the end of the shortest match using only the DFA.
797 #[cfg(feature = "perf-dfa")]
798 #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize>799 fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
800 dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start)
801 }
802
803 /// Finds the end of the shortest match using only the DFA by scanning for
804 /// suffix literals.
805 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
806 #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<usize>807 fn shortest_dfa_reverse_suffix(
808 &self,
809 text: &[u8],
810 start: usize,
811 ) -> dfa::Result<usize> {
812 match self.exec_dfa_reverse_suffix(text, start) {
813 None => self.shortest_dfa(text, start),
814 Some(r) => r.map(|(_, end)| end),
815 }
816 }
817
818 /// Finds the end of the shortest match using only the DFA by scanning for
819 /// suffix literals. It also reports the start of the match.
820 ///
821 /// Note that if None is returned, then the optimization gave up to avoid
822 /// worst case quadratic behavior. A forward scanning DFA should be tried
823 /// next.
824 ///
825 /// If a match is returned and the full leftmost-first match is desired,
826 /// then a forward scan starting from the beginning of the match must be
827 /// done.
828 ///
829 /// If the result returned indicates that the DFA quit, then another
830 /// matching engine should be used.
831 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
832 #[cfg_attr(feature = "perf-inline", inline(always))]
exec_dfa_reverse_suffix( &self, text: &[u8], original_start: usize, ) -> Option<dfa::Result<(usize, usize)>>833 fn exec_dfa_reverse_suffix(
834 &self,
835 text: &[u8],
836 original_start: usize,
837 ) -> Option<dfa::Result<(usize, usize)>> {
838 use dfa::Result::*;
839
840 let lcs = self.ro.suffixes.lcs();
841 debug_assert!(lcs.len() >= 1);
842 let mut start = original_start;
843 let mut end = start;
844 let mut last_literal = start;
845 while end <= text.len() {
846 last_literal += match lcs.find(&text[last_literal..]) {
847 None => return Some(NoMatch(text.len())),
848 Some(i) => i,
849 };
850 end = last_literal + lcs.len();
851 match dfa::Fsm::reverse(
852 &self.ro.dfa_reverse,
853 self.cache.value(),
854 false,
855 &text[start..end],
856 end - start,
857 ) {
858 Match(0) | NoMatch(0) => return None,
859 Match(i) => return Some(Match((start + i, end))),
860 NoMatch(i) => {
861 start += i;
862 last_literal += 1;
863 continue;
864 }
865 Quit => return Some(Quit),
866 };
867 }
868 Some(NoMatch(text.len()))
869 }
870
871 /// Finds the leftmost-first match (start and end) using only the DFA
872 /// by scanning for suffix literals.
873 ///
874 /// If the result returned indicates that the DFA quit, then another
875 /// matching engine should be used.
876 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
877 #[cfg_attr(feature = "perf-inline", inline(always))]
find_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>878 fn find_dfa_reverse_suffix(
879 &self,
880 text: &[u8],
881 start: usize,
882 ) -> dfa::Result<(usize, usize)> {
883 use dfa::Result::*;
884
885 let match_start = match self.exec_dfa_reverse_suffix(text, start) {
886 None => return self.find_dfa_forward(text, start),
887 Some(Match((start, _))) => start,
888 Some(r) => return r,
889 };
890 // At this point, we've found a match. The only way to quit now
891 // without a match is if the DFA gives up (seems unlikely).
892 //
893 // Now run the DFA forwards to find the proper end of the match.
894 // (The suffix literal match can only indicate the earliest
895 // possible end location, which may appear before the end of the
896 // leftmost-first match.)
897 match dfa::Fsm::forward(
898 &self.ro.dfa,
899 self.cache.value(),
900 false,
901 text,
902 match_start,
903 ) {
904 NoMatch(_) => panic!("BUG: reverse match implies forward match"),
905 Quit => Quit,
906 Match(e) => Match((match_start, e)),
907 }
908 }
909
910 /// Executes the NFA engine to return whether there is a match or not.
911 ///
912 /// Ideally, we could use shortest_nfa(...).is_some() and get the same
913 /// performance characteristics, but regex sets don't have captures, which
914 /// shortest_nfa depends on.
915 #[cfg(feature = "perf-dfa")]
match_nfa(&self, text: &[u8], start: usize) -> bool916 fn match_nfa(&self, text: &[u8], start: usize) -> bool {
917 self.match_nfa_type(MatchNfaType::Auto, text, start)
918 }
919
920 /// Like match_nfa, but allows specification of the type of NFA engine.
match_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> bool921 fn match_nfa_type(
922 &self,
923 ty: MatchNfaType,
924 text: &[u8],
925 start: usize,
926 ) -> bool {
927 self.exec_nfa(
928 ty,
929 &mut [false],
930 &mut [],
931 true,
932 false,
933 text,
934 start,
935 text.len(),
936 )
937 }
938
939 /// Finds the shortest match using an NFA.
940 #[cfg(feature = "perf-dfa")]
shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize>941 fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
942 self.shortest_nfa_type(MatchNfaType::Auto, text, start)
943 }
944
945 /// Like shortest_nfa, but allows specification of the type of NFA engine.
shortest_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<usize>946 fn shortest_nfa_type(
947 &self,
948 ty: MatchNfaType,
949 text: &[u8],
950 start: usize,
951 ) -> Option<usize> {
952 let mut slots = [None, None];
953 if self.exec_nfa(
954 ty,
955 &mut [false],
956 &mut slots,
957 true,
958 true,
959 text,
960 start,
961 text.len(),
962 ) {
963 slots[1]
964 } else {
965 None
966 }
967 }
968
969 /// Like find, but executes an NFA engine.
find_nfa( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<(usize, usize)>970 fn find_nfa(
971 &self,
972 ty: MatchNfaType,
973 text: &[u8],
974 start: usize,
975 ) -> Option<(usize, usize)> {
976 let mut slots = [None, None];
977 if self.exec_nfa(
978 ty,
979 &mut [false],
980 &mut slots,
981 false,
982 false,
983 text,
984 start,
985 text.len(),
986 ) {
987 match (slots[0], slots[1]) {
988 (Some(s), Some(e)) => Some((s, e)),
989 _ => None,
990 }
991 } else {
992 None
993 }
994 }
995
996 /// Like find_nfa, but fills in captures.
997 ///
998 /// `slots` should have length equal to `2 * nfa.captures.len()`.
999 #[cfg(feature = "perf-dfa")]
captures_nfa( &self, slots: &mut [Slot], text: &[u8], start: usize, ) -> Option<(usize, usize)>1000 fn captures_nfa(
1001 &self,
1002 slots: &mut [Slot],
1003 text: &[u8],
1004 start: usize,
1005 ) -> Option<(usize, usize)> {
1006 self.captures_nfa_type(
1007 MatchNfaType::Auto,
1008 slots,
1009 text,
1010 start,
1011 text.len(),
1012 )
1013 }
1014
1015 /// Like captures_nfa, but allows specification of type of NFA engine.
captures_nfa_type( &self, ty: MatchNfaType, slots: &mut [Slot], text: &[u8], start: usize, end: usize, ) -> Option<(usize, usize)>1016 fn captures_nfa_type(
1017 &self,
1018 ty: MatchNfaType,
1019 slots: &mut [Slot],
1020 text: &[u8],
1021 start: usize,
1022 end: usize,
1023 ) -> Option<(usize, usize)> {
1024 if self.exec_nfa(
1025 ty,
1026 &mut [false],
1027 slots,
1028 false,
1029 false,
1030 text,
1031 start,
1032 end,
1033 ) {
1034 match (slots[0], slots[1]) {
1035 (Some(s), Some(e)) => Some((s, e)),
1036 _ => None,
1037 }
1038 } else {
1039 None
1040 }
1041 }
1042
exec_nfa( &self, mut ty: MatchNfaType, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, quit_after_match_with_pos: bool, text: &[u8], start: usize, end: usize, ) -> bool1043 fn exec_nfa(
1044 &self,
1045 mut ty: MatchNfaType,
1046 matches: &mut [bool],
1047 slots: &mut [Slot],
1048 quit_after_match: bool,
1049 quit_after_match_with_pos: bool,
1050 text: &[u8],
1051 start: usize,
1052 end: usize,
1053 ) -> bool {
1054 use self::MatchNfaType::*;
1055 if let Auto = ty {
1056 if backtrack::should_exec(self.ro.nfa.len(), text.len()) {
1057 ty = Backtrack;
1058 } else {
1059 ty = PikeVM;
1060 }
1061 }
1062 // The backtracker can't return the shortest match position as it is
1063 // implemented today. So if someone calls `shortest_match` and we need
1064 // to run an NFA, then use the PikeVM.
1065 if quit_after_match_with_pos || ty == PikeVM {
1066 self.exec_pikevm(
1067 matches,
1068 slots,
1069 quit_after_match,
1070 text,
1071 start,
1072 end,
1073 )
1074 } else {
1075 self.exec_backtrack(matches, slots, text, start, end)
1076 }
1077 }
1078
1079 /// Always run the NFA algorithm.
exec_pikevm( &self, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, text: &[u8], start: usize, end: usize, ) -> bool1080 fn exec_pikevm(
1081 &self,
1082 matches: &mut [bool],
1083 slots: &mut [Slot],
1084 quit_after_match: bool,
1085 text: &[u8],
1086 start: usize,
1087 end: usize,
1088 ) -> bool {
1089 if self.ro.nfa.uses_bytes() {
1090 pikevm::Fsm::exec(
1091 &self.ro.nfa,
1092 self.cache.value(),
1093 matches,
1094 slots,
1095 quit_after_match,
1096 ByteInput::new(text, self.ro.nfa.only_utf8),
1097 start,
1098 end,
1099 )
1100 } else {
1101 pikevm::Fsm::exec(
1102 &self.ro.nfa,
1103 self.cache.value(),
1104 matches,
1105 slots,
1106 quit_after_match,
1107 CharInput::new(text),
1108 start,
1109 end,
1110 )
1111 }
1112 }
1113
1114 /// Always runs the NFA using bounded backtracking.
exec_backtrack( &self, matches: &mut [bool], slots: &mut [Slot], text: &[u8], start: usize, end: usize, ) -> bool1115 fn exec_backtrack(
1116 &self,
1117 matches: &mut [bool],
1118 slots: &mut [Slot],
1119 text: &[u8],
1120 start: usize,
1121 end: usize,
1122 ) -> bool {
1123 if self.ro.nfa.uses_bytes() {
1124 backtrack::Bounded::exec(
1125 &self.ro.nfa,
1126 self.cache.value(),
1127 matches,
1128 slots,
1129 ByteInput::new(text, self.ro.nfa.only_utf8),
1130 start,
1131 end,
1132 )
1133 } else {
1134 backtrack::Bounded::exec(
1135 &self.ro.nfa,
1136 self.cache.value(),
1137 matches,
1138 slots,
1139 CharInput::new(text),
1140 start,
1141 end,
1142 )
1143 }
1144 }
1145
1146 /// Finds which regular expressions match the given text.
1147 ///
1148 /// `matches` should have length equal to the number of regexes being
1149 /// searched.
1150 ///
1151 /// This is only useful when one wants to know which regexes in a set
1152 /// match some text.
many_matches_at( &self, matches: &mut [bool], text: &[u8], start: usize, ) -> bool1153 pub fn many_matches_at(
1154 &self,
1155 matches: &mut [bool],
1156 text: &[u8],
1157 start: usize,
1158 ) -> bool {
1159 use self::MatchType::*;
1160 if !self.is_anchor_end_match(text) {
1161 return false;
1162 }
1163 match self.ro.match_type {
1164 #[cfg(feature = "perf-literal")]
1165 Literal(ty) => {
1166 debug_assert_eq!(matches.len(), 1);
1167 matches[0] = self.find_literals(ty, text, start).is_some();
1168 matches[0]
1169 }
1170 #[cfg(feature = "perf-dfa")]
1171 Dfa | DfaAnchoredReverse | DfaMany => {
1172 match dfa::Fsm::forward_many(
1173 &self.ro.dfa,
1174 self.cache.value(),
1175 matches,
1176 text,
1177 start,
1178 ) {
1179 dfa::Result::Match(_) => true,
1180 dfa::Result::NoMatch(_) => false,
1181 dfa::Result::Quit => self.exec_nfa(
1182 MatchNfaType::Auto,
1183 matches,
1184 &mut [],
1185 false,
1186 false,
1187 text,
1188 start,
1189 text.len(),
1190 ),
1191 }
1192 }
1193 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1194 DfaSuffix => {
1195 match dfa::Fsm::forward_many(
1196 &self.ro.dfa,
1197 self.cache.value(),
1198 matches,
1199 text,
1200 start,
1201 ) {
1202 dfa::Result::Match(_) => true,
1203 dfa::Result::NoMatch(_) => false,
1204 dfa::Result::Quit => self.exec_nfa(
1205 MatchNfaType::Auto,
1206 matches,
1207 &mut [],
1208 false,
1209 false,
1210 text,
1211 start,
1212 text.len(),
1213 ),
1214 }
1215 }
1216 Nfa(ty) => self.exec_nfa(
1217 ty,
1218 matches,
1219 &mut [],
1220 false,
1221 false,
1222 text,
1223 start,
1224 text.len(),
1225 ),
1226 Nothing => false,
1227 }
1228 }
1229
1230 #[cfg_attr(feature = "perf-inline", inline(always))]
is_anchor_end_match(&self, text: &[u8]) -> bool1231 fn is_anchor_end_match(&self, text: &[u8]) -> bool {
1232 #[cfg(not(feature = "perf-literal"))]
1233 fn imp(_: &ExecReadOnly, _: &[u8]) -> bool {
1234 true
1235 }
1236
1237 #[cfg(feature = "perf-literal")]
1238 fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool {
1239 // Only do this check if the haystack is big (>1MB).
1240 if text.len() > (1 << 20) && ro.nfa.is_anchored_end {
1241 let lcs = ro.suffixes.lcs();
1242 if lcs.len() >= 1 && !lcs.is_suffix(text) {
1243 return false;
1244 }
1245 }
1246 true
1247 }
1248
1249 imp(&self.ro, text)
1250 }
1251
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1252 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1253 &self.ro.nfa.capture_name_idx
1254 }
1255 }
1256
1257 impl<'c> ExecNoSyncStr<'c> {
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1258 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1259 self.0.capture_name_idx()
1260 }
1261 }
1262
1263 impl Exec {
1264 /// Get a searcher that isn't Sync.
1265 #[cfg_attr(feature = "perf-inline", inline(always))]
searcher(&self) -> ExecNoSync1266 pub fn searcher(&self) -> ExecNoSync {
1267 ExecNoSync {
1268 ro: &self.ro, // a clone is too expensive here! (and not needed)
1269 cache: self.pool.get(),
1270 }
1271 }
1272
1273 /// Get a searcher that isn't Sync and can match on &str.
1274 #[cfg_attr(feature = "perf-inline", inline(always))]
searcher_str(&self) -> ExecNoSyncStr1275 pub fn searcher_str(&self) -> ExecNoSyncStr {
1276 ExecNoSyncStr(self.searcher())
1277 }
1278
1279 /// Build a Regex from this executor.
into_regex(self) -> re_unicode::Regex1280 pub fn into_regex(self) -> re_unicode::Regex {
1281 re_unicode::Regex::from(self)
1282 }
1283
1284 /// Build a RegexSet from this executor.
into_regex_set(self) -> re_set::unicode::RegexSet1285 pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
1286 re_set::unicode::RegexSet::from(self)
1287 }
1288
1289 /// Build a Regex from this executor that can match arbitrary bytes.
into_byte_regex(self) -> re_bytes::Regex1290 pub fn into_byte_regex(self) -> re_bytes::Regex {
1291 re_bytes::Regex::from(self)
1292 }
1293
1294 /// Build a RegexSet from this executor that can match arbitrary bytes.
into_byte_regex_set(self) -> re_set::bytes::RegexSet1295 pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
1296 re_set::bytes::RegexSet::from(self)
1297 }
1298
1299 /// The original regular expressions given by the caller that were
1300 /// compiled.
regex_strings(&self) -> &[String]1301 pub fn regex_strings(&self) -> &[String] {
1302 &self.ro.res
1303 }
1304
1305 /// Return a slice of capture names.
1306 ///
1307 /// Any capture that isn't named is None.
capture_names(&self) -> &[Option<String>]1308 pub fn capture_names(&self) -> &[Option<String>] {
1309 &self.ro.nfa.captures
1310 }
1311
1312 /// Return a reference to named groups mapping (from group name to
1313 /// group position).
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1314 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1315 &self.ro.nfa.capture_name_idx
1316 }
1317 }
1318
1319 impl Clone for Exec {
clone(&self) -> Exec1320 fn clone(&self) -> Exec {
1321 let pool = ExecReadOnly::new_pool(&self.ro);
1322 Exec { ro: self.ro.clone(), pool }
1323 }
1324 }
1325
1326 impl ExecReadOnly {
choose_match_type(&self, hint: Option<MatchType>) -> MatchType1327 fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
1328 if let Some(MatchType::Nfa(_)) = hint {
1329 return hint.unwrap();
1330 }
1331 // If the NFA is empty, then we'll never match anything.
1332 if self.nfa.insts.is_empty() {
1333 return MatchType::Nothing;
1334 }
1335 if let Some(literalty) = self.choose_literal_match_type() {
1336 return literalty;
1337 }
1338 if let Some(dfaty) = self.choose_dfa_match_type() {
1339 return dfaty;
1340 }
1341 // We're so totally hosed.
1342 MatchType::Nfa(MatchNfaType::Auto)
1343 }
1344
1345 /// If a plain literal scan can be used, then a corresponding literal
1346 /// search type is returned.
choose_literal_match_type(&self) -> Option<MatchType>1347 fn choose_literal_match_type(&self) -> Option<MatchType> {
1348 #[cfg(not(feature = "perf-literal"))]
1349 fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1350 None
1351 }
1352
1353 #[cfg(feature = "perf-literal")]
1354 fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1355 // If our set of prefixes is complete, then we can use it to find
1356 // a match in lieu of a regex engine. This doesn't quite work well
1357 // in the presence of multiple regexes, so only do it when there's
1358 // one.
1359 //
1360 // TODO(burntsushi): Also, don't try to match literals if the regex
1361 // is partially anchored. We could technically do it, but we'd need
1362 // to create two sets of literals: all of them and then the subset
1363 // that aren't anchored. We would then only search for all of them
1364 // when at the beginning of the input and use the subset in all
1365 // other cases.
1366 if ro.res.len() != 1 {
1367 return None;
1368 }
1369 if ro.ac.is_some() {
1370 return Some(MatchType::Literal(
1371 MatchLiteralType::AhoCorasick,
1372 ));
1373 }
1374 if ro.nfa.prefixes.complete() {
1375 return if ro.nfa.is_anchored_start {
1376 Some(MatchType::Literal(MatchLiteralType::AnchoredStart))
1377 } else {
1378 Some(MatchType::Literal(MatchLiteralType::Unanchored))
1379 };
1380 }
1381 if ro.suffixes.complete() {
1382 return if ro.nfa.is_anchored_end {
1383 Some(MatchType::Literal(MatchLiteralType::AnchoredEnd))
1384 } else {
1385 // This case shouldn't happen. When the regex isn't
1386 // anchored, then complete prefixes should imply complete
1387 // suffixes.
1388 Some(MatchType::Literal(MatchLiteralType::Unanchored))
1389 };
1390 }
1391 None
1392 }
1393
1394 imp(self)
1395 }
1396
1397 /// If a DFA scan can be used, then choose the appropriate DFA strategy.
choose_dfa_match_type(&self) -> Option<MatchType>1398 fn choose_dfa_match_type(&self) -> Option<MatchType> {
1399 #[cfg(not(feature = "perf-dfa"))]
1400 fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1401 None
1402 }
1403
1404 #[cfg(feature = "perf-dfa")]
1405 fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1406 if !dfa::can_exec(&ro.dfa) {
1407 return None;
1408 }
1409 // Regex sets require a slightly specialized path.
1410 if ro.res.len() >= 2 {
1411 return Some(MatchType::DfaMany);
1412 }
1413 // If the regex is anchored at the end but not the start, then
1414 // just match in reverse from the end of the haystack.
1415 if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end {
1416 return Some(MatchType::DfaAnchoredReverse);
1417 }
1418 #[cfg(feature = "perf-literal")]
1419 {
1420 // If there's a longish suffix literal, then it might be faster
1421 // to look for that first.
1422 if ro.should_suffix_scan() {
1423 return Some(MatchType::DfaSuffix);
1424 }
1425 }
1426 // Fall back to your garden variety forward searching lazy DFA.
1427 Some(MatchType::Dfa)
1428 }
1429
1430 imp(self)
1431 }
1432
1433 /// Returns true if the program is amenable to suffix scanning.
1434 ///
1435 /// When this is true, as a heuristic, we assume it is OK to quickly scan
1436 /// for suffix literals and then do a *reverse* DFA match from any matches
1437 /// produced by the literal scan. (And then followed by a forward DFA
1438 /// search, since the previously found suffix literal maybe not actually be
1439 /// the end of a match.)
1440 ///
1441 /// This is a bit of a specialized optimization, but can result in pretty
1442 /// big performance wins if 1) there are no prefix literals and 2) the
1443 /// suffix literals are pretty rare in the text. (1) is obviously easy to
1444 /// account for but (2) is harder. As a proxy, we assume that longer
1445 /// strings are generally rarer, so we only enable this optimization when
1446 /// we have a meaty suffix.
1447 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
should_suffix_scan(&self) -> bool1448 fn should_suffix_scan(&self) -> bool {
1449 if self.suffixes.is_empty() {
1450 return false;
1451 }
1452 let lcs_len = self.suffixes.lcs().char_len();
1453 lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
1454 }
1455
new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>>1456 fn new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>> {
1457 let ro = ro.clone();
1458 Box::new(Pool::new(Box::new(move || {
1459 AssertUnwindSafe(RefCell::new(ProgramCacheInner::new(&ro)))
1460 })))
1461 }
1462 }
1463
1464 #[derive(Clone, Copy, Debug)]
1465 enum MatchType {
1466 /// A single or multiple literal search. This is only used when the regex
1467 /// can be decomposed into a literal search.
1468 #[cfg(feature = "perf-literal")]
1469 Literal(MatchLiteralType),
1470 /// A normal DFA search.
1471 #[cfg(feature = "perf-dfa")]
1472 Dfa,
1473 /// A reverse DFA search starting from the end of a haystack.
1474 #[cfg(feature = "perf-dfa")]
1475 DfaAnchoredReverse,
1476 /// A reverse DFA search with suffix literal scanning.
1477 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1478 DfaSuffix,
1479 /// Use the DFA on two or more regular expressions.
1480 #[cfg(feature = "perf-dfa")]
1481 DfaMany,
1482 /// An NFA variant.
1483 Nfa(MatchNfaType),
1484 /// No match is ever possible, so don't ever try to search.
1485 Nothing,
1486 }
1487
1488 #[derive(Clone, Copy, Debug)]
1489 #[cfg(feature = "perf-literal")]
1490 enum MatchLiteralType {
1491 /// Match literals anywhere in text.
1492 Unanchored,
1493 /// Match literals only at the start of text.
1494 AnchoredStart,
1495 /// Match literals only at the end of text.
1496 AnchoredEnd,
1497 /// Use an Aho-Corasick automaton. This requires `ac` to be Some on
1498 /// ExecReadOnly.
1499 AhoCorasick,
1500 }
1501
1502 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
1503 enum MatchNfaType {
1504 /// Choose between Backtrack and PikeVM.
1505 Auto,
1506 /// NFA bounded backtracking.
1507 ///
1508 /// (This is only set by tests, since it never makes sense to always want
1509 /// backtracking.)
1510 Backtrack,
1511 /// The Pike VM.
1512 ///
1513 /// (This is only set by tests, since it never makes sense to always want
1514 /// the Pike VM.)
1515 PikeVM,
1516 }
1517
1518 /// `ProgramCache` maintains reusable allocations for each matching engine
1519 /// available to a particular program.
1520 ///
1521 /// We declare this as unwind safe since it's a cache that's only used for
1522 /// performance purposes. If a panic occurs, it is (or should be) always safe
1523 /// to continue using the same regex object.
1524 pub type ProgramCache = AssertUnwindSafe<RefCell<ProgramCacheInner>>;
1525
1526 #[derive(Debug)]
1527 pub struct ProgramCacheInner {
1528 pub pikevm: pikevm::Cache,
1529 pub backtrack: backtrack::Cache,
1530 #[cfg(feature = "perf-dfa")]
1531 pub dfa: dfa::Cache,
1532 #[cfg(feature = "perf-dfa")]
1533 pub dfa_reverse: dfa::Cache,
1534 }
1535
1536 impl ProgramCacheInner {
new(ro: &ExecReadOnly) -> Self1537 fn new(ro: &ExecReadOnly) -> Self {
1538 ProgramCacheInner {
1539 pikevm: pikevm::Cache::new(&ro.nfa),
1540 backtrack: backtrack::Cache::new(&ro.nfa),
1541 #[cfg(feature = "perf-dfa")]
1542 dfa: dfa::Cache::new(&ro.dfa),
1543 #[cfg(feature = "perf-dfa")]
1544 dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
1545 }
1546 }
1547 }
1548
1549 /// Alternation literals checks if the given HIR is a simple alternation of
1550 /// literals, and if so, returns them. Otherwise, this returns None.
1551 #[cfg(feature = "perf-literal")]
alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>>1552 fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
1553 use syntax::hir::{HirKind, Literal};
1554
1555 // This is pretty hacky, but basically, if `is_alternation_literal` is
1556 // true, then we can make several assumptions about the structure of our
1557 // HIR. This is what justifies the `unreachable!` statements below.
1558 //
1559 // This code should be refactored once we overhaul this crate's
1560 // optimization pipeline, because this is a terribly inflexible way to go
1561 // about things.
1562
1563 if !expr.is_alternation_literal() {
1564 return None;
1565 }
1566 let alts = match *expr.kind() {
1567 HirKind::Alternation(ref alts) => alts,
1568 _ => return None, // one literal isn't worth it
1569 };
1570
1571 let extendlit = |lit: &Literal, dst: &mut Vec<u8>| match *lit {
1572 Literal::Unicode(c) => {
1573 let mut buf = [0; 4];
1574 dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
1575 }
1576 Literal::Byte(b) => {
1577 dst.push(b);
1578 }
1579 };
1580
1581 let mut lits = vec![];
1582 for alt in alts {
1583 let mut lit = vec![];
1584 match *alt.kind() {
1585 HirKind::Literal(ref x) => extendlit(x, &mut lit),
1586 HirKind::Concat(ref exprs) => {
1587 for e in exprs {
1588 match *e.kind() {
1589 HirKind::Literal(ref x) => extendlit(x, &mut lit),
1590 _ => unreachable!("expected literal, got {:?}", e),
1591 }
1592 }
1593 }
1594 _ => unreachable!("expected literal or concat, got {:?}", alt),
1595 }
1596 lits.push(lit);
1597 }
1598 Some(lits)
1599 }
1600
1601 #[cfg(test)]
1602 mod test {
1603 #[test]
uppercut_s_backtracking_bytes_default_bytes_mismatch()1604 fn uppercut_s_backtracking_bytes_default_bytes_mismatch() {
1605 use internal::ExecBuilder;
1606
1607 let backtrack_bytes_re = ExecBuilder::new("^S")
1608 .bounded_backtracking()
1609 .only_utf8(false)
1610 .build()
1611 .map(|exec| exec.into_byte_regex())
1612 .map_err(|err| format!("{}", err))
1613 .unwrap();
1614
1615 let default_bytes_re = ExecBuilder::new("^S")
1616 .only_utf8(false)
1617 .build()
1618 .map(|exec| exec.into_byte_regex())
1619 .map_err(|err| format!("{}", err))
1620 .unwrap();
1621
1622 let input = vec![83, 83];
1623
1624 let s1 = backtrack_bytes_re.split(&input);
1625 let s2 = default_bytes_re.split(&input);
1626 for (chunk1, chunk2) in s1.zip(s2) {
1627 assert_eq!(chunk1, chunk2);
1628 }
1629 }
1630
1631 #[test]
unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch()1632 fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() {
1633 use internal::ExecBuilder;
1634
1635 let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1636 .bounded_backtracking()
1637 .bytes(true)
1638 .build()
1639 .map(|exec| exec.into_regex())
1640 .map_err(|err| format!("{}", err))
1641 .unwrap();
1642
1643 let default_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1644 .bytes(true)
1645 .build()
1646 .map(|exec| exec.into_regex())
1647 .map_err(|err| format!("{}", err))
1648 .unwrap();
1649
1650 let input = "**";
1651
1652 let s1 = backtrack_bytes_re.split(input);
1653 let s2 = default_bytes_re.split(input);
1654 for (chunk1, chunk2) in s1.zip(s2) {
1655 assert_eq!(chunk1, chunk2);
1656 }
1657 }
1658 }
1659