• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use std::mem;
2 
3 use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder};
4 use memchr::{memchr, memchr2, memchr3, memmem};
5 use regex_syntax::hir::literal::{Literal, Literals};
6 
7 /// A prefix extracted from a compiled regular expression.
8 ///
9 /// A regex prefix is a set of literal strings that *must* be matched at the
10 /// beginning of a regex in order for the entire regex to match. Similarly
11 /// for a regex suffix.
12 #[derive(Clone, Debug)]
13 pub struct LiteralSearcher {
14     complete: bool,
15     lcp: Memmem,
16     lcs: Memmem,
17     matcher: Matcher,
18 }
19 
20 #[derive(Clone, Debug)]
21 enum Matcher {
22     /// No literals. (Never advances through the input.)
23     Empty,
24     /// A set of four or more single byte literals.
25     Bytes(SingleByteSet),
26     /// A single substring, using vector accelerated routines when available.
27     Memmem(Memmem),
28     /// An Aho-Corasick automaton.
29     AC { ac: AhoCorasick<u32>, lits: Vec<Literal> },
30     /// A packed multiple substring searcher, using SIMD.
31     ///
32     /// Note that Aho-Corasick will actually use this packed searcher
33     /// internally automatically, however, there is some overhead associated
34     /// with going through the Aho-Corasick machinery. So using the packed
35     /// searcher directly results in some gains.
36     Packed { s: packed::Searcher, lits: Vec<Literal> },
37 }
38 
39 impl LiteralSearcher {
40     /// Returns a matcher that never matches and never advances the input.
empty() -> Self41     pub fn empty() -> Self {
42         Self::new(Literals::empty(), Matcher::Empty)
43     }
44 
45     /// Returns a matcher for literal prefixes from the given set.
prefixes(lits: Literals) -> Self46     pub fn prefixes(lits: Literals) -> Self {
47         let matcher = Matcher::prefixes(&lits);
48         Self::new(lits, matcher)
49     }
50 
51     /// Returns a matcher for literal suffixes from the given set.
suffixes(lits: Literals) -> Self52     pub fn suffixes(lits: Literals) -> Self {
53         let matcher = Matcher::suffixes(&lits);
54         Self::new(lits, matcher)
55     }
56 
new(lits: Literals, matcher: Matcher) -> Self57     fn new(lits: Literals, matcher: Matcher) -> Self {
58         let complete = lits.all_complete();
59         LiteralSearcher {
60             complete,
61             lcp: Memmem::new(lits.longest_common_prefix()),
62             lcs: Memmem::new(lits.longest_common_suffix()),
63             matcher,
64         }
65     }
66 
67     /// Returns true if all matches comprise the entire regular expression.
68     ///
69     /// This does not necessarily mean that a literal match implies a match
70     /// of the regular expression. For example, the regular expression `^a`
71     /// is comprised of a single complete literal `a`, but the regular
72     /// expression demands that it only match at the beginning of a string.
complete(&self) -> bool73     pub fn complete(&self) -> bool {
74         self.complete && !self.is_empty()
75     }
76 
77     /// Find the position of a literal in `haystack` if it exists.
78     #[cfg_attr(feature = "perf-inline", inline(always))]
find(&self, haystack: &[u8]) -> Option<(usize, usize)>79     pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> {
80         use self::Matcher::*;
81         match self.matcher {
82             Empty => Some((0, 0)),
83             Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)),
84             Memmem(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
85             AC { ref ac, .. } => {
86                 ac.find(haystack).map(|m| (m.start(), m.end()))
87             }
88             Packed { ref s, .. } => {
89                 s.find(haystack).map(|m| (m.start(), m.end()))
90             }
91         }
92     }
93 
94     /// Like find, except matches must start at index `0`.
find_start(&self, haystack: &[u8]) -> Option<(usize, usize)>95     pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> {
96         for lit in self.iter() {
97             if lit.len() > haystack.len() {
98                 continue;
99             }
100             if lit == &haystack[0..lit.len()] {
101                 return Some((0, lit.len()));
102             }
103         }
104         None
105     }
106 
107     /// Like find, except matches must end at index `haystack.len()`.
find_end(&self, haystack: &[u8]) -> Option<(usize, usize)>108     pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> {
109         for lit in self.iter() {
110             if lit.len() > haystack.len() {
111                 continue;
112             }
113             if lit == &haystack[haystack.len() - lit.len()..] {
114                 return Some((haystack.len() - lit.len(), haystack.len()));
115             }
116         }
117         None
118     }
119 
120     /// Returns an iterator over all literals to be matched.
iter(&self) -> LiteralIter<'_>121     pub fn iter(&self) -> LiteralIter<'_> {
122         match self.matcher {
123             Matcher::Empty => LiteralIter::Empty,
124             Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense),
125             Matcher::Memmem(ref s) => LiteralIter::Single(&s.finder.needle()),
126             Matcher::AC { ref lits, .. } => LiteralIter::AC(lits),
127             Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits),
128         }
129     }
130 
131     /// Returns a matcher for the longest common prefix of this matcher.
lcp(&self) -> &Memmem132     pub fn lcp(&self) -> &Memmem {
133         &self.lcp
134     }
135 
136     /// Returns a matcher for the longest common suffix of this matcher.
lcs(&self) -> &Memmem137     pub fn lcs(&self) -> &Memmem {
138         &self.lcs
139     }
140 
141     /// Returns true iff this prefix is empty.
is_empty(&self) -> bool142     pub fn is_empty(&self) -> bool {
143         self.len() == 0
144     }
145 
146     /// Returns the number of prefixes in this machine.
len(&self) -> usize147     pub fn len(&self) -> usize {
148         use self::Matcher::*;
149         match self.matcher {
150             Empty => 0,
151             Bytes(ref sset) => sset.dense.len(),
152             Memmem(_) => 1,
153             AC { ref ac, .. } => ac.pattern_count(),
154             Packed { ref lits, .. } => lits.len(),
155         }
156     }
157 
158     /// Return the approximate heap usage of literals in bytes.
approximate_size(&self) -> usize159     pub fn approximate_size(&self) -> usize {
160         use self::Matcher::*;
161         match self.matcher {
162             Empty => 0,
163             Bytes(ref sset) => sset.approximate_size(),
164             Memmem(ref single) => single.approximate_size(),
165             AC { ref ac, .. } => ac.heap_bytes(),
166             Packed { ref s, .. } => s.heap_bytes(),
167         }
168     }
169 }
170 
171 impl Matcher {
prefixes(lits: &Literals) -> Self172     fn prefixes(lits: &Literals) -> Self {
173         let sset = SingleByteSet::prefixes(lits);
174         Matcher::new(lits, sset)
175     }
176 
suffixes(lits: &Literals) -> Self177     fn suffixes(lits: &Literals) -> Self {
178         let sset = SingleByteSet::suffixes(lits);
179         Matcher::new(lits, sset)
180     }
181 
new(lits: &Literals, sset: SingleByteSet) -> Self182     fn new(lits: &Literals, sset: SingleByteSet) -> Self {
183         if lits.literals().is_empty() {
184             return Matcher::Empty;
185         }
186         if sset.dense.len() >= 26 {
187             // Avoid trying to match a large number of single bytes.
188             // This is *very* sensitive to a frequency analysis comparison
189             // between the bytes in sset and the composition of the haystack.
190             // No matter the size of sset, if its members all are rare in the
191             // haystack, then it'd be worth using it. How to tune this... IDK.
192             // ---AG
193             return Matcher::Empty;
194         }
195         if sset.complete {
196             return Matcher::Bytes(sset);
197         }
198         if lits.literals().len() == 1 {
199             return Matcher::Memmem(Memmem::new(&lits.literals()[0]));
200         }
201 
202         let pats = lits.literals().to_owned();
203         let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii;
204         if lits.literals().len() <= 100 && !is_aho_corasick_fast {
205             let mut builder = packed::Config::new()
206                 .match_kind(packed::MatchKind::LeftmostFirst)
207                 .builder();
208             if let Some(s) = builder.extend(&pats).build() {
209                 return Matcher::Packed { s, lits: pats };
210             }
211         }
212         let ac = AhoCorasickBuilder::new()
213             .match_kind(aho_corasick::MatchKind::LeftmostFirst)
214             .dfa(true)
215             .build_with_size::<u32, _, _>(&pats)
216             .unwrap();
217         Matcher::AC { ac, lits: pats }
218     }
219 }
220 
221 #[derive(Debug)]
222 pub enum LiteralIter<'a> {
223     Empty,
224     Bytes(&'a [u8]),
225     Single(&'a [u8]),
226     AC(&'a [Literal]),
227     Packed(&'a [Literal]),
228 }
229 
230 impl<'a> Iterator for LiteralIter<'a> {
231     type Item = &'a [u8];
232 
next(&mut self) -> Option<Self::Item>233     fn next(&mut self) -> Option<Self::Item> {
234         match *self {
235             LiteralIter::Empty => None,
236             LiteralIter::Bytes(ref mut many) => {
237                 if many.is_empty() {
238                     None
239                 } else {
240                     let next = &many[0..1];
241                     *many = &many[1..];
242                     Some(next)
243                 }
244             }
245             LiteralIter::Single(ref mut one) => {
246                 if one.is_empty() {
247                     None
248                 } else {
249                     let next = &one[..];
250                     *one = &[];
251                     Some(next)
252                 }
253             }
254             LiteralIter::AC(ref mut lits) => {
255                 if lits.is_empty() {
256                     None
257                 } else {
258                     let next = &lits[0];
259                     *lits = &lits[1..];
260                     Some(&**next)
261                 }
262             }
263             LiteralIter::Packed(ref mut lits) => {
264                 if lits.is_empty() {
265                     None
266                 } else {
267                     let next = &lits[0];
268                     *lits = &lits[1..];
269                     Some(&**next)
270                 }
271             }
272         }
273     }
274 }
275 
276 #[derive(Clone, Debug)]
277 struct SingleByteSet {
278     sparse: Vec<bool>,
279     dense: Vec<u8>,
280     complete: bool,
281     all_ascii: bool,
282 }
283 
284 impl SingleByteSet {
new() -> SingleByteSet285     fn new() -> SingleByteSet {
286         SingleByteSet {
287             sparse: vec![false; 256],
288             dense: vec![],
289             complete: true,
290             all_ascii: true,
291         }
292     }
293 
prefixes(lits: &Literals) -> SingleByteSet294     fn prefixes(lits: &Literals) -> SingleByteSet {
295         let mut sset = SingleByteSet::new();
296         for lit in lits.literals() {
297             sset.complete = sset.complete && lit.len() == 1;
298             if let Some(&b) = lit.get(0) {
299                 if !sset.sparse[b as usize] {
300                     if b > 0x7F {
301                         sset.all_ascii = false;
302                     }
303                     sset.dense.push(b);
304                     sset.sparse[b as usize] = true;
305                 }
306             }
307         }
308         sset
309     }
310 
suffixes(lits: &Literals) -> SingleByteSet311     fn suffixes(lits: &Literals) -> SingleByteSet {
312         let mut sset = SingleByteSet::new();
313         for lit in lits.literals() {
314             sset.complete = sset.complete && lit.len() == 1;
315             if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) {
316                 if !sset.sparse[b as usize] {
317                     if b > 0x7F {
318                         sset.all_ascii = false;
319                     }
320                     sset.dense.push(b);
321                     sset.sparse[b as usize] = true;
322                 }
323             }
324         }
325         sset
326     }
327 
328     /// Faster find that special cases certain sizes to use memchr.
329     #[cfg_attr(feature = "perf-inline", inline(always))]
find(&self, text: &[u8]) -> Option<usize>330     fn find(&self, text: &[u8]) -> Option<usize> {
331         match self.dense.len() {
332             0 => None,
333             1 => memchr(self.dense[0], text),
334             2 => memchr2(self.dense[0], self.dense[1], text),
335             3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text),
336             _ => self._find(text),
337         }
338     }
339 
340     /// Generic find that works on any sized set.
_find(&self, haystack: &[u8]) -> Option<usize>341     fn _find(&self, haystack: &[u8]) -> Option<usize> {
342         for (i, &b) in haystack.iter().enumerate() {
343             if self.sparse[b as usize] {
344                 return Some(i);
345             }
346         }
347         None
348     }
349 
approximate_size(&self) -> usize350     fn approximate_size(&self) -> usize {
351         (self.dense.len() * mem::size_of::<u8>())
352             + (self.sparse.len() * mem::size_of::<bool>())
353     }
354 }
355 
356 /// A simple wrapper around the memchr crate's memmem implementation.
357 ///
358 /// The API this exposes mirrors the API of previous substring searchers that
359 /// this supplanted.
360 #[derive(Clone, Debug)]
361 pub struct Memmem {
362     finder: memmem::Finder<'static>,
363     char_len: usize,
364 }
365 
366 impl Memmem {
new(pat: &[u8]) -> Memmem367     fn new(pat: &[u8]) -> Memmem {
368         Memmem {
369             finder: memmem::Finder::new(pat).into_owned(),
370             char_len: char_len_lossy(pat),
371         }
372     }
373 
374     #[cfg_attr(feature = "perf-inline", inline(always))]
find(&self, haystack: &[u8]) -> Option<usize>375     pub fn find(&self, haystack: &[u8]) -> Option<usize> {
376         self.finder.find(haystack)
377     }
378 
379     #[cfg_attr(feature = "perf-inline", inline(always))]
is_suffix(&self, text: &[u8]) -> bool380     pub fn is_suffix(&self, text: &[u8]) -> bool {
381         if text.len() < self.len() {
382             return false;
383         }
384         &text[text.len() - self.len()..] == self.finder.needle()
385     }
386 
len(&self) -> usize387     pub fn len(&self) -> usize {
388         self.finder.needle().len()
389     }
390 
char_len(&self) -> usize391     pub fn char_len(&self) -> usize {
392         self.char_len
393     }
394 
approximate_size(&self) -> usize395     fn approximate_size(&self) -> usize {
396         self.finder.needle().len() * mem::size_of::<u8>()
397     }
398 }
399 
char_len_lossy(bytes: &[u8]) -> usize400 fn char_len_lossy(bytes: &[u8]) -> usize {
401     String::from_utf8_lossy(bytes).chars().count()
402 }
403