1 use std::mem;
2
3 use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder};
4 use memchr::{memchr, memchr2, memchr3, memmem};
5 use regex_syntax::hir::literal::{Literal, Literals};
6
7 /// A prefix extracted from a compiled regular expression.
8 ///
9 /// A regex prefix is a set of literal strings that *must* be matched at the
10 /// beginning of a regex in order for the entire regex to match. Similarly
11 /// for a regex suffix.
12 #[derive(Clone, Debug)]
13 pub struct LiteralSearcher {
14 complete: bool,
15 lcp: Memmem,
16 lcs: Memmem,
17 matcher: Matcher,
18 }
19
20 #[derive(Clone, Debug)]
21 enum Matcher {
22 /// No literals. (Never advances through the input.)
23 Empty,
24 /// A set of four or more single byte literals.
25 Bytes(SingleByteSet),
26 /// A single substring, using vector accelerated routines when available.
27 Memmem(Memmem),
28 /// An Aho-Corasick automaton.
29 AC { ac: AhoCorasick<u32>, lits: Vec<Literal> },
30 /// A packed multiple substring searcher, using SIMD.
31 ///
32 /// Note that Aho-Corasick will actually use this packed searcher
33 /// internally automatically, however, there is some overhead associated
34 /// with going through the Aho-Corasick machinery. So using the packed
35 /// searcher directly results in some gains.
36 Packed { s: packed::Searcher, lits: Vec<Literal> },
37 }
38
39 impl LiteralSearcher {
40 /// Returns a matcher that never matches and never advances the input.
empty() -> Self41 pub fn empty() -> Self {
42 Self::new(Literals::empty(), Matcher::Empty)
43 }
44
45 /// Returns a matcher for literal prefixes from the given set.
prefixes(lits: Literals) -> Self46 pub fn prefixes(lits: Literals) -> Self {
47 let matcher = Matcher::prefixes(&lits);
48 Self::new(lits, matcher)
49 }
50
51 /// Returns a matcher for literal suffixes from the given set.
suffixes(lits: Literals) -> Self52 pub fn suffixes(lits: Literals) -> Self {
53 let matcher = Matcher::suffixes(&lits);
54 Self::new(lits, matcher)
55 }
56
new(lits: Literals, matcher: Matcher) -> Self57 fn new(lits: Literals, matcher: Matcher) -> Self {
58 let complete = lits.all_complete();
59 LiteralSearcher {
60 complete,
61 lcp: Memmem::new(lits.longest_common_prefix()),
62 lcs: Memmem::new(lits.longest_common_suffix()),
63 matcher,
64 }
65 }
66
67 /// Returns true if all matches comprise the entire regular expression.
68 ///
69 /// This does not necessarily mean that a literal match implies a match
70 /// of the regular expression. For example, the regular expression `^a`
71 /// is comprised of a single complete literal `a`, but the regular
72 /// expression demands that it only match at the beginning of a string.
complete(&self) -> bool73 pub fn complete(&self) -> bool {
74 self.complete && !self.is_empty()
75 }
76
77 /// Find the position of a literal in `haystack` if it exists.
78 #[cfg_attr(feature = "perf-inline", inline(always))]
find(&self, haystack: &[u8]) -> Option<(usize, usize)>79 pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> {
80 use self::Matcher::*;
81 match self.matcher {
82 Empty => Some((0, 0)),
83 Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)),
84 Memmem(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
85 AC { ref ac, .. } => {
86 ac.find(haystack).map(|m| (m.start(), m.end()))
87 }
88 Packed { ref s, .. } => {
89 s.find(haystack).map(|m| (m.start(), m.end()))
90 }
91 }
92 }
93
94 /// Like find, except matches must start at index `0`.
find_start(&self, haystack: &[u8]) -> Option<(usize, usize)>95 pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> {
96 for lit in self.iter() {
97 if lit.len() > haystack.len() {
98 continue;
99 }
100 if lit == &haystack[0..lit.len()] {
101 return Some((0, lit.len()));
102 }
103 }
104 None
105 }
106
107 /// Like find, except matches must end at index `haystack.len()`.
find_end(&self, haystack: &[u8]) -> Option<(usize, usize)>108 pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> {
109 for lit in self.iter() {
110 if lit.len() > haystack.len() {
111 continue;
112 }
113 if lit == &haystack[haystack.len() - lit.len()..] {
114 return Some((haystack.len() - lit.len(), haystack.len()));
115 }
116 }
117 None
118 }
119
120 /// Returns an iterator over all literals to be matched.
iter(&self) -> LiteralIter<'_>121 pub fn iter(&self) -> LiteralIter<'_> {
122 match self.matcher {
123 Matcher::Empty => LiteralIter::Empty,
124 Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense),
125 Matcher::Memmem(ref s) => LiteralIter::Single(&s.finder.needle()),
126 Matcher::AC { ref lits, .. } => LiteralIter::AC(lits),
127 Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits),
128 }
129 }
130
131 /// Returns a matcher for the longest common prefix of this matcher.
lcp(&self) -> &Memmem132 pub fn lcp(&self) -> &Memmem {
133 &self.lcp
134 }
135
136 /// Returns a matcher for the longest common suffix of this matcher.
lcs(&self) -> &Memmem137 pub fn lcs(&self) -> &Memmem {
138 &self.lcs
139 }
140
141 /// Returns true iff this prefix is empty.
is_empty(&self) -> bool142 pub fn is_empty(&self) -> bool {
143 self.len() == 0
144 }
145
146 /// Returns the number of prefixes in this machine.
len(&self) -> usize147 pub fn len(&self) -> usize {
148 use self::Matcher::*;
149 match self.matcher {
150 Empty => 0,
151 Bytes(ref sset) => sset.dense.len(),
152 Memmem(_) => 1,
153 AC { ref ac, .. } => ac.pattern_count(),
154 Packed { ref lits, .. } => lits.len(),
155 }
156 }
157
158 /// Return the approximate heap usage of literals in bytes.
approximate_size(&self) -> usize159 pub fn approximate_size(&self) -> usize {
160 use self::Matcher::*;
161 match self.matcher {
162 Empty => 0,
163 Bytes(ref sset) => sset.approximate_size(),
164 Memmem(ref single) => single.approximate_size(),
165 AC { ref ac, .. } => ac.heap_bytes(),
166 Packed { ref s, .. } => s.heap_bytes(),
167 }
168 }
169 }
170
171 impl Matcher {
prefixes(lits: &Literals) -> Self172 fn prefixes(lits: &Literals) -> Self {
173 let sset = SingleByteSet::prefixes(lits);
174 Matcher::new(lits, sset)
175 }
176
suffixes(lits: &Literals) -> Self177 fn suffixes(lits: &Literals) -> Self {
178 let sset = SingleByteSet::suffixes(lits);
179 Matcher::new(lits, sset)
180 }
181
new(lits: &Literals, sset: SingleByteSet) -> Self182 fn new(lits: &Literals, sset: SingleByteSet) -> Self {
183 if lits.literals().is_empty() {
184 return Matcher::Empty;
185 }
186 if sset.dense.len() >= 26 {
187 // Avoid trying to match a large number of single bytes.
188 // This is *very* sensitive to a frequency analysis comparison
189 // between the bytes in sset and the composition of the haystack.
190 // No matter the size of sset, if its members all are rare in the
191 // haystack, then it'd be worth using it. How to tune this... IDK.
192 // ---AG
193 return Matcher::Empty;
194 }
195 if sset.complete {
196 return Matcher::Bytes(sset);
197 }
198 if lits.literals().len() == 1 {
199 return Matcher::Memmem(Memmem::new(&lits.literals()[0]));
200 }
201
202 let pats = lits.literals().to_owned();
203 let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii;
204 if lits.literals().len() <= 100 && !is_aho_corasick_fast {
205 let mut builder = packed::Config::new()
206 .match_kind(packed::MatchKind::LeftmostFirst)
207 .builder();
208 if let Some(s) = builder.extend(&pats).build() {
209 return Matcher::Packed { s, lits: pats };
210 }
211 }
212 let ac = AhoCorasickBuilder::new()
213 .match_kind(aho_corasick::MatchKind::LeftmostFirst)
214 .dfa(true)
215 .build_with_size::<u32, _, _>(&pats)
216 .unwrap();
217 Matcher::AC { ac, lits: pats }
218 }
219 }
220
221 #[derive(Debug)]
222 pub enum LiteralIter<'a> {
223 Empty,
224 Bytes(&'a [u8]),
225 Single(&'a [u8]),
226 AC(&'a [Literal]),
227 Packed(&'a [Literal]),
228 }
229
230 impl<'a> Iterator for LiteralIter<'a> {
231 type Item = &'a [u8];
232
next(&mut self) -> Option<Self::Item>233 fn next(&mut self) -> Option<Self::Item> {
234 match *self {
235 LiteralIter::Empty => None,
236 LiteralIter::Bytes(ref mut many) => {
237 if many.is_empty() {
238 None
239 } else {
240 let next = &many[0..1];
241 *many = &many[1..];
242 Some(next)
243 }
244 }
245 LiteralIter::Single(ref mut one) => {
246 if one.is_empty() {
247 None
248 } else {
249 let next = &one[..];
250 *one = &[];
251 Some(next)
252 }
253 }
254 LiteralIter::AC(ref mut lits) => {
255 if lits.is_empty() {
256 None
257 } else {
258 let next = &lits[0];
259 *lits = &lits[1..];
260 Some(&**next)
261 }
262 }
263 LiteralIter::Packed(ref mut lits) => {
264 if lits.is_empty() {
265 None
266 } else {
267 let next = &lits[0];
268 *lits = &lits[1..];
269 Some(&**next)
270 }
271 }
272 }
273 }
274 }
275
276 #[derive(Clone, Debug)]
277 struct SingleByteSet {
278 sparse: Vec<bool>,
279 dense: Vec<u8>,
280 complete: bool,
281 all_ascii: bool,
282 }
283
284 impl SingleByteSet {
new() -> SingleByteSet285 fn new() -> SingleByteSet {
286 SingleByteSet {
287 sparse: vec![false; 256],
288 dense: vec![],
289 complete: true,
290 all_ascii: true,
291 }
292 }
293
prefixes(lits: &Literals) -> SingleByteSet294 fn prefixes(lits: &Literals) -> SingleByteSet {
295 let mut sset = SingleByteSet::new();
296 for lit in lits.literals() {
297 sset.complete = sset.complete && lit.len() == 1;
298 if let Some(&b) = lit.get(0) {
299 if !sset.sparse[b as usize] {
300 if b > 0x7F {
301 sset.all_ascii = false;
302 }
303 sset.dense.push(b);
304 sset.sparse[b as usize] = true;
305 }
306 }
307 }
308 sset
309 }
310
suffixes(lits: &Literals) -> SingleByteSet311 fn suffixes(lits: &Literals) -> SingleByteSet {
312 let mut sset = SingleByteSet::new();
313 for lit in lits.literals() {
314 sset.complete = sset.complete && lit.len() == 1;
315 if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) {
316 if !sset.sparse[b as usize] {
317 if b > 0x7F {
318 sset.all_ascii = false;
319 }
320 sset.dense.push(b);
321 sset.sparse[b as usize] = true;
322 }
323 }
324 }
325 sset
326 }
327
328 /// Faster find that special cases certain sizes to use memchr.
329 #[cfg_attr(feature = "perf-inline", inline(always))]
find(&self, text: &[u8]) -> Option<usize>330 fn find(&self, text: &[u8]) -> Option<usize> {
331 match self.dense.len() {
332 0 => None,
333 1 => memchr(self.dense[0], text),
334 2 => memchr2(self.dense[0], self.dense[1], text),
335 3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text),
336 _ => self._find(text),
337 }
338 }
339
340 /// Generic find that works on any sized set.
_find(&self, haystack: &[u8]) -> Option<usize>341 fn _find(&self, haystack: &[u8]) -> Option<usize> {
342 for (i, &b) in haystack.iter().enumerate() {
343 if self.sparse[b as usize] {
344 return Some(i);
345 }
346 }
347 None
348 }
349
approximate_size(&self) -> usize350 fn approximate_size(&self) -> usize {
351 (self.dense.len() * mem::size_of::<u8>())
352 + (self.sparse.len() * mem::size_of::<bool>())
353 }
354 }
355
356 /// A simple wrapper around the memchr crate's memmem implementation.
357 ///
358 /// The API this exposes mirrors the API of previous substring searchers that
359 /// this supplanted.
360 #[derive(Clone, Debug)]
361 pub struct Memmem {
362 finder: memmem::Finder<'static>,
363 char_len: usize,
364 }
365
366 impl Memmem {
new(pat: &[u8]) -> Memmem367 fn new(pat: &[u8]) -> Memmem {
368 Memmem {
369 finder: memmem::Finder::new(pat).into_owned(),
370 char_len: char_len_lossy(pat),
371 }
372 }
373
374 #[cfg_attr(feature = "perf-inline", inline(always))]
find(&self, haystack: &[u8]) -> Option<usize>375 pub fn find(&self, haystack: &[u8]) -> Option<usize> {
376 self.finder.find(haystack)
377 }
378
379 #[cfg_attr(feature = "perf-inline", inline(always))]
is_suffix(&self, text: &[u8]) -> bool380 pub fn is_suffix(&self, text: &[u8]) -> bool {
381 if text.len() < self.len() {
382 return false;
383 }
384 &text[text.len() - self.len()..] == self.finder.needle()
385 }
386
len(&self) -> usize387 pub fn len(&self) -> usize {
388 self.finder.needle().len()
389 }
390
char_len(&self) -> usize391 pub fn char_len(&self) -> usize {
392 self.char_len
393 }
394
approximate_size(&self) -> usize395 fn approximate_size(&self) -> usize {
396 self.finder.needle().len() * mem::size_of::<u8>()
397 }
398 }
399
char_len_lossy(bytes: &[u8]) -> usize400 fn char_len_lossy(bytes: &[u8]) -> usize {
401 String::from_utf8_lossy(bytes).chars().count()
402 }
403