• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use std::cmp;
2 use std::fmt;
3 use std::mem;
4 use std::u16;
5 use std::usize;
6 
7 use crate::packed::api::MatchKind;
8 
9 /// The type used for representing a pattern identifier.
10 ///
11 /// We don't use `usize` here because our packed searchers don't scale to
12 /// huge numbers of patterns, so we keep things a bit smaller.
13 pub type PatternID = u16;
14 
15 /// A non-empty collection of non-empty patterns to search for.
16 ///
17 /// This collection of patterns is what is passed around to both execute
18 /// searches and to construct the searchers themselves. Namely, this permits
19 /// searches to avoid copying all of the patterns, and allows us to keep only
20 /// one copy throughout all packed searchers.
21 ///
22 /// Note that this collection is not a set. The same pattern can appear more
23 /// than once.
24 #[derive(Clone, Debug)]
25 pub struct Patterns {
26     /// The match semantics supported by this collection of patterns.
27     ///
28     /// The match semantics determines the order of the iterator over patterns.
29     /// For leftmost-first, patterns are provided in the same order as were
30     /// provided by the caller. For leftmost-longest, patterns are provided in
31     /// descending order of length, with ties broken by the order in which they
32     /// were provided by the caller.
33     kind: MatchKind,
34     /// The collection of patterns, indexed by their identifier.
35     by_id: Vec<Vec<u8>>,
36     /// The order of patterns defined for iteration, given by pattern
37     /// identifiers. The order of `by_id` and `order` is always the same for
38     /// leftmost-first semantics, but may be different for leftmost-longest
39     /// semantics.
40     order: Vec<PatternID>,
41     /// The length of the smallest pattern, in bytes.
42     minimum_len: usize,
43     /// The largest pattern identifier. This should always be equivalent to
44     /// the number of patterns minus one in this collection.
45     max_pattern_id: PatternID,
46     /// The total number of pattern bytes across the entire collection. This
47     /// is used for reporting total heap usage in constant time.
48     total_pattern_bytes: usize,
49 }
50 
51 impl Patterns {
52     /// Create a new collection of patterns for the given match semantics. The
53     /// ID of each pattern is the index of the pattern at which it occurs in
54     /// the `by_id` slice.
55     ///
56     /// If any of the patterns in the slice given are empty, then this panics.
57     /// Similarly, if the number of patterns given is zero, then this also
58     /// panics.
new() -> Patterns59     pub fn new() -> Patterns {
60         Patterns {
61             kind: MatchKind::default(),
62             by_id: vec![],
63             order: vec![],
64             minimum_len: usize::MAX,
65             max_pattern_id: 0,
66             total_pattern_bytes: 0,
67         }
68     }
69 
70     /// Add a pattern to this collection.
71     ///
72     /// This panics if the pattern given is empty.
add(&mut self, bytes: &[u8])73     pub fn add(&mut self, bytes: &[u8]) {
74         assert!(!bytes.is_empty());
75         assert!(self.by_id.len() <= u16::MAX as usize);
76 
77         let id = self.by_id.len() as u16;
78         self.max_pattern_id = id;
79         self.order.push(id);
80         self.by_id.push(bytes.to_vec());
81         self.minimum_len = cmp::min(self.minimum_len, bytes.len());
82         self.total_pattern_bytes += bytes.len();
83     }
84 
85     /// Set the match kind semantics for this collection of patterns.
86     ///
87     /// If the kind is not set, then the default is leftmost-first.
set_match_kind(&mut self, kind: MatchKind)88     pub fn set_match_kind(&mut self, kind: MatchKind) {
89         match kind {
90             MatchKind::LeftmostFirst => {
91                 self.order.sort();
92             }
93             MatchKind::LeftmostLongest => {
94                 let (order, by_id) = (&mut self.order, &mut self.by_id);
95                 order.sort_by(|&id1, &id2| {
96                     by_id[id1 as usize]
97                         .len()
98                         .cmp(&by_id[id2 as usize].len())
99                         .reverse()
100                 });
101             }
102             MatchKind::__Nonexhaustive => unreachable!(),
103         }
104     }
105 
106     /// Return the number of patterns in this collection.
107     ///
108     /// This is guaranteed to be greater than zero.
len(&self) -> usize109     pub fn len(&self) -> usize {
110         self.by_id.len()
111     }
112 
113     /// Returns true if and only if this collection of patterns is empty.
is_empty(&self) -> bool114     pub fn is_empty(&self) -> bool {
115         self.len() == 0
116     }
117 
118     /// Returns the approximate total amount of heap used by these patterns, in
119     /// units of bytes.
heap_bytes(&self) -> usize120     pub fn heap_bytes(&self) -> usize {
121         self.order.len() * mem::size_of::<PatternID>()
122             + self.by_id.len() * mem::size_of::<Vec<u8>>()
123             + self.total_pattern_bytes
124     }
125 
126     /// Clears all heap memory associated with this collection of patterns and
127     /// resets all state such that it is a valid empty collection.
reset(&mut self)128     pub fn reset(&mut self) {
129         self.kind = MatchKind::default();
130         self.by_id.clear();
131         self.order.clear();
132         self.minimum_len = usize::MAX;
133         self.max_pattern_id = 0;
134     }
135 
136     /// Return the maximum pattern identifier in this collection. This can be
137     /// useful in searchers for ensuring that the collection of patterns they
138     /// are provided at search time and at build time have the same size.
max_pattern_id(&self) -> PatternID139     pub fn max_pattern_id(&self) -> PatternID {
140         assert_eq!((self.max_pattern_id + 1) as usize, self.len());
141         self.max_pattern_id
142     }
143 
144     /// Returns the length, in bytes, of the smallest pattern.
145     ///
146     /// This is guaranteed to be at least one.
minimum_len(&self) -> usize147     pub fn minimum_len(&self) -> usize {
148         self.minimum_len
149     }
150 
151     /// Returns the match semantics used by these patterns.
match_kind(&self) -> &MatchKind152     pub fn match_kind(&self) -> &MatchKind {
153         &self.kind
154     }
155 
156     /// Return the pattern with the given identifier. If such a pattern does
157     /// not exist, then this panics.
get(&self, id: PatternID) -> Pattern<'_>158     pub fn get(&self, id: PatternID) -> Pattern<'_> {
159         Pattern(&self.by_id[id as usize])
160     }
161 
162     /// Return the pattern with the given identifier without performing bounds
163     /// checks.
164     ///
165     /// # Safety
166     ///
167     /// Callers must ensure that a pattern with the given identifier exists
168     /// before using this method.
169     #[cfg(target_arch = "x86_64")]
get_unchecked(&self, id: PatternID) -> Pattern<'_>170     pub unsafe fn get_unchecked(&self, id: PatternID) -> Pattern<'_> {
171         Pattern(self.by_id.get_unchecked(id as usize))
172     }
173 
174     /// Return an iterator over all the patterns in this collection, in the
175     /// order in which they should be matched.
176     ///
177     /// Specifically, in a naive multi-pattern matcher, the following is
178     /// guaranteed to satisfy the match semantics of this collection of
179     /// patterns:
180     ///
181     /// ```ignore
182     /// for i in 0..haystack.len():
183     ///   for p in patterns.iter():
184     ///     if haystack[i..].starts_with(p.bytes()):
185     ///       return Match(p.id(), i, i + p.bytes().len())
186     /// ```
187     ///
188     /// Namely, among the patterns in a collection, if they are matched in
189     /// the order provided by this iterator, then the result is guaranteed
190     /// to satisfy the correct match semantics. (Either leftmost-first or
191     /// leftmost-longest.)
iter(&self) -> PatternIter<'_>192     pub fn iter(&self) -> PatternIter<'_> {
193         PatternIter { patterns: self, i: 0 }
194     }
195 }
196 
197 /// An iterator over the patterns in the `Patterns` collection.
198 ///
199 /// The order of the patterns provided by this iterator is consistent with the
200 /// match semantics of the originating collection of patterns.
201 ///
202 /// The lifetime `'p` corresponds to the lifetime of the collection of patterns
203 /// this is iterating over.
204 #[derive(Debug)]
205 pub struct PatternIter<'p> {
206     patterns: &'p Patterns,
207     i: usize,
208 }
209 
210 impl<'p> Iterator for PatternIter<'p> {
211     type Item = (PatternID, Pattern<'p>);
212 
next(&mut self) -> Option<(PatternID, Pattern<'p>)>213     fn next(&mut self) -> Option<(PatternID, Pattern<'p>)> {
214         if self.i >= self.patterns.len() {
215             return None;
216         }
217         let id = self.patterns.order[self.i];
218         let p = self.patterns.get(id);
219         self.i += 1;
220         Some((id, p))
221     }
222 }
223 
224 /// A pattern that is used in packed searching.
225 #[derive(Clone)]
226 pub struct Pattern<'a>(&'a [u8]);
227 
228 impl<'a> fmt::Debug for Pattern<'a> {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result229     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
230         f.debug_struct("Pattern")
231             .field("lit", &String::from_utf8_lossy(&self.0))
232             .finish()
233     }
234 }
235 
236 impl<'p> Pattern<'p> {
237     /// Returns the length of this pattern, in bytes.
len(&self) -> usize238     pub fn len(&self) -> usize {
239         self.0.len()
240     }
241 
242     /// Returns the bytes of this pattern.
bytes(&self) -> &[u8]243     pub fn bytes(&self) -> &[u8] {
244         &self.0
245     }
246 
247     /// Returns the first `len` low nybbles from this pattern. If this pattern
248     /// is shorter than `len`, then this panics.
249     #[cfg(target_arch = "x86_64")]
low_nybbles(&self, len: usize) -> Vec<u8>250     pub fn low_nybbles(&self, len: usize) -> Vec<u8> {
251         let mut nybs = vec![];
252         for &b in self.bytes().iter().take(len) {
253             nybs.push(b & 0xF);
254         }
255         nybs
256     }
257 
258     /// Returns true if this pattern is a prefix of the given bytes.
259     #[inline(always)]
is_prefix(&self, bytes: &[u8]) -> bool260     pub fn is_prefix(&self, bytes: &[u8]) -> bool {
261         self.len() <= bytes.len() && self.equals(&bytes[..self.len()])
262     }
263 
264     /// Returns true if and only if this pattern equals the given bytes.
265     #[inline(always)]
equals(&self, bytes: &[u8]) -> bool266     pub fn equals(&self, bytes: &[u8]) -> bool {
267         // Why not just use memcmp for this? Well, memcmp requires calling out
268         // to libc, and this routine is called in fairly hot code paths. Other
269         // than just calling out to libc, it also seems to result in worse
270         // codegen. By rolling our own memcpy in pure Rust, it seems to appear
271         // more friendly to the optimizer.
272         //
273         // This results in an improvement in just about every benchmark. Some
274         // smaller than others, but in some cases, up to 30% faster.
275 
276         if self.len() != bytes.len() {
277             return false;
278         }
279         if self.len() < 8 {
280             for (&b1, &b2) in self.bytes().iter().zip(bytes) {
281                 if b1 != b2 {
282                     return false;
283                 }
284             }
285             return true;
286         }
287         // When we have 8 or more bytes to compare, then proceed in chunks of
288         // 8 at a time using unaligned loads.
289         let mut p1 = self.bytes().as_ptr();
290         let mut p2 = bytes.as_ptr();
291         let p1end = self.bytes()[self.len() - 8..].as_ptr();
292         let p2end = bytes[bytes.len() - 8..].as_ptr();
293         // SAFETY: Via the conditional above, we know that both `p1` and `p2`
294         // have the same length, so `p1 < p1end` implies that `p2 < p2end`.
295         // Thus, derefencing both `p1` and `p2` in the loop below is safe.
296         //
297         // Moreover, we set `p1end` and `p2end` to be 8 bytes before the actual
298         // end of of `p1` and `p2`. Thus, the final dereference outside of the
299         // loop is guaranteed to be valid.
300         //
301         // Finally, we needn't worry about 64-bit alignment here, since we
302         // do unaligned loads.
303         unsafe {
304             while p1 < p1end {
305                 let v1 = (p1 as *const u64).read_unaligned();
306                 let v2 = (p2 as *const u64).read_unaligned();
307                 if v1 != v2 {
308                     return false;
309                 }
310                 p1 = p1.add(8);
311                 p2 = p2.add(8);
312             }
313             let v1 = (p1end as *const u64).read_unaligned();
314             let v2 = (p2end as *const u64).read_unaligned();
315             v1 == v2
316         }
317     }
318 }
319