// See the README in this directory for an explanation of the Teddy algorithm. use std::cmp; use std::collections::BTreeMap; use std::fmt; use crate::packed::pattern::{PatternID, Patterns}; use crate::packed::teddy::Teddy; /// A builder for constructing a Teddy matcher. /// /// The builder primarily permits fine grained configuration of the Teddy /// matcher. Most options are made only available for testing/benchmarking /// purposes. In reality, options are automatically determined by the nature /// and number of patterns given to the builder. #[derive(Clone, Debug)] pub struct Builder { /// When none, this is automatically determined. Otherwise, `false` means /// slim Teddy is used (8 buckets) and `true` means fat Teddy is used /// (16 buckets). Fat Teddy requires AVX2, so if that CPU feature isn't /// available and Fat Teddy was requested, no matcher will be built. fat: Option, /// When none, this is automatically determined. Otherwise, `false` means /// that 128-bit vectors will be used (up to SSSE3 instructions) where as /// `true` means that 256-bit vectors will be used. As with `fat`, if /// 256-bit vectors are requested and they aren't available, then a /// searcher will not be built. avx: Option, } impl Default for Builder { fn default() -> Builder { Builder::new() } } impl Builder { /// Create a new builder for configuring a Teddy matcher. pub fn new() -> Builder { Builder { fat: None, avx: None } } /// Build a matcher for the set of patterns given. If a matcher could not /// be built, then `None` is returned. /// /// Generally, a matcher isn't built if the necessary CPU features aren't /// available, an unsupported target or if the searcher is believed to be /// slower than standard techniques (i.e., if there are too many literals). pub fn build(&self, patterns: &Patterns) -> Option { self.build_imp(patterns) } /// Require the use of Fat (true) or Slim (false) Teddy. Fat Teddy uses /// 16 buckets where as Slim Teddy uses 8 buckets. More buckets are useful /// for a larger set of literals. /// /// `None` is the default, which results in an automatic selection based /// on the number of literals and available CPU features. pub fn fat(&mut self, yes: Option) -> &mut Builder { self.fat = yes; self } /// Request the use of 256-bit vectors (true) or 128-bit vectors (false). /// Generally, a larger vector size is better since it either permits /// matching more patterns or matching more bytes in the haystack at once. /// /// `None` is the default, which results in an automatic selection based on /// the number of literals and available CPU features. pub fn avx(&mut self, yes: Option) -> &mut Builder { self.avx = yes; self } fn build_imp(&self, patterns: &Patterns) -> Option { use crate::packed::teddy::runtime; // Most of the logic here is just about selecting the optimal settings, // or perhaps even rejecting construction altogether. The choices // we have are: fat (avx only) or not, ssse3 or avx2, and how many // patterns we allow ourselves to search. Additionally, for testing // and benchmarking, we permit callers to try to "force" a setting, // and if the setting isn't allowed (e.g., forcing AVX when AVX isn't // available), then we bail and return nothing. if patterns.len() > 64 { return None; } let has_ssse3 = is_x86_feature_detected!("ssse3"); let has_avx = is_x86_feature_detected!("avx2"); let avx = if self.avx == Some(true) { if !has_avx { return None; } true } else if self.avx == Some(false) { if !has_ssse3 { return None; } false } else if !has_ssse3 && !has_avx { return None; } else { has_avx }; let fat = match self.fat { None => avx && patterns.len() > 32, Some(false) => false, Some(true) if !avx => return None, Some(true) => true, }; let mut compiler = Compiler::new(patterns, fat); compiler.compile(); let Compiler { buckets, masks, .. } = compiler; // SAFETY: It is required that the builder only produce Teddy matchers // that are allowed to run on the current CPU, since we later assume // that the presence of (for example) TeddySlim1Mask256 means it is // safe to call functions marked with the `avx2` target feature. match (masks.len(), avx, fat) { (1, false, _) => Some(Teddy { buckets, max_pattern_id: patterns.max_pattern_id(), exec: runtime::Exec::TeddySlim1Mask128( runtime::TeddySlim1Mask128 { mask1: runtime::Mask128::new(masks[0]), }, ), }), (1, true, false) => Some(Teddy { buckets, max_pattern_id: patterns.max_pattern_id(), exec: runtime::Exec::TeddySlim1Mask256( runtime::TeddySlim1Mask256 { mask1: runtime::Mask256::new(masks[0]), }, ), }), (1, true, true) => Some(Teddy { buckets, max_pattern_id: patterns.max_pattern_id(), exec: runtime::Exec::TeddyFat1Mask256( runtime::TeddyFat1Mask256 { mask1: runtime::Mask256::new(masks[0]), }, ), }), (2, false, _) => Some(Teddy { buckets, max_pattern_id: patterns.max_pattern_id(), exec: runtime::Exec::TeddySlim2Mask128( runtime::TeddySlim2Mask128 { mask1: runtime::Mask128::new(masks[0]), mask2: runtime::Mask128::new(masks[1]), }, ), }), (2, true, false) => Some(Teddy { buckets, max_pattern_id: patterns.max_pattern_id(), exec: runtime::Exec::TeddySlim2Mask256( runtime::TeddySlim2Mask256 { mask1: runtime::Mask256::new(masks[0]), mask2: runtime::Mask256::new(masks[1]), }, ), }), (2, true, true) => Some(Teddy { buckets, max_pattern_id: patterns.max_pattern_id(), exec: runtime::Exec::TeddyFat2Mask256( runtime::TeddyFat2Mask256 { mask1: runtime::Mask256::new(masks[0]), mask2: runtime::Mask256::new(masks[1]), }, ), }), (3, false, _) => Some(Teddy { buckets, max_pattern_id: patterns.max_pattern_id(), exec: runtime::Exec::TeddySlim3Mask128( runtime::TeddySlim3Mask128 { mask1: runtime::Mask128::new(masks[0]), mask2: runtime::Mask128::new(masks[1]), mask3: runtime::Mask128::new(masks[2]), }, ), }), (3, true, false) => Some(Teddy { buckets, max_pattern_id: patterns.max_pattern_id(), exec: runtime::Exec::TeddySlim3Mask256( runtime::TeddySlim3Mask256 { mask1: runtime::Mask256::new(masks[0]), mask2: runtime::Mask256::new(masks[1]), mask3: runtime::Mask256::new(masks[2]), }, ), }), (3, true, true) => Some(Teddy { buckets, max_pattern_id: patterns.max_pattern_id(), exec: runtime::Exec::TeddyFat3Mask256( runtime::TeddyFat3Mask256 { mask1: runtime::Mask256::new(masks[0]), mask2: runtime::Mask256::new(masks[1]), mask3: runtime::Mask256::new(masks[2]), }, ), }), _ => unreachable!(), } } } /// A compiler is in charge of allocating patterns into buckets and generating /// the masks necessary for searching. #[derive(Clone)] struct Compiler<'p> { patterns: &'p Patterns, buckets: Vec>, masks: Vec, } impl<'p> Compiler<'p> { /// Create a new Teddy compiler for the given patterns. If `fat` is true, /// then 16 buckets will be used instead of 8. /// /// This panics if any of the patterns given are empty. fn new(patterns: &'p Patterns, fat: bool) -> Compiler<'p> { let mask_len = cmp::min(3, patterns.minimum_len()); assert!(1 <= mask_len && mask_len <= 3); Compiler { patterns, buckets: vec![vec![]; if fat { 16 } else { 8 }], masks: vec![Mask::default(); mask_len], } } /// Compile the patterns in this compiler into buckets and masks. fn compile(&mut self) { let mut lonibble_to_bucket: BTreeMap, usize> = BTreeMap::new(); for (id, pattern) in self.patterns.iter() { // We try to be slightly clever in how we assign patterns into // buckets. Generally speaking, we want patterns with the same // prefix to be in the same bucket, since it minimizes the amount // of time we spend churning through buckets in the verification // step. // // So we could assign patterns with the same N-prefix (where N // is the size of the mask, which is one of {1, 2, 3}) to the // same bucket. However, case insensitive searches are fairly // common, so we'd for example, ideally want to treat `abc` and // `ABC` as if they shared the same prefix. ASCII has the nice // property that the lower 4 bits of A and a are the same, so we // therefore group patterns with the same low-nybbe-N-prefix into // the same bucket. // // MOREOVER, this is actually necessary for correctness! In // particular, by grouping patterns with the same prefix into the // same bucket, we ensure that we preserve correct leftmost-first // and leftmost-longest match semantics. In addition to the fact // that `patterns.iter()` iterates in the correct order, this // guarantees that all possible ambiguous matches will occur in // the same bucket. The verification routine could be adjusted to // support correct leftmost match semantics regardless of bucket // allocation, but that results in a performance hit. It's much // nicer to be able to just stop as soon as a match is found. let lonybs = pattern.low_nybbles(self.masks.len()); if let Some(&bucket) = lonibble_to_bucket.get(&lonybs) { self.buckets[bucket].push(id); } else { // N.B. We assign buckets in reverse because it shouldn't have // any influence on performance, but it does make it harder to // get leftmost match semantics accidentally correct. let bucket = (self.buckets.len() - 1) - (id as usize % self.buckets.len()); self.buckets[bucket].push(id); lonibble_to_bucket.insert(lonybs, bucket); } } for (bucket_index, bucket) in self.buckets.iter().enumerate() { for &pat_id in bucket { let pat = self.patterns.get(pat_id); for (i, mask) in self.masks.iter_mut().enumerate() { if self.buckets.len() == 8 { mask.add_slim(bucket_index as u8, pat.bytes()[i]); } else { mask.add_fat(bucket_index as u8, pat.bytes()[i]); } } } } } } impl<'p> fmt::Debug for Compiler<'p> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut buckets = vec![vec![]; self.buckets.len()]; for (i, bucket) in self.buckets.iter().enumerate() { for &patid in bucket { buckets[i].push(self.patterns.get(patid)); } } f.debug_struct("Compiler") .field("buckets", &buckets) .field("masks", &self.masks) .finish() } } /// Mask represents the low and high nybble masks that will be used during /// search. Each mask is 32 bytes wide, although only the first 16 bytes are /// used for the SSSE3 runtime. /// /// Each byte in the mask corresponds to a 8-bit bitset, where bit `i` is set /// if and only if the corresponding nybble is in the ith bucket. The index of /// the byte (0-15, inclusive) corresponds to the nybble. /// /// Each mask is used as the target of a shuffle, where the indices for the /// shuffle are taken from the haystack. AND'ing the shuffles for both the /// low and high masks together also results in 8-bit bitsets, but where bit /// `i` is set if and only if the correspond *byte* is in the ith bucket. /// /// During compilation, masks are just arrays. But during search, these masks /// are represented as 128-bit or 256-bit vectors. /// /// (See the README is this directory for more details.) #[derive(Clone, Copy, Default)] pub struct Mask { lo: [u8; 32], hi: [u8; 32], } impl Mask { /// Update this mask by adding the given byte to the given bucket. The /// given bucket must be in the range 0-7. /// /// This is for "slim" Teddy, where there are only 8 buckets. fn add_slim(&mut self, bucket: u8, byte: u8) { assert!(bucket < 8); let byte_lo = (byte & 0xF) as usize; let byte_hi = ((byte >> 4) & 0xF) as usize; // When using 256-bit vectors, we need to set this bucket assignment in // the low and high 128-bit portions of the mask. This allows us to // process 32 bytes at a time. Namely, AVX2 shuffles operate on each // of the 128-bit lanes, rather than the full 256-bit vector at once. self.lo[byte_lo] |= 1 << bucket; self.lo[byte_lo + 16] |= 1 << bucket; self.hi[byte_hi] |= 1 << bucket; self.hi[byte_hi + 16] |= 1 << bucket; } /// Update this mask by adding the given byte to the given bucket. The /// given bucket must be in the range 0-15. /// /// This is for "fat" Teddy, where there are 16 buckets. fn add_fat(&mut self, bucket: u8, byte: u8) { assert!(bucket < 16); let byte_lo = (byte & 0xF) as usize; let byte_hi = ((byte >> 4) & 0xF) as usize; // Unlike slim teddy, fat teddy only works with AVX2. For fat teddy, // the high 128 bits of our mask correspond to buckets 8-15, while the // low 128 bits correspond to buckets 0-7. if bucket < 8 { self.lo[byte_lo] |= 1 << bucket; self.hi[byte_hi] |= 1 << bucket; } else { self.lo[byte_lo + 16] |= 1 << (bucket % 8); self.hi[byte_hi + 16] |= 1 << (bucket % 8); } } /// Return the low 128 bits of the low-nybble mask. pub fn lo128(&self) -> [u8; 16] { let mut tmp = [0; 16]; tmp.copy_from_slice(&self.lo[..16]); tmp } /// Return the full low-nybble mask. pub fn lo256(&self) -> [u8; 32] { self.lo } /// Return the low 128 bits of the high-nybble mask. pub fn hi128(&self) -> [u8; 16] { let mut tmp = [0; 16]; tmp.copy_from_slice(&self.hi[..16]); tmp } /// Return the full high-nybble mask. pub fn hi256(&self) -> [u8; 32] { self.hi } } impl fmt::Debug for Mask { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let (mut parts_lo, mut parts_hi) = (vec![], vec![]); for i in 0..32 { parts_lo.push(format!("{:02}: {:08b}", i, self.lo[i])); parts_hi.push(format!("{:02}: {:08b}", i, self.hi[i])); } f.debug_struct("Mask") .field("lo", &parts_lo) .field("hi", &parts_hi) .finish() } }