1 use std::mem; 2 3 use packed::pattern::{PatternID, Patterns}; 4 use Match; 5 6 /// The type of the rolling hash used in the Rabin-Karp algorithm. 7 type Hash = usize; 8 9 /// The number of buckets to store our patterns in. We don't want this to be 10 /// too big in order to avoid wasting memory, but we don't want it to be too 11 /// small either to avoid spending too much time confirming literals. 12 /// 13 /// The number of buckets MUST be a power of two. Otherwise, determining the 14 /// bucket from a hash will slow down the code considerably. Using a power 15 /// of two means `hash % NUM_BUCKETS` can compile down to a simple `and` 16 /// instruction. 17 const NUM_BUCKETS: usize = 64; 18 19 /// An implementation of the Rabin-Karp algorithm. The main idea of this 20 /// algorithm is to maintain a rolling hash as it moves through the input, and 21 /// then check whether that hash corresponds to the same hash for any of the 22 /// patterns we're looking for. 23 /// 24 /// A draw back of naively scaling Rabin-Karp to multiple patterns is that 25 /// it requires all of the patterns to be the same length, which in turn 26 /// corresponds to the number of bytes to hash. We adapt this to work for 27 /// multiple patterns of varying size by fixing the number of bytes to hash 28 /// to be the length of the smallest pattern. We also split the patterns into 29 /// several buckets to hopefully make the confirmation step faster. 30 /// 31 /// Wikipedia has a decent explanation, if a bit heavy on the theory: 32 /// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm 33 /// 34 /// But ESMAJ provides something a bit more concrete: 35 /// http://www-igm.univ-mlv.fr/~lecroq/string/node5.html 36 #[derive(Clone, Debug)] 37 pub struct RabinKarp { 38 /// The order of patterns in each bucket is significant. Namely, they are 39 /// arranged such that the first one to match is the correct match. This 40 /// may not necessarily correspond to the order provided by the caller. 41 /// For example, if leftmost-longest semantics are used, then the patterns 42 /// are sorted by their length in descending order. If leftmost-first 43 /// semantics are used, then the patterns are sorted by their pattern ID 44 /// in ascending order (which corresponds to the caller's order). 45 buckets: Vec<Vec<(Hash, PatternID)>>, 46 /// The length of the hashing window. Generally, this corresponds to the 47 /// length of the smallest pattern. 48 hash_len: usize, 49 /// The factor to subtract out of a hash before updating it with a new 50 /// byte. 51 hash_2pow: usize, 52 /// The maximum identifier of a pattern. This is used as a sanity check 53 /// to ensure that the patterns provided by the caller are the same as 54 /// the patterns that were used to compile the matcher. This sanity check 55 /// possibly permits safely eliminating bounds checks regardless of what 56 /// patterns are provided by the caller. 57 /// 58 /// (Currently, we don't use this to elide bounds checks since it doesn't 59 /// result in a measurable performance improvement, but we do use it for 60 /// better failure modes.) 61 max_pattern_id: PatternID, 62 } 63 64 impl RabinKarp { 65 /// Compile a new Rabin-Karp matcher from the patterns given. 66 /// 67 /// This panics if any of the patterns in the collection are empty, or if 68 /// the collection is itself empty. new(patterns: &Patterns) -> RabinKarp69 pub fn new(patterns: &Patterns) -> RabinKarp { 70 assert!(patterns.len() >= 1); 71 let hash_len = patterns.minimum_len(); 72 assert!(hash_len >= 1); 73 74 let mut hash_2pow = 1usize; 75 for _ in 1..hash_len { 76 hash_2pow = hash_2pow.wrapping_shl(1); 77 } 78 79 let mut rk = RabinKarp { 80 buckets: vec![vec![]; NUM_BUCKETS], 81 hash_len, 82 hash_2pow, 83 max_pattern_id: patterns.max_pattern_id(), 84 }; 85 for (id, pat) in patterns.iter() { 86 let hash = rk.hash(&pat.bytes()[..rk.hash_len]); 87 let bucket = hash % NUM_BUCKETS; 88 rk.buckets[bucket].push((hash, id)); 89 } 90 rk 91 } 92 93 /// Return the first matching pattern in the given haystack, begining the 94 /// search at `at`. find_at( &self, patterns: &Patterns, haystack: &[u8], mut at: usize, ) -> Option<Match>95 pub fn find_at( 96 &self, 97 patterns: &Patterns, 98 haystack: &[u8], 99 mut at: usize, 100 ) -> Option<Match> { 101 assert_eq!(NUM_BUCKETS, self.buckets.len()); 102 assert_eq!( 103 self.max_pattern_id, 104 patterns.max_pattern_id(), 105 "Rabin-Karp must be called with same patterns it was built with", 106 ); 107 108 if at + self.hash_len > haystack.len() { 109 return None; 110 } 111 let mut hash = self.hash(&haystack[at..at + self.hash_len]); 112 loop { 113 let bucket = &self.buckets[hash % NUM_BUCKETS]; 114 for &(phash, pid) in bucket { 115 if phash == hash { 116 if let Some(c) = self.verify(patterns, pid, haystack, at) { 117 return Some(c); 118 } 119 } 120 } 121 if at + self.hash_len >= haystack.len() { 122 return None; 123 } 124 hash = self.update_hash( 125 hash, 126 haystack[at], 127 haystack[at + self.hash_len], 128 ); 129 at += 1; 130 } 131 } 132 133 /// Returns the approximate total amount of heap used by this searcher, in 134 /// units of bytes. heap_bytes(&self) -> usize135 pub fn heap_bytes(&self) -> usize { 136 let num_patterns = self.max_pattern_id as usize + 1; 137 self.buckets.len() * mem::size_of::<Vec<(Hash, PatternID)>>() 138 + num_patterns * mem::size_of::<(Hash, PatternID)>() 139 } 140 141 /// Verify whether the pattern with the given id matches at 142 /// `haystack[at..]`. 143 /// 144 /// We tag this function as `cold` because it helps improve codegen. 145 /// Intuitively, it would seem like inlining it would be better. However, 146 /// the only time this is called and a match is not found is when there 147 /// there is a hash collision, or when a prefix of a pattern matches but 148 /// the entire pattern doesn't match. This is hopefully fairly rare, and 149 /// if it does occur a lot, it's going to be slow no matter what we do. 150 #[cold] verify( &self, patterns: &Patterns, id: PatternID, haystack: &[u8], at: usize, ) -> Option<Match>151 fn verify( 152 &self, 153 patterns: &Patterns, 154 id: PatternID, 155 haystack: &[u8], 156 at: usize, 157 ) -> Option<Match> { 158 let pat = patterns.get(id); 159 if pat.is_prefix(&haystack[at..]) { 160 Some(Match::from_span(id as usize, at, at + pat.len())) 161 } else { 162 None 163 } 164 } 165 166 /// Hash the given bytes. hash(&self, bytes: &[u8]) -> Hash167 fn hash(&self, bytes: &[u8]) -> Hash { 168 assert_eq!(self.hash_len, bytes.len()); 169 170 let mut hash = 0usize; 171 for &b in bytes { 172 hash = hash.wrapping_shl(1).wrapping_add(b as usize); 173 } 174 hash 175 } 176 177 /// Update the hash given based on removing `old_byte` at the beginning 178 /// of some byte string, and appending `new_byte` to the end of that same 179 /// byte string. update_hash(&self, prev: Hash, old_byte: u8, new_byte: u8) -> Hash180 fn update_hash(&self, prev: Hash, old_byte: u8, new_byte: u8) -> Hash { 181 prev.wrapping_sub((old_byte as usize).wrapping_mul(self.hash_2pow)) 182 .wrapping_shl(1) 183 .wrapping_add(new_byte as usize) 184 } 185 } 186