1 use std::fmt; 2 3 /// A representation of byte oriented equivalence classes. 4 /// 5 /// This is used in an FSM to reduce the size of the transition table. This can 6 /// have a particularly large impact not only on the total size of an FSM, but 7 /// also on compile times. 8 #[derive(Clone, Copy)] 9 pub struct ByteClasses([u8; 256]); 10 11 impl ByteClasses { 12 /// Creates a new set of equivalence classes where all bytes are mapped to 13 /// the same class. empty() -> ByteClasses14 pub fn empty() -> ByteClasses { 15 ByteClasses([0; 256]) 16 } 17 18 /// Creates a new set of equivalence classes where each byte belongs to 19 /// its own equivalence class. singletons() -> ByteClasses20 pub fn singletons() -> ByteClasses { 21 let mut classes = ByteClasses::empty(); 22 for i in 0..256 { 23 classes.set(i as u8, i as u8); 24 } 25 classes 26 } 27 28 /// Set the equivalence class for the given byte. 29 #[inline] set(&mut self, byte: u8, class: u8)30 pub fn set(&mut self, byte: u8, class: u8) { 31 self.0[byte as usize] = class; 32 } 33 34 /// Get the equivalence class for the given byte. 35 #[inline] get(&self, byte: u8) -> u836 pub fn get(&self, byte: u8) -> u8 { 37 // SAFETY: This is safe because all dense transitions have 38 // exactly 256 elements, so all u8 values are valid indices. 39 self.0[byte as usize] 40 } 41 42 /// Return the total number of elements in the alphabet represented by 43 /// these equivalence classes. Equivalently, this returns the total number 44 /// of equivalence classes. 45 #[inline] alphabet_len(&self) -> usize46 pub fn alphabet_len(&self) -> usize { 47 self.0[255] as usize + 1 48 } 49 50 /// Returns true if and only if every byte in this class maps to its own 51 /// equivalence class. Equivalently, there are 256 equivalence classes 52 /// and each class contains exactly one byte. 53 #[inline] is_singleton(&self) -> bool54 pub fn is_singleton(&self) -> bool { 55 self.alphabet_len() == 256 56 } 57 58 /// Returns an iterator over a sequence of representative bytes from each 59 /// equivalence class. Namely, this yields exactly N items, where N is 60 /// equivalent to the number of equivalence classes. Each item is an 61 /// arbitrary byte drawn from each equivalence class. 62 /// 63 /// This is useful when one is determinizing an NFA and the NFA's alphabet 64 /// hasn't been converted to equivalence classes yet. Picking an arbitrary 65 /// byte from each equivalence class then permits a full exploration of 66 /// the NFA instead of using every possible byte value. representatives(&self) -> ByteClassRepresentatives67 pub fn representatives(&self) -> ByteClassRepresentatives { 68 ByteClassRepresentatives { classes: self, byte: 0, last_class: None } 69 } 70 71 /// Returns all of the bytes in the given equivalence class. 72 /// 73 /// The second element in the tuple indicates the number of elements in 74 /// the array. elements(&self, equiv: u8) -> ([u8; 256], usize)75 fn elements(&self, equiv: u8) -> ([u8; 256], usize) { 76 let (mut array, mut len) = ([0; 256], 0); 77 for b in 0..256 { 78 if self.get(b as u8) == equiv { 79 array[len] = b as u8; 80 len += 1; 81 } 82 } 83 (array, len) 84 } 85 } 86 87 impl fmt::Debug for ByteClasses { fmt(&self, f: &mut fmt::Formatter) -> fmt::Result88 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 89 if self.is_singleton() { 90 write!(f, "ByteClasses({{singletons}})") 91 } else { 92 write!(f, "ByteClasses(")?; 93 for equiv in 0..self.alphabet_len() { 94 let (members, len) = self.elements(equiv as u8); 95 write!(f, " {} => {:?}", equiv, &members[..len])?; 96 } 97 write!(f, ")") 98 } 99 } 100 } 101 102 /// An iterator over representative bytes from each equivalence class. 103 #[derive(Debug)] 104 pub struct ByteClassRepresentatives<'a> { 105 classes: &'a ByteClasses, 106 byte: usize, 107 last_class: Option<u8>, 108 } 109 110 impl<'a> Iterator for ByteClassRepresentatives<'a> { 111 type Item = u8; 112 next(&mut self) -> Option<u8>113 fn next(&mut self) -> Option<u8> { 114 while self.byte < 256 { 115 let byte = self.byte as u8; 116 let class = self.classes.get(byte); 117 self.byte += 1; 118 119 if self.last_class != Some(class) { 120 self.last_class = Some(class); 121 return Some(byte); 122 } 123 } 124 None 125 } 126 } 127 128 /// A byte class builder keeps track of an *approximation* of equivalence 129 /// classes of bytes during NFA construction. That is, every byte in an 130 /// equivalence class cannot discriminate between a match and a non-match. 131 /// 132 /// For example, in the literals `abc` and `xyz`, the bytes [\x00-`], [d-w] 133 /// and [{-\xFF] never discriminate between a match and a non-match, precisely 134 /// because they never occur in the literals anywhere. 135 /// 136 /// Note though that this does not necessarily compute the minimal set of 137 /// equivalence classes. For example, in the literals above, the byte ranges 138 /// [\x00-`], [d-w] and [{-\xFF] are all treated as distinct equivalence 139 /// classes even though they could be treated a single class. The reason for 140 /// this is implementation complexity. In the future, we should endeavor to 141 /// compute the minimal equivalence classes since they can have a rather large 142 /// impact on the size of the DFA. 143 /// 144 /// The representation here is 256 booleans, all initially set to false. Each 145 /// boolean maps to its corresponding byte based on position. A `true` value 146 /// indicates the end of an equivalence class, where its corresponding byte 147 /// and all of the bytes corresponding to all previous contiguous `false` 148 /// values are in the same equivalence class. 149 /// 150 /// This particular representation only permits contiguous ranges of bytes to 151 /// be in the same equivalence class, which means that we can never discover 152 /// the true minimal set of equivalence classes. 153 #[derive(Debug)] 154 pub struct ByteClassBuilder(Vec<bool>); 155 156 impl ByteClassBuilder { 157 /// Create a new builder of byte classes where all bytes are part of the 158 /// same equivalence class. new() -> ByteClassBuilder159 pub fn new() -> ByteClassBuilder { 160 ByteClassBuilder(vec![false; 256]) 161 } 162 163 /// Indicate the the range of byte given (inclusive) can discriminate a 164 /// match between it and all other bytes outside of the range. set_range(&mut self, start: u8, end: u8)165 pub fn set_range(&mut self, start: u8, end: u8) { 166 debug_assert!(start <= end); 167 if start > 0 { 168 self.0[start as usize - 1] = true; 169 } 170 self.0[end as usize] = true; 171 } 172 173 /// Build byte classes that map all byte values to their corresponding 174 /// equivalence class. The last mapping indicates the largest equivalence 175 /// class identifier (which is never bigger than 255). build(&self) -> ByteClasses176 pub fn build(&self) -> ByteClasses { 177 let mut classes = ByteClasses::empty(); 178 let mut class = 0u8; 179 let mut i = 0; 180 loop { 181 classes.set(i as u8, class as u8); 182 if i >= 255 { 183 break; 184 } 185 if self.0[i] { 186 class = class.checked_add(1).unwrap(); 187 } 188 i += 1; 189 } 190 classes 191 } 192 } 193 194 #[cfg(test)] 195 mod tests { 196 use super::*; 197 198 #[test] byte_classes()199 fn byte_classes() { 200 let mut set = ByteClassBuilder::new(); 201 set.set_range(b'a', b'z'); 202 203 let classes = set.build(); 204 assert_eq!(classes.get(0), 0); 205 assert_eq!(classes.get(1), 0); 206 assert_eq!(classes.get(2), 0); 207 assert_eq!(classes.get(b'a' - 1), 0); 208 assert_eq!(classes.get(b'a'), 1); 209 assert_eq!(classes.get(b'm'), 1); 210 assert_eq!(classes.get(b'z'), 1); 211 assert_eq!(classes.get(b'z' + 1), 2); 212 assert_eq!(classes.get(254), 2); 213 assert_eq!(classes.get(255), 2); 214 215 let mut set = ByteClassBuilder::new(); 216 set.set_range(0, 2); 217 set.set_range(4, 6); 218 let classes = set.build(); 219 assert_eq!(classes.get(0), 0); 220 assert_eq!(classes.get(1), 0); 221 assert_eq!(classes.get(2), 0); 222 assert_eq!(classes.get(3), 1); 223 assert_eq!(classes.get(4), 2); 224 assert_eq!(classes.get(5), 2); 225 assert_eq!(classes.get(6), 2); 226 assert_eq!(classes.get(7), 3); 227 assert_eq!(classes.get(255), 3); 228 } 229 230 #[test] full_byte_classes()231 fn full_byte_classes() { 232 let mut set = ByteClassBuilder::new(); 233 for i in 0..256u16 { 234 set.set_range(i as u8, i as u8); 235 } 236 assert_eq!(set.build().alphabet_len(), 256); 237 } 238 } 239