• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use std::fmt;
2 
3 /// A representation of byte oriented equivalence classes.
4 ///
5 /// This is used in an FSM to reduce the size of the transition table. This can
6 /// have a particularly large impact not only on the total size of an FSM, but
7 /// also on compile times.
8 #[derive(Clone, Copy)]
9 pub struct ByteClasses([u8; 256]);
10 
11 impl ByteClasses {
12     /// Creates a new set of equivalence classes where all bytes are mapped to
13     /// the same class.
empty() -> ByteClasses14     pub fn empty() -> ByteClasses {
15         ByteClasses([0; 256])
16     }
17 
18     /// Creates a new set of equivalence classes where each byte belongs to
19     /// its own equivalence class.
singletons() -> ByteClasses20     pub fn singletons() -> ByteClasses {
21         let mut classes = ByteClasses::empty();
22         for i in 0..256 {
23             classes.set(i as u8, i as u8);
24         }
25         classes
26     }
27 
28     /// Set the equivalence class for the given byte.
29     #[inline]
set(&mut self, byte: u8, class: u8)30     pub fn set(&mut self, byte: u8, class: u8) {
31         self.0[byte as usize] = class;
32     }
33 
34     /// Get the equivalence class for the given byte.
35     #[inline]
get(&self, byte: u8) -> u836     pub fn get(&self, byte: u8) -> u8 {
37         // SAFETY: This is safe because all dense transitions have
38         // exactly 256 elements, so all u8 values are valid indices.
39         self.0[byte as usize]
40     }
41 
42     /// Return the total number of elements in the alphabet represented by
43     /// these equivalence classes. Equivalently, this returns the total number
44     /// of equivalence classes.
45     #[inline]
alphabet_len(&self) -> usize46     pub fn alphabet_len(&self) -> usize {
47         self.0[255] as usize + 1
48     }
49 
50     /// Returns true if and only if every byte in this class maps to its own
51     /// equivalence class. Equivalently, there are 256 equivalence classes
52     /// and each class contains exactly one byte.
53     #[inline]
is_singleton(&self) -> bool54     pub fn is_singleton(&self) -> bool {
55         self.alphabet_len() == 256
56     }
57 
58     /// Returns an iterator over a sequence of representative bytes from each
59     /// equivalence class. Namely, this yields exactly N items, where N is
60     /// equivalent to the number of equivalence classes. Each item is an
61     /// arbitrary byte drawn from each equivalence class.
62     ///
63     /// This is useful when one is determinizing an NFA and the NFA's alphabet
64     /// hasn't been converted to equivalence classes yet. Picking an arbitrary
65     /// byte from each equivalence class then permits a full exploration of
66     /// the NFA instead of using every possible byte value.
representatives(&self) -> ByteClassRepresentatives67     pub fn representatives(&self) -> ByteClassRepresentatives {
68         ByteClassRepresentatives { classes: self, byte: 0, last_class: None }
69     }
70 
71     /// Returns all of the bytes in the given equivalence class.
72     ///
73     /// The second element in the tuple indicates the number of elements in
74     /// the array.
elements(&self, equiv: u8) -> ([u8; 256], usize)75     fn elements(&self, equiv: u8) -> ([u8; 256], usize) {
76         let (mut array, mut len) = ([0; 256], 0);
77         for b in 0..256 {
78             if self.get(b as u8) == equiv {
79                 array[len] = b as u8;
80                 len += 1;
81             }
82         }
83         (array, len)
84     }
85 }
86 
87 impl fmt::Debug for ByteClasses {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result88     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
89         if self.is_singleton() {
90             write!(f, "ByteClasses({{singletons}})")
91         } else {
92             write!(f, "ByteClasses(")?;
93             for equiv in 0..self.alphabet_len() {
94                 let (members, len) = self.elements(equiv as u8);
95                 write!(f, " {} => {:?}", equiv, &members[..len])?;
96             }
97             write!(f, ")")
98         }
99     }
100 }
101 
102 /// An iterator over representative bytes from each equivalence class.
103 #[derive(Debug)]
104 pub struct ByteClassRepresentatives<'a> {
105     classes: &'a ByteClasses,
106     byte: usize,
107     last_class: Option<u8>,
108 }
109 
110 impl<'a> Iterator for ByteClassRepresentatives<'a> {
111     type Item = u8;
112 
next(&mut self) -> Option<u8>113     fn next(&mut self) -> Option<u8> {
114         while self.byte < 256 {
115             let byte = self.byte as u8;
116             let class = self.classes.get(byte);
117             self.byte += 1;
118 
119             if self.last_class != Some(class) {
120                 self.last_class = Some(class);
121                 return Some(byte);
122             }
123         }
124         None
125     }
126 }
127 
128 /// A byte class builder keeps track of an *approximation* of equivalence
129 /// classes of bytes during NFA construction. That is, every byte in an
130 /// equivalence class cannot discriminate between a match and a non-match.
131 ///
132 /// For example, in the literals `abc` and `xyz`, the bytes [\x00-`], [d-w]
133 /// and [{-\xFF] never discriminate between a match and a non-match, precisely
134 /// because they never occur in the literals anywhere.
135 ///
136 /// Note though that this does not necessarily compute the minimal set of
137 /// equivalence classes. For example, in the literals above, the byte ranges
138 /// [\x00-`], [d-w] and [{-\xFF] are all treated as distinct equivalence
139 /// classes even though they could be treated a single class. The reason for
140 /// this is implementation complexity. In the future, we should endeavor to
141 /// compute the minimal equivalence classes since they can have a rather large
142 /// impact on the size of the DFA.
143 ///
144 /// The representation here is 256 booleans, all initially set to false. Each
145 /// boolean maps to its corresponding byte based on position. A `true` value
146 /// indicates the end of an equivalence class, where its corresponding byte
147 /// and all of the bytes corresponding to all previous contiguous `false`
148 /// values are in the same equivalence class.
149 ///
150 /// This particular representation only permits contiguous ranges of bytes to
151 /// be in the same equivalence class, which means that we can never discover
152 /// the true minimal set of equivalence classes.
153 #[derive(Debug)]
154 pub struct ByteClassBuilder(Vec<bool>);
155 
156 impl ByteClassBuilder {
157     /// Create a new builder of byte classes where all bytes are part of the
158     /// same equivalence class.
new() -> ByteClassBuilder159     pub fn new() -> ByteClassBuilder {
160         ByteClassBuilder(vec![false; 256])
161     }
162 
163     /// Indicate the the range of byte given (inclusive) can discriminate a
164     /// match between it and all other bytes outside of the range.
set_range(&mut self, start: u8, end: u8)165     pub fn set_range(&mut self, start: u8, end: u8) {
166         debug_assert!(start <= end);
167         if start > 0 {
168             self.0[start as usize - 1] = true;
169         }
170         self.0[end as usize] = true;
171     }
172 
173     /// Build byte classes that map all byte values to their corresponding
174     /// equivalence class. The last mapping indicates the largest equivalence
175     /// class identifier (which is never bigger than 255).
build(&self) -> ByteClasses176     pub fn build(&self) -> ByteClasses {
177         let mut classes = ByteClasses::empty();
178         let mut class = 0u8;
179         let mut i = 0;
180         loop {
181             classes.set(i as u8, class as u8);
182             if i >= 255 {
183                 break;
184             }
185             if self.0[i] {
186                 class = class.checked_add(1).unwrap();
187             }
188             i += 1;
189         }
190         classes
191     }
192 }
193 
194 #[cfg(test)]
195 mod tests {
196     use super::*;
197 
198     #[test]
byte_classes()199     fn byte_classes() {
200         let mut set = ByteClassBuilder::new();
201         set.set_range(b'a', b'z');
202 
203         let classes = set.build();
204         assert_eq!(classes.get(0), 0);
205         assert_eq!(classes.get(1), 0);
206         assert_eq!(classes.get(2), 0);
207         assert_eq!(classes.get(b'a' - 1), 0);
208         assert_eq!(classes.get(b'a'), 1);
209         assert_eq!(classes.get(b'm'), 1);
210         assert_eq!(classes.get(b'z'), 1);
211         assert_eq!(classes.get(b'z' + 1), 2);
212         assert_eq!(classes.get(254), 2);
213         assert_eq!(classes.get(255), 2);
214 
215         let mut set = ByteClassBuilder::new();
216         set.set_range(0, 2);
217         set.set_range(4, 6);
218         let classes = set.build();
219         assert_eq!(classes.get(0), 0);
220         assert_eq!(classes.get(1), 0);
221         assert_eq!(classes.get(2), 0);
222         assert_eq!(classes.get(3), 1);
223         assert_eq!(classes.get(4), 2);
224         assert_eq!(classes.get(5), 2);
225         assert_eq!(classes.get(6), 2);
226         assert_eq!(classes.get(7), 3);
227         assert_eq!(classes.get(255), 3);
228     }
229 
230     #[test]
full_byte_classes()231     fn full_byte_classes() {
232         let mut set = ByteClassBuilder::new();
233         for i in 0..256u16 {
234             set.set_range(i as u8, i as u8);
235         }
236         assert_eq!(set.build().alphabet_len(), 256);
237     }
238 }
239