1 use std::cell::RefCell; 2 use std::fmt; 3 use std::mem; 4 use std::rc::Rc; 5 6 use dense; 7 use state_id::{dead_id, StateID}; 8 9 type DFARepr<S> = dense::Repr<Vec<S>, S>; 10 11 /// An implementation of Hopcroft's algorithm for minimizing DFAs. 12 /// 13 /// The algorithm implemented here is mostly taken from Wikipedia: 14 /// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm 15 /// 16 /// This code has had some light optimization attention paid to it, 17 /// particularly in the form of reducing allocation as much as possible. 18 /// However, it is still generally slow. Future optimization work should 19 /// probably focus on the bigger picture rather than micro-optimizations. For 20 /// example: 21 /// 22 /// 1. Figure out how to more intelligently create initial partitions. That is, 23 /// Hopcroft's algorithm starts by creating two partitions of DFA states 24 /// that are known to NOT be equivalent: match states and non-match states. 25 /// The algorithm proceeds by progressively refining these partitions into 26 /// smaller partitions. If we could start with more partitions, then we 27 /// could reduce the amount of work that Hopcroft's algorithm needs to do. 28 /// 2. For every partition that we visit, we find all incoming transitions to 29 /// every state in the partition for *every* element in the alphabet. (This 30 /// is why using byte classes can significantly decrease minimization times, 31 /// since byte classes shrink the alphabet.) This is quite costly and there 32 /// is perhaps some redundant work being performed depending on the specific 33 /// states in the set. For example, we might be able to only visit some 34 /// elements of the alphabet based on the transitions. 35 /// 3. Move parts of minimization into determinization. If minimization has 36 /// fewer states to deal with, then it should run faster. A prime example 37 /// of this might be large Unicode classes, which are generated in way that 38 /// can create a lot of redundant states. (Some work has been done on this 39 /// point during NFA compilation via the algorithm described in the 40 /// "Incremental Construction of MinimalAcyclic Finite-State Automata" 41 /// paper.) 42 pub(crate) struct Minimizer<'a, S: 'a> { 43 dfa: &'a mut DFARepr<S>, 44 in_transitions: Vec<Vec<Vec<S>>>, 45 partitions: Vec<StateSet<S>>, 46 waiting: Vec<StateSet<S>>, 47 } 48 49 impl<'a, S: StateID> fmt::Debug for Minimizer<'a, S> { fmt(&self, f: &mut fmt::Formatter) -> fmt::Result50 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 51 f.debug_struct("Minimizer") 52 .field("dfa", &self.dfa) 53 .field("in_transitions", &self.in_transitions) 54 .field("partitions", &self.partitions) 55 .field("waiting", &self.waiting) 56 .finish() 57 } 58 } 59 60 /// A set of states. A state set makes up a single partition in Hopcroft's 61 /// algorithm. 62 /// 63 /// It is represented by an ordered set of state identifiers. We use shared 64 /// ownership so that a single state set can be in both the set of partitions 65 /// and in the set of waiting sets simultaneously without an additional 66 /// allocation. Generally, once a state set is built, it becomes immutable. 67 /// 68 /// We use this representation because it avoids the overhead of more 69 /// traditional set data structures (HashSet/BTreeSet), and also because 70 /// computing intersection/subtraction on this representation is especially 71 /// fast. 72 #[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)] 73 struct StateSet<S>(Rc<RefCell<Vec<S>>>); 74 75 impl<'a, S: StateID> Minimizer<'a, S> { new(dfa: &'a mut DFARepr<S>) -> Minimizer<'a, S>76 pub fn new(dfa: &'a mut DFARepr<S>) -> Minimizer<'a, S> { 77 let in_transitions = Minimizer::incoming_transitions(dfa); 78 let partitions = Minimizer::initial_partitions(dfa); 79 let waiting = vec![partitions[0].clone()]; 80 81 Minimizer { dfa, in_transitions, partitions, waiting } 82 } 83 run(mut self)84 pub fn run(mut self) { 85 let mut incoming = StateSet::empty(); 86 let mut scratch1 = StateSet::empty(); 87 let mut scratch2 = StateSet::empty(); 88 let mut newparts = vec![]; 89 90 while let Some(set) = self.waiting.pop() { 91 for b in (0..self.dfa.alphabet_len()).map(|b| b as u8) { 92 self.find_incoming_to(b, &set, &mut incoming); 93 94 for p in 0..self.partitions.len() { 95 self.partitions[p].intersection(&incoming, &mut scratch1); 96 if scratch1.is_empty() { 97 newparts.push(self.partitions[p].clone()); 98 continue; 99 } 100 101 self.partitions[p].subtract(&incoming, &mut scratch2); 102 if scratch2.is_empty() { 103 newparts.push(self.partitions[p].clone()); 104 continue; 105 } 106 107 let (x, y) = 108 (scratch1.deep_clone(), scratch2.deep_clone()); 109 newparts.push(x.clone()); 110 newparts.push(y.clone()); 111 match self.find_waiting(&self.partitions[p]) { 112 Some(i) => { 113 self.waiting[i] = x; 114 self.waiting.push(y); 115 } 116 None => { 117 if x.len() <= y.len() { 118 self.waiting.push(x); 119 } else { 120 self.waiting.push(y); 121 } 122 } 123 } 124 } 125 newparts = mem::replace(&mut self.partitions, newparts); 126 newparts.clear(); 127 } 128 } 129 130 // At this point, we now have a minimal partitioning of states, where 131 // each partition is an equivalence class of DFA states. Now we need to 132 // use this partioning to update the DFA to only contain one state for 133 // each partition. 134 135 // Create a map from DFA state ID to the representative ID of the 136 // equivalence class to which it belongs. The representative ID of an 137 // equivalence class of states is the minimum ID in that class. 138 let mut state_to_part = vec![dead_id(); self.dfa.state_count()]; 139 for p in &self.partitions { 140 p.iter(|id| state_to_part[id.to_usize()] = p.min()); 141 } 142 143 // Generate a new contiguous sequence of IDs for minimal states, and 144 // create a map from equivalence IDs to the new IDs. Thus, the new 145 // minimal ID of *any* state in the unminimized DFA can be obtained 146 // with minimals_ids[state_to_part[old_id]]. 147 let mut minimal_ids = vec![dead_id(); self.dfa.state_count()]; 148 let mut new_id = S::from_usize(0); 149 for (id, _) in self.dfa.states() { 150 if state_to_part[id.to_usize()] == id { 151 minimal_ids[id.to_usize()] = new_id; 152 new_id = S::from_usize(new_id.to_usize() + 1); 153 } 154 } 155 // The total number of states in the minimal DFA. 156 let minimal_count = new_id.to_usize(); 157 158 // Re-map this DFA in place such that the only states remaining 159 // correspond to the representative states of every equivalence class. 160 for id in (0..self.dfa.state_count()).map(S::from_usize) { 161 // If this state isn't a representative for an equivalence class, 162 // then we skip it since it won't appear in the minimal DFA. 163 if state_to_part[id.to_usize()] != id { 164 continue; 165 } 166 for (_, next) in self.dfa.get_state_mut(id).iter_mut() { 167 *next = minimal_ids[state_to_part[next.to_usize()].to_usize()]; 168 } 169 self.dfa.swap_states(id, minimal_ids[id.to_usize()]); 170 } 171 // Trim off all unused states from the pre-minimized DFA. This 172 // represents all states that were merged into a non-singleton 173 // equivalence class of states, and appeared after the first state 174 // in each such class. (Because the state with the smallest ID in each 175 // equivalence class is its representative ID.) 176 self.dfa.truncate_states(minimal_count); 177 178 // Update the new start state, which is now just the minimal ID of 179 // whatever state the old start state was collapsed into. 180 let old_start = self.dfa.start_state(); 181 self.dfa.set_start_state( 182 minimal_ids[state_to_part[old_start.to_usize()].to_usize()], 183 ); 184 185 // In order to update the ID of the maximum match state, we need to 186 // find the maximum ID among all of the match states in the minimized 187 // DFA. This is not necessarily the new ID of the unminimized maximum 188 // match state, since that could have been collapsed with a much 189 // earlier match state. Therefore, to find the new max match state, 190 // we iterate over all previous match states, find their corresponding 191 // new minimal ID, and take the maximum of those. 192 let old_max = self.dfa.max_match_state(); 193 self.dfa.set_max_match_state(dead_id()); 194 for id in (0..(old_max.to_usize() + 1)).map(S::from_usize) { 195 let part = state_to_part[id.to_usize()]; 196 let new_id = minimal_ids[part.to_usize()]; 197 if new_id > self.dfa.max_match_state() { 198 self.dfa.set_max_match_state(new_id); 199 } 200 } 201 } 202 find_waiting(&self, set: &StateSet<S>) -> Option<usize>203 fn find_waiting(&self, set: &StateSet<S>) -> Option<usize> { 204 self.waiting.iter().position(|s| s == set) 205 } 206 find_incoming_to( &self, b: u8, set: &StateSet<S>, incoming: &mut StateSet<S>, )207 fn find_incoming_to( 208 &self, 209 b: u8, 210 set: &StateSet<S>, 211 incoming: &mut StateSet<S>, 212 ) { 213 incoming.clear(); 214 set.iter(|id| { 215 for &inid in &self.in_transitions[id.to_usize()][b as usize] { 216 incoming.add(inid); 217 } 218 }); 219 incoming.canonicalize(); 220 } 221 initial_partitions(dfa: &DFARepr<S>) -> Vec<StateSet<S>>222 fn initial_partitions(dfa: &DFARepr<S>) -> Vec<StateSet<S>> { 223 let mut is_match = StateSet::empty(); 224 let mut no_match = StateSet::empty(); 225 for (id, _) in dfa.states() { 226 if dfa.is_match_state(id) { 227 is_match.add(id); 228 } else { 229 no_match.add(id); 230 } 231 } 232 233 let mut sets = vec![is_match]; 234 if !no_match.is_empty() { 235 sets.push(no_match); 236 } 237 sets.sort_by_key(|s| s.len()); 238 sets 239 } 240 incoming_transitions(dfa: &DFARepr<S>) -> Vec<Vec<Vec<S>>>241 fn incoming_transitions(dfa: &DFARepr<S>) -> Vec<Vec<Vec<S>>> { 242 let mut incoming = vec![]; 243 for _ in dfa.states() { 244 incoming.push(vec![vec![]; dfa.alphabet_len()]); 245 } 246 for (id, state) in dfa.states() { 247 for (b, next) in state.transitions() { 248 incoming[next.to_usize()][b as usize].push(id); 249 } 250 } 251 incoming 252 } 253 } 254 255 impl<S: StateID> StateSet<S> { empty() -> StateSet<S>256 fn empty() -> StateSet<S> { 257 StateSet(Rc::new(RefCell::new(vec![]))) 258 } 259 add(&mut self, id: S)260 fn add(&mut self, id: S) { 261 self.0.borrow_mut().push(id); 262 } 263 min(&self) -> S264 fn min(&self) -> S { 265 self.0.borrow()[0] 266 } 267 canonicalize(&mut self)268 fn canonicalize(&mut self) { 269 self.0.borrow_mut().sort(); 270 self.0.borrow_mut().dedup(); 271 } 272 clear(&mut self)273 fn clear(&mut self) { 274 self.0.borrow_mut().clear(); 275 } 276 len(&self) -> usize277 fn len(&self) -> usize { 278 self.0.borrow().len() 279 } 280 is_empty(&self) -> bool281 fn is_empty(&self) -> bool { 282 self.len() == 0 283 } 284 deep_clone(&self) -> StateSet<S>285 fn deep_clone(&self) -> StateSet<S> { 286 let ids = self.0.borrow().iter().cloned().collect(); 287 StateSet(Rc::new(RefCell::new(ids))) 288 } 289 iter<F: FnMut(S)>(&self, mut f: F)290 fn iter<F: FnMut(S)>(&self, mut f: F) { 291 for &id in self.0.borrow().iter() { 292 f(id); 293 } 294 } 295 intersection(&self, other: &StateSet<S>, dest: &mut StateSet<S>)296 fn intersection(&self, other: &StateSet<S>, dest: &mut StateSet<S>) { 297 dest.clear(); 298 if self.is_empty() || other.is_empty() { 299 return; 300 } 301 302 let (seta, setb) = (self.0.borrow(), other.0.borrow()); 303 let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); 304 let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); 305 loop { 306 if a == b { 307 dest.add(a); 308 a = match ita.next() { 309 None => break, 310 Some(a) => a, 311 }; 312 b = match itb.next() { 313 None => break, 314 Some(b) => b, 315 }; 316 } else if a < b { 317 a = match ita.next() { 318 None => break, 319 Some(a) => a, 320 }; 321 } else { 322 b = match itb.next() { 323 None => break, 324 Some(b) => b, 325 }; 326 } 327 } 328 } 329 subtract(&self, other: &StateSet<S>, dest: &mut StateSet<S>)330 fn subtract(&self, other: &StateSet<S>, dest: &mut StateSet<S>) { 331 dest.clear(); 332 if self.is_empty() || other.is_empty() { 333 self.iter(|s| dest.add(s)); 334 return; 335 } 336 337 let (seta, setb) = (self.0.borrow(), other.0.borrow()); 338 let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); 339 let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); 340 loop { 341 if a == b { 342 a = match ita.next() { 343 None => break, 344 Some(a) => a, 345 }; 346 b = match itb.next() { 347 None => { 348 dest.add(a); 349 break; 350 } 351 Some(b) => b, 352 }; 353 } else if a < b { 354 dest.add(a); 355 a = match ita.next() { 356 None => break, 357 Some(a) => a, 358 }; 359 } else { 360 b = match itb.next() { 361 None => { 362 dest.add(a); 363 break; 364 } 365 Some(b) => b, 366 }; 367 } 368 } 369 for a in ita { 370 dest.add(a); 371 } 372 } 373 } 374