• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use std::cell::RefCell;
2 use std::fmt;
3 use std::mem;
4 use std::rc::Rc;
5 
6 use dense;
7 use state_id::{dead_id, StateID};
8 
9 type DFARepr<S> = dense::Repr<Vec<S>, S>;
10 
11 /// An implementation of Hopcroft's algorithm for minimizing DFAs.
12 ///
13 /// The algorithm implemented here is mostly taken from Wikipedia:
14 /// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm
15 ///
16 /// This code has had some light optimization attention paid to it,
17 /// particularly in the form of reducing allocation as much as possible.
18 /// However, it is still generally slow. Future optimization work should
19 /// probably focus on the bigger picture rather than micro-optimizations. For
20 /// example:
21 ///
22 /// 1. Figure out how to more intelligently create initial partitions. That is,
23 ///    Hopcroft's algorithm starts by creating two partitions of DFA states
24 ///    that are known to NOT be equivalent: match states and non-match states.
25 ///    The algorithm proceeds by progressively refining these partitions into
26 ///    smaller partitions. If we could start with more partitions, then we
27 ///    could reduce the amount of work that Hopcroft's algorithm needs to do.
28 /// 2. For every partition that we visit, we find all incoming transitions to
29 ///    every state in the partition for *every* element in the alphabet. (This
30 ///    is why using byte classes can significantly decrease minimization times,
31 ///    since byte classes shrink the alphabet.) This is quite costly and there
32 ///    is perhaps some redundant work being performed depending on the specific
33 ///    states in the set. For example, we might be able to only visit some
34 ///    elements of the alphabet based on the transitions.
35 /// 3. Move parts of minimization into determinization. If minimization has
36 ///    fewer states to deal with, then it should run faster. A prime example
37 ///    of this might be large Unicode classes, which are generated in way that
38 ///    can create a lot of redundant states. (Some work has been done on this
39 ///    point during NFA compilation via the algorithm described in the
40 ///    "Incremental Construction of MinimalAcyclic Finite-State Automata"
41 ///    paper.)
42 pub(crate) struct Minimizer<'a, S: 'a> {
43     dfa: &'a mut DFARepr<S>,
44     in_transitions: Vec<Vec<Vec<S>>>,
45     partitions: Vec<StateSet<S>>,
46     waiting: Vec<StateSet<S>>,
47 }
48 
49 impl<'a, S: StateID> fmt::Debug for Minimizer<'a, S> {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result50     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
51         f.debug_struct("Minimizer")
52             .field("dfa", &self.dfa)
53             .field("in_transitions", &self.in_transitions)
54             .field("partitions", &self.partitions)
55             .field("waiting", &self.waiting)
56             .finish()
57     }
58 }
59 
60 /// A set of states. A state set makes up a single partition in Hopcroft's
61 /// algorithm.
62 ///
63 /// It is represented by an ordered set of state identifiers. We use shared
64 /// ownership so that a single state set can be in both the set of partitions
65 /// and in the set of waiting sets simultaneously without an additional
66 /// allocation. Generally, once a state set is built, it becomes immutable.
67 ///
68 /// We use this representation because it avoids the overhead of more
69 /// traditional set data structures (HashSet/BTreeSet), and also because
70 /// computing intersection/subtraction on this representation is especially
71 /// fast.
72 #[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
73 struct StateSet<S>(Rc<RefCell<Vec<S>>>);
74 
75 impl<'a, S: StateID> Minimizer<'a, S> {
new(dfa: &'a mut DFARepr<S>) -> Minimizer<'a, S>76     pub fn new(dfa: &'a mut DFARepr<S>) -> Minimizer<'a, S> {
77         let in_transitions = Minimizer::incoming_transitions(dfa);
78         let partitions = Minimizer::initial_partitions(dfa);
79         let waiting = vec![partitions[0].clone()];
80 
81         Minimizer { dfa, in_transitions, partitions, waiting }
82     }
83 
run(mut self)84     pub fn run(mut self) {
85         let mut incoming = StateSet::empty();
86         let mut scratch1 = StateSet::empty();
87         let mut scratch2 = StateSet::empty();
88         let mut newparts = vec![];
89 
90         while let Some(set) = self.waiting.pop() {
91             for b in (0..self.dfa.alphabet_len()).map(|b| b as u8) {
92                 self.find_incoming_to(b, &set, &mut incoming);
93 
94                 for p in 0..self.partitions.len() {
95                     self.partitions[p].intersection(&incoming, &mut scratch1);
96                     if scratch1.is_empty() {
97                         newparts.push(self.partitions[p].clone());
98                         continue;
99                     }
100 
101                     self.partitions[p].subtract(&incoming, &mut scratch2);
102                     if scratch2.is_empty() {
103                         newparts.push(self.partitions[p].clone());
104                         continue;
105                     }
106 
107                     let (x, y) =
108                         (scratch1.deep_clone(), scratch2.deep_clone());
109                     newparts.push(x.clone());
110                     newparts.push(y.clone());
111                     match self.find_waiting(&self.partitions[p]) {
112                         Some(i) => {
113                             self.waiting[i] = x;
114                             self.waiting.push(y);
115                         }
116                         None => {
117                             if x.len() <= y.len() {
118                                 self.waiting.push(x);
119                             } else {
120                                 self.waiting.push(y);
121                             }
122                         }
123                     }
124                 }
125                 newparts = mem::replace(&mut self.partitions, newparts);
126                 newparts.clear();
127             }
128         }
129 
130         // At this point, we now have a minimal partitioning of states, where
131         // each partition is an equivalence class of DFA states. Now we need to
132         // use this partioning to update the DFA to only contain one state for
133         // each partition.
134 
135         // Create a map from DFA state ID to the representative ID of the
136         // equivalence class to which it belongs. The representative ID of an
137         // equivalence class of states is the minimum ID in that class.
138         let mut state_to_part = vec![dead_id(); self.dfa.state_count()];
139         for p in &self.partitions {
140             p.iter(|id| state_to_part[id.to_usize()] = p.min());
141         }
142 
143         // Generate a new contiguous sequence of IDs for minimal states, and
144         // create a map from equivalence IDs to the new IDs. Thus, the new
145         // minimal ID of *any* state in the unminimized DFA can be obtained
146         // with minimals_ids[state_to_part[old_id]].
147         let mut minimal_ids = vec![dead_id(); self.dfa.state_count()];
148         let mut new_id = S::from_usize(0);
149         for (id, _) in self.dfa.states() {
150             if state_to_part[id.to_usize()] == id {
151                 minimal_ids[id.to_usize()] = new_id;
152                 new_id = S::from_usize(new_id.to_usize() + 1);
153             }
154         }
155         // The total number of states in the minimal DFA.
156         let minimal_count = new_id.to_usize();
157 
158         // Re-map this DFA in place such that the only states remaining
159         // correspond to the representative states of every equivalence class.
160         for id in (0..self.dfa.state_count()).map(S::from_usize) {
161             // If this state isn't a representative for an equivalence class,
162             // then we skip it since it won't appear in the minimal DFA.
163             if state_to_part[id.to_usize()] != id {
164                 continue;
165             }
166             for (_, next) in self.dfa.get_state_mut(id).iter_mut() {
167                 *next = minimal_ids[state_to_part[next.to_usize()].to_usize()];
168             }
169             self.dfa.swap_states(id, minimal_ids[id.to_usize()]);
170         }
171         // Trim off all unused states from the pre-minimized DFA. This
172         // represents all states that were merged into a non-singleton
173         // equivalence class of states, and appeared after the first state
174         // in each such class. (Because the state with the smallest ID in each
175         // equivalence class is its representative ID.)
176         self.dfa.truncate_states(minimal_count);
177 
178         // Update the new start state, which is now just the minimal ID of
179         // whatever state the old start state was collapsed into.
180         let old_start = self.dfa.start_state();
181         self.dfa.set_start_state(
182             minimal_ids[state_to_part[old_start.to_usize()].to_usize()],
183         );
184 
185         // In order to update the ID of the maximum match state, we need to
186         // find the maximum ID among all of the match states in the minimized
187         // DFA. This is not necessarily the new ID of the unminimized maximum
188         // match state, since that could have been collapsed with a much
189         // earlier match state. Therefore, to find the new max match state,
190         // we iterate over all previous match states, find their corresponding
191         // new minimal ID, and take the maximum of those.
192         let old_max = self.dfa.max_match_state();
193         self.dfa.set_max_match_state(dead_id());
194         for id in (0..(old_max.to_usize() + 1)).map(S::from_usize) {
195             let part = state_to_part[id.to_usize()];
196             let new_id = minimal_ids[part.to_usize()];
197             if new_id > self.dfa.max_match_state() {
198                 self.dfa.set_max_match_state(new_id);
199             }
200         }
201     }
202 
find_waiting(&self, set: &StateSet<S>) -> Option<usize>203     fn find_waiting(&self, set: &StateSet<S>) -> Option<usize> {
204         self.waiting.iter().position(|s| s == set)
205     }
206 
find_incoming_to( &self, b: u8, set: &StateSet<S>, incoming: &mut StateSet<S>, )207     fn find_incoming_to(
208         &self,
209         b: u8,
210         set: &StateSet<S>,
211         incoming: &mut StateSet<S>,
212     ) {
213         incoming.clear();
214         set.iter(|id| {
215             for &inid in &self.in_transitions[id.to_usize()][b as usize] {
216                 incoming.add(inid);
217             }
218         });
219         incoming.canonicalize();
220     }
221 
initial_partitions(dfa: &DFARepr<S>) -> Vec<StateSet<S>>222     fn initial_partitions(dfa: &DFARepr<S>) -> Vec<StateSet<S>> {
223         let mut is_match = StateSet::empty();
224         let mut no_match = StateSet::empty();
225         for (id, _) in dfa.states() {
226             if dfa.is_match_state(id) {
227                 is_match.add(id);
228             } else {
229                 no_match.add(id);
230             }
231         }
232 
233         let mut sets = vec![is_match];
234         if !no_match.is_empty() {
235             sets.push(no_match);
236         }
237         sets.sort_by_key(|s| s.len());
238         sets
239     }
240 
incoming_transitions(dfa: &DFARepr<S>) -> Vec<Vec<Vec<S>>>241     fn incoming_transitions(dfa: &DFARepr<S>) -> Vec<Vec<Vec<S>>> {
242         let mut incoming = vec![];
243         for _ in dfa.states() {
244             incoming.push(vec![vec![]; dfa.alphabet_len()]);
245         }
246         for (id, state) in dfa.states() {
247             for (b, next) in state.transitions() {
248                 incoming[next.to_usize()][b as usize].push(id);
249             }
250         }
251         incoming
252     }
253 }
254 
255 impl<S: StateID> StateSet<S> {
empty() -> StateSet<S>256     fn empty() -> StateSet<S> {
257         StateSet(Rc::new(RefCell::new(vec![])))
258     }
259 
add(&mut self, id: S)260     fn add(&mut self, id: S) {
261         self.0.borrow_mut().push(id);
262     }
263 
min(&self) -> S264     fn min(&self) -> S {
265         self.0.borrow()[0]
266     }
267 
canonicalize(&mut self)268     fn canonicalize(&mut self) {
269         self.0.borrow_mut().sort();
270         self.0.borrow_mut().dedup();
271     }
272 
clear(&mut self)273     fn clear(&mut self) {
274         self.0.borrow_mut().clear();
275     }
276 
len(&self) -> usize277     fn len(&self) -> usize {
278         self.0.borrow().len()
279     }
280 
is_empty(&self) -> bool281     fn is_empty(&self) -> bool {
282         self.len() == 0
283     }
284 
deep_clone(&self) -> StateSet<S>285     fn deep_clone(&self) -> StateSet<S> {
286         let ids = self.0.borrow().iter().cloned().collect();
287         StateSet(Rc::new(RefCell::new(ids)))
288     }
289 
iter<F: FnMut(S)>(&self, mut f: F)290     fn iter<F: FnMut(S)>(&self, mut f: F) {
291         for &id in self.0.borrow().iter() {
292             f(id);
293         }
294     }
295 
intersection(&self, other: &StateSet<S>, dest: &mut StateSet<S>)296     fn intersection(&self, other: &StateSet<S>, dest: &mut StateSet<S>) {
297         dest.clear();
298         if self.is_empty() || other.is_empty() {
299             return;
300         }
301 
302         let (seta, setb) = (self.0.borrow(), other.0.borrow());
303         let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
304         let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
305         loop {
306             if a == b {
307                 dest.add(a);
308                 a = match ita.next() {
309                     None => break,
310                     Some(a) => a,
311                 };
312                 b = match itb.next() {
313                     None => break,
314                     Some(b) => b,
315                 };
316             } else if a < b {
317                 a = match ita.next() {
318                     None => break,
319                     Some(a) => a,
320                 };
321             } else {
322                 b = match itb.next() {
323                     None => break,
324                     Some(b) => b,
325                 };
326             }
327         }
328     }
329 
subtract(&self, other: &StateSet<S>, dest: &mut StateSet<S>)330     fn subtract(&self, other: &StateSet<S>, dest: &mut StateSet<S>) {
331         dest.clear();
332         if self.is_empty() || other.is_empty() {
333             self.iter(|s| dest.add(s));
334             return;
335         }
336 
337         let (seta, setb) = (self.0.borrow(), other.0.borrow());
338         let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
339         let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
340         loop {
341             if a == b {
342                 a = match ita.next() {
343                     None => break,
344                     Some(a) => a,
345                 };
346                 b = match itb.next() {
347                     None => {
348                         dest.add(a);
349                         break;
350                     }
351                     Some(b) => b,
352                 };
353             } else if a < b {
354                 dest.add(a);
355                 a = match ita.next() {
356                     None => break,
357                     Some(a) => a,
358                 };
359             } else {
360                 b = match itb.next() {
361                     None => {
362                         dest.add(a);
363                         break;
364                     }
365                     Some(b) => b,
366                 };
367             }
368         }
369         for a in ita {
370             dest.add(a);
371         }
372     }
373 }
374