• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use std::collections::HashMap;
2 use std::mem;
3 use std::rc::Rc;
4 
5 use dense;
6 use error::Result;
7 use nfa::{self, NFA};
8 use sparse_set::SparseSet;
9 use state_id::{dead_id, StateID};
10 
11 type DFARepr<S> = dense::Repr<Vec<S>, S>;
12 
13 /// A determinizer converts an NFA to a DFA.
14 ///
15 /// This determinizer follows the typical powerset construction, where each
16 /// DFA state is comprised of one or more NFA states. In the worst case, there
17 /// is one DFA state for every possible combination of NFA states. In practice,
18 /// this only happens in certain conditions, typically when there are bounded
19 /// repetitions.
20 ///
21 /// The type variable `S` refers to the chosen state identifier representation
22 /// used for the DFA.
23 ///
24 /// The lifetime variable `'a` refers to the lifetime of the NFA being
25 /// converted to a DFA.
26 #[derive(Debug)]
27 pub(crate) struct Determinizer<'a, S: StateID> {
28     /// The NFA we're converting into a DFA.
29     nfa: &'a NFA,
30     /// The DFA we're building.
31     dfa: DFARepr<S>,
32     /// Each DFA state being built is defined as an *ordered* set of NFA
33     /// states, along with a flag indicating whether the state is a match
34     /// state or not.
35     ///
36     /// This is never empty. The first state is always a dummy state such that
37     /// a state id == 0 corresponds to a dead state.
38     builder_states: Vec<Rc<State>>,
39     /// A cache of DFA states that already exist and can be easily looked up
40     /// via ordered sets of NFA states.
41     cache: HashMap<Rc<State>, S>,
42     /// Scratch space for a stack of NFA states to visit, for depth first
43     /// visiting without recursion.
44     stack: Vec<nfa::StateID>,
45     /// Scratch space for storing an ordered sequence of NFA states, for
46     /// amortizing allocation.
47     scratch_nfa_states: Vec<nfa::StateID>,
48     /// Whether to build a DFA that finds the longest possible match.
49     longest_match: bool,
50 }
51 
52 /// An intermediate representation for a DFA state during determinization.
53 #[derive(Debug, Eq, Hash, PartialEq)]
54 struct State {
55     /// Whether this state is a match state or not.
56     is_match: bool,
57     /// An ordered sequence of NFA states that make up this DFA state.
58     nfa_states: Vec<nfa::StateID>,
59 }
60 
61 impl<'a, S: StateID> Determinizer<'a, S> {
62     /// Create a new determinizer for converting the given NFA to a DFA.
new(nfa: &'a NFA) -> Determinizer<'a, S>63     pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> {
64         let dead = Rc::new(State::dead());
65         let mut cache = HashMap::default();
66         cache.insert(dead.clone(), dead_id());
67 
68         Determinizer {
69             nfa,
70             dfa: DFARepr::empty().anchored(nfa.is_anchored()),
71             builder_states: vec![dead],
72             cache,
73             stack: vec![],
74             scratch_nfa_states: vec![],
75             longest_match: false,
76         }
77     }
78 
79     /// Instruct the determinizer to use equivalence classes as the transition
80     /// alphabet instead of all possible byte values.
with_byte_classes(mut self) -> Determinizer<'a, S>81     pub fn with_byte_classes(mut self) -> Determinizer<'a, S> {
82         let byte_classes = self.nfa.byte_classes().clone();
83         self.dfa = DFARepr::empty_with_byte_classes(byte_classes)
84             .anchored(self.nfa.is_anchored());
85         self
86     }
87 
88     /// Instruct the determinizer to build a DFA that recognizes the longest
89     /// possible match instead of the leftmost first match. This is useful when
90     /// constructing reverse DFAs for finding the start of a match.
longest_match(mut self, yes: bool) -> Determinizer<'a, S>91     pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> {
92         self.longest_match = yes;
93         self
94     }
95 
96     /// Build the DFA. If there was a problem constructing the DFA (e.g., if
97     /// the chosen state identifier representation is too small), then an error
98     /// is returned.
build(mut self) -> Result<DFARepr<S>>99     pub fn build(mut self) -> Result<DFARepr<S>> {
100         let representative_bytes: Vec<u8> =
101             self.dfa.byte_classes().representatives().collect();
102         let mut sparse = self.new_sparse_set();
103         let mut uncompiled = vec![self.add_start(&mut sparse)?];
104         while let Some(dfa_id) = uncompiled.pop() {
105             for &b in &representative_bytes {
106                 let (next_dfa_id, is_new) =
107                     self.cached_state(dfa_id, b, &mut sparse)?;
108                 self.dfa.add_transition(dfa_id, b, next_dfa_id);
109                 if is_new {
110                     uncompiled.push(next_dfa_id);
111                 }
112             }
113         }
114 
115         // At this point, we shuffle the matching states in the final DFA to
116         // the beginning. This permits a DFA's match loop to detect a match
117         // condition by merely inspecting the current state's identifier, and
118         // avoids the need for any additional auxiliary storage.
119         let is_match: Vec<bool> =
120             self.builder_states.iter().map(|s| s.is_match).collect();
121         self.dfa.shuffle_match_states(&is_match);
122         Ok(self.dfa)
123     }
124 
125     /// Return the identifier for the next DFA state given an existing DFA
126     /// state and an input byte. If the next DFA state already exists, then
127     /// return its identifier from the cache. Otherwise, build the state, cache
128     /// it and return its identifier.
129     ///
130     /// The given sparse set is used for scratch space. It must have a capacity
131     /// equivalent to the total number of NFA states, but its contents are
132     /// otherwise unspecified.
133     ///
134     /// This routine returns a boolean indicating whether a new state was
135     /// built. If a new state is built, then the caller needs to add it to its
136     /// frontier of uncompiled DFA states to compute transitions for.
cached_state( &mut self, dfa_id: S, b: u8, sparse: &mut SparseSet, ) -> Result<(S, bool)>137     fn cached_state(
138         &mut self,
139         dfa_id: S,
140         b: u8,
141         sparse: &mut SparseSet,
142     ) -> Result<(S, bool)> {
143         sparse.clear();
144         // Compute the set of all reachable NFA states, including epsilons.
145         self.next(dfa_id, b, sparse);
146         // Build a candidate state and check if it has already been built.
147         let state = self.new_state(sparse);
148         if let Some(&cached_id) = self.cache.get(&state) {
149             // Since we have a cached state, put the constructed state's
150             // memory back into our scratch space, so that it can be reused.
151             mem::replace(&mut self.scratch_nfa_states, state.nfa_states);
152             return Ok((cached_id, false));
153         }
154         // Nothing was in the cache, so add this state to the cache.
155         self.add_state(state).map(|s| (s, true))
156     }
157 
158     /// Compute the set of all eachable NFA states, including the full epsilon
159     /// closure, from a DFA state for a single byte of input.
next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet)160     fn next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet) {
161         next_nfa_states.clear();
162         for i in 0..self.builder_states[dfa_id.to_usize()].nfa_states.len() {
163             let nfa_id = self.builder_states[dfa_id.to_usize()].nfa_states[i];
164             match *self.nfa.state(nfa_id) {
165                 nfa::State::Union { .. }
166                 | nfa::State::Fail
167                 | nfa::State::Match => {}
168                 nfa::State::Range { range: ref r } => {
169                     if r.start <= b && b <= r.end {
170                         self.epsilon_closure(r.next, next_nfa_states);
171                     }
172                 }
173                 nfa::State::Sparse { ref ranges } => {
174                     for r in ranges.iter() {
175                         if r.start > b {
176                             break;
177                         } else if r.start <= b && b <= r.end {
178                             self.epsilon_closure(r.next, next_nfa_states);
179                             break;
180                         }
181                     }
182                 }
183             }
184         }
185     }
186 
187     /// Compute the epsilon closure for the given NFA state.
epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet)188     fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) {
189         if !self.nfa.state(start).is_epsilon() {
190             set.insert(start);
191             return;
192         }
193 
194         self.stack.push(start);
195         while let Some(mut id) = self.stack.pop() {
196             loop {
197                 if set.contains(id) {
198                     break;
199                 }
200                 set.insert(id);
201                 match *self.nfa.state(id) {
202                     nfa::State::Range { .. }
203                     | nfa::State::Sparse { .. }
204                     | nfa::State::Fail
205                     | nfa::State::Match => break,
206                     nfa::State::Union { ref alternates } => {
207                         id = match alternates.get(0) {
208                             None => break,
209                             Some(&id) => id,
210                         };
211                         self.stack.extend(alternates[1..].iter().rev());
212                     }
213                 }
214             }
215         }
216     }
217 
218     /// Compute the initial DFA state and return its identifier.
219     ///
220     /// The sparse set given is used for scratch space, and must have capacity
221     /// equal to the total number of NFA states. Its contents are unspecified.
add_start(&mut self, sparse: &mut SparseSet) -> Result<S>222     fn add_start(&mut self, sparse: &mut SparseSet) -> Result<S> {
223         sparse.clear();
224         self.epsilon_closure(self.nfa.start(), sparse);
225         let state = self.new_state(&sparse);
226         let id = self.add_state(state)?;
227         self.dfa.set_start_state(id);
228         Ok(id)
229     }
230 
231     /// Add the given state to the DFA and make it available in the cache.
232     ///
233     /// The state initially has no transitions. That is, it transitions to the
234     /// dead state for all possible inputs.
add_state(&mut self, state: State) -> Result<S>235     fn add_state(&mut self, state: State) -> Result<S> {
236         let id = self.dfa.add_empty_state()?;
237         let rstate = Rc::new(state);
238         self.builder_states.push(rstate.clone());
239         self.cache.insert(rstate, id);
240         Ok(id)
241     }
242 
243     /// Convert the given set of ordered NFA states to a DFA state.
new_state(&mut self, set: &SparseSet) -> State244     fn new_state(&mut self, set: &SparseSet) -> State {
245         let mut state = State {
246             is_match: false,
247             nfa_states: mem::replace(&mut self.scratch_nfa_states, vec![]),
248         };
249         state.nfa_states.clear();
250 
251         for &id in set {
252             match *self.nfa.state(id) {
253                 nfa::State::Range { .. } => {
254                     state.nfa_states.push(id);
255                 }
256                 nfa::State::Sparse { .. } => {
257                     state.nfa_states.push(id);
258                 }
259                 nfa::State::Fail => {
260                     break;
261                 }
262                 nfa::State::Match => {
263                     state.is_match = true;
264                     if !self.longest_match {
265                         break;
266                     }
267                 }
268                 nfa::State::Union { .. } => {}
269             }
270         }
271         state
272     }
273 
274     /// Create a new sparse set with enough capacity to hold all NFA states.
new_sparse_set(&self) -> SparseSet275     fn new_sparse_set(&self) -> SparseSet {
276         SparseSet::new(self.nfa.len())
277     }
278 }
279 
280 impl State {
281     /// Create a new empty dead state.
dead() -> State282     fn dead() -> State {
283         State { nfa_states: vec![], is_match: false }
284     }
285 }
286