1 use std::collections::HashMap; 2 use std::mem; 3 use std::rc::Rc; 4 5 use dense; 6 use error::Result; 7 use nfa::{self, NFA}; 8 use sparse_set::SparseSet; 9 use state_id::{dead_id, StateID}; 10 11 type DFARepr<S> = dense::Repr<Vec<S>, S>; 12 13 /// A determinizer converts an NFA to a DFA. 14 /// 15 /// This determinizer follows the typical powerset construction, where each 16 /// DFA state is comprised of one or more NFA states. In the worst case, there 17 /// is one DFA state for every possible combination of NFA states. In practice, 18 /// this only happens in certain conditions, typically when there are bounded 19 /// repetitions. 20 /// 21 /// The type variable `S` refers to the chosen state identifier representation 22 /// used for the DFA. 23 /// 24 /// The lifetime variable `'a` refers to the lifetime of the NFA being 25 /// converted to a DFA. 26 #[derive(Debug)] 27 pub(crate) struct Determinizer<'a, S: StateID> { 28 /// The NFA we're converting into a DFA. 29 nfa: &'a NFA, 30 /// The DFA we're building. 31 dfa: DFARepr<S>, 32 /// Each DFA state being built is defined as an *ordered* set of NFA 33 /// states, along with a flag indicating whether the state is a match 34 /// state or not. 35 /// 36 /// This is never empty. The first state is always a dummy state such that 37 /// a state id == 0 corresponds to a dead state. 38 builder_states: Vec<Rc<State>>, 39 /// A cache of DFA states that already exist and can be easily looked up 40 /// via ordered sets of NFA states. 41 cache: HashMap<Rc<State>, S>, 42 /// Scratch space for a stack of NFA states to visit, for depth first 43 /// visiting without recursion. 44 stack: Vec<nfa::StateID>, 45 /// Scratch space for storing an ordered sequence of NFA states, for 46 /// amortizing allocation. 47 scratch_nfa_states: Vec<nfa::StateID>, 48 /// Whether to build a DFA that finds the longest possible match. 49 longest_match: bool, 50 } 51 52 /// An intermediate representation for a DFA state during determinization. 53 #[derive(Debug, Eq, Hash, PartialEq)] 54 struct State { 55 /// Whether this state is a match state or not. 56 is_match: bool, 57 /// An ordered sequence of NFA states that make up this DFA state. 58 nfa_states: Vec<nfa::StateID>, 59 } 60 61 impl<'a, S: StateID> Determinizer<'a, S> { 62 /// Create a new determinizer for converting the given NFA to a DFA. new(nfa: &'a NFA) -> Determinizer<'a, S>63 pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> { 64 let dead = Rc::new(State::dead()); 65 let mut cache = HashMap::default(); 66 cache.insert(dead.clone(), dead_id()); 67 68 Determinizer { 69 nfa, 70 dfa: DFARepr::empty().anchored(nfa.is_anchored()), 71 builder_states: vec![dead], 72 cache, 73 stack: vec![], 74 scratch_nfa_states: vec![], 75 longest_match: false, 76 } 77 } 78 79 /// Instruct the determinizer to use equivalence classes as the transition 80 /// alphabet instead of all possible byte values. with_byte_classes(mut self) -> Determinizer<'a, S>81 pub fn with_byte_classes(mut self) -> Determinizer<'a, S> { 82 let byte_classes = self.nfa.byte_classes().clone(); 83 self.dfa = DFARepr::empty_with_byte_classes(byte_classes) 84 .anchored(self.nfa.is_anchored()); 85 self 86 } 87 88 /// Instruct the determinizer to build a DFA that recognizes the longest 89 /// possible match instead of the leftmost first match. This is useful when 90 /// constructing reverse DFAs for finding the start of a match. longest_match(mut self, yes: bool) -> Determinizer<'a, S>91 pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> { 92 self.longest_match = yes; 93 self 94 } 95 96 /// Build the DFA. If there was a problem constructing the DFA (e.g., if 97 /// the chosen state identifier representation is too small), then an error 98 /// is returned. build(mut self) -> Result<DFARepr<S>>99 pub fn build(mut self) -> Result<DFARepr<S>> { 100 let representative_bytes: Vec<u8> = 101 self.dfa.byte_classes().representatives().collect(); 102 let mut sparse = self.new_sparse_set(); 103 let mut uncompiled = vec![self.add_start(&mut sparse)?]; 104 while let Some(dfa_id) = uncompiled.pop() { 105 for &b in &representative_bytes { 106 let (next_dfa_id, is_new) = 107 self.cached_state(dfa_id, b, &mut sparse)?; 108 self.dfa.add_transition(dfa_id, b, next_dfa_id); 109 if is_new { 110 uncompiled.push(next_dfa_id); 111 } 112 } 113 } 114 115 // At this point, we shuffle the matching states in the final DFA to 116 // the beginning. This permits a DFA's match loop to detect a match 117 // condition by merely inspecting the current state's identifier, and 118 // avoids the need for any additional auxiliary storage. 119 let is_match: Vec<bool> = 120 self.builder_states.iter().map(|s| s.is_match).collect(); 121 self.dfa.shuffle_match_states(&is_match); 122 Ok(self.dfa) 123 } 124 125 /// Return the identifier for the next DFA state given an existing DFA 126 /// state and an input byte. If the next DFA state already exists, then 127 /// return its identifier from the cache. Otherwise, build the state, cache 128 /// it and return its identifier. 129 /// 130 /// The given sparse set is used for scratch space. It must have a capacity 131 /// equivalent to the total number of NFA states, but its contents are 132 /// otherwise unspecified. 133 /// 134 /// This routine returns a boolean indicating whether a new state was 135 /// built. If a new state is built, then the caller needs to add it to its 136 /// frontier of uncompiled DFA states to compute transitions for. cached_state( &mut self, dfa_id: S, b: u8, sparse: &mut SparseSet, ) -> Result<(S, bool)>137 fn cached_state( 138 &mut self, 139 dfa_id: S, 140 b: u8, 141 sparse: &mut SparseSet, 142 ) -> Result<(S, bool)> { 143 sparse.clear(); 144 // Compute the set of all reachable NFA states, including epsilons. 145 self.next(dfa_id, b, sparse); 146 // Build a candidate state and check if it has already been built. 147 let state = self.new_state(sparse); 148 if let Some(&cached_id) = self.cache.get(&state) { 149 // Since we have a cached state, put the constructed state's 150 // memory back into our scratch space, so that it can be reused. 151 mem::replace(&mut self.scratch_nfa_states, state.nfa_states); 152 return Ok((cached_id, false)); 153 } 154 // Nothing was in the cache, so add this state to the cache. 155 self.add_state(state).map(|s| (s, true)) 156 } 157 158 /// Compute the set of all eachable NFA states, including the full epsilon 159 /// closure, from a DFA state for a single byte of input. next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet)160 fn next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet) { 161 next_nfa_states.clear(); 162 for i in 0..self.builder_states[dfa_id.to_usize()].nfa_states.len() { 163 let nfa_id = self.builder_states[dfa_id.to_usize()].nfa_states[i]; 164 match *self.nfa.state(nfa_id) { 165 nfa::State::Union { .. } 166 | nfa::State::Fail 167 | nfa::State::Match => {} 168 nfa::State::Range { range: ref r } => { 169 if r.start <= b && b <= r.end { 170 self.epsilon_closure(r.next, next_nfa_states); 171 } 172 } 173 nfa::State::Sparse { ref ranges } => { 174 for r in ranges.iter() { 175 if r.start > b { 176 break; 177 } else if r.start <= b && b <= r.end { 178 self.epsilon_closure(r.next, next_nfa_states); 179 break; 180 } 181 } 182 } 183 } 184 } 185 } 186 187 /// Compute the epsilon closure for the given NFA state. epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet)188 fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) { 189 if !self.nfa.state(start).is_epsilon() { 190 set.insert(start); 191 return; 192 } 193 194 self.stack.push(start); 195 while let Some(mut id) = self.stack.pop() { 196 loop { 197 if set.contains(id) { 198 break; 199 } 200 set.insert(id); 201 match *self.nfa.state(id) { 202 nfa::State::Range { .. } 203 | nfa::State::Sparse { .. } 204 | nfa::State::Fail 205 | nfa::State::Match => break, 206 nfa::State::Union { ref alternates } => { 207 id = match alternates.get(0) { 208 None => break, 209 Some(&id) => id, 210 }; 211 self.stack.extend(alternates[1..].iter().rev()); 212 } 213 } 214 } 215 } 216 } 217 218 /// Compute the initial DFA state and return its identifier. 219 /// 220 /// The sparse set given is used for scratch space, and must have capacity 221 /// equal to the total number of NFA states. Its contents are unspecified. add_start(&mut self, sparse: &mut SparseSet) -> Result<S>222 fn add_start(&mut self, sparse: &mut SparseSet) -> Result<S> { 223 sparse.clear(); 224 self.epsilon_closure(self.nfa.start(), sparse); 225 let state = self.new_state(&sparse); 226 let id = self.add_state(state)?; 227 self.dfa.set_start_state(id); 228 Ok(id) 229 } 230 231 /// Add the given state to the DFA and make it available in the cache. 232 /// 233 /// The state initially has no transitions. That is, it transitions to the 234 /// dead state for all possible inputs. add_state(&mut self, state: State) -> Result<S>235 fn add_state(&mut self, state: State) -> Result<S> { 236 let id = self.dfa.add_empty_state()?; 237 let rstate = Rc::new(state); 238 self.builder_states.push(rstate.clone()); 239 self.cache.insert(rstate, id); 240 Ok(id) 241 } 242 243 /// Convert the given set of ordered NFA states to a DFA state. new_state(&mut self, set: &SparseSet) -> State244 fn new_state(&mut self, set: &SparseSet) -> State { 245 let mut state = State { 246 is_match: false, 247 nfa_states: mem::replace(&mut self.scratch_nfa_states, vec![]), 248 }; 249 state.nfa_states.clear(); 250 251 for &id in set { 252 match *self.nfa.state(id) { 253 nfa::State::Range { .. } => { 254 state.nfa_states.push(id); 255 } 256 nfa::State::Sparse { .. } => { 257 state.nfa_states.push(id); 258 } 259 nfa::State::Fail => { 260 break; 261 } 262 nfa::State::Match => { 263 state.is_match = true; 264 if !self.longest_match { 265 break; 266 } 267 } 268 nfa::State::Union { .. } => {} 269 } 270 } 271 state 272 } 273 274 /// Create a new sparse set with enough capacity to hold all NFA states. new_sparse_set(&self) -> SparseSet275 fn new_sparse_set(&self) -> SparseSet { 276 SparseSet::new(self.nfa.len()) 277 } 278 } 279 280 impl State { 281 /// Create a new empty dead state. dead() -> State282 fn dead() -> State { 283 State { nfa_states: vec![], is_match: false } 284 } 285 } 286