1 // This module implements the Pike VM. That is, it guarantees linear time 2 // search of a regex on any text with memory use proportional to the size of 3 // the regex. 4 // 5 // It is equal in power to the backtracking engine in this crate, except the 6 // backtracking engine is typically faster on small regexes/texts at the 7 // expense of a bigger memory footprint. 8 // 9 // It can do more than the DFA can (specifically, record capture locations 10 // and execute Unicode word boundary assertions), but at a slower speed. 11 // Specifically, the Pike VM executes a DFA implicitly by repeatedly expanding 12 // epsilon transitions. That is, the Pike VM engine can be in multiple states 13 // at once where as the DFA is only ever in one state at a time. 14 // 15 // Therefore, the Pike VM is generally treated as the fallback when the other 16 // matching engines either aren't feasible to run or are insufficient. 17 18 use std::mem; 19 20 use exec::ProgramCache; 21 use input::{Input, InputAt}; 22 use prog::{InstPtr, Program}; 23 use re_trait::Slot; 24 use sparse::SparseSet; 25 26 /// An NFA simulation matching engine. 27 #[derive(Debug)] 28 pub struct Fsm<'r, I> { 29 /// The sequence of opcodes (among other things) that is actually executed. 30 /// 31 /// The program may be byte oriented or Unicode codepoint oriented. 32 prog: &'r Program, 33 /// An explicit stack used for following epsilon transitions. (This is 34 /// borrowed from the cache.) 35 stack: &'r mut Vec<FollowEpsilon>, 36 /// The input to search. 37 input: I, 38 } 39 40 /// A cached allocation that can be reused on each execution. 41 #[derive(Clone, Debug)] 42 pub struct Cache { 43 /// A pair of ordered sets for tracking NFA states. 44 clist: Threads, 45 nlist: Threads, 46 /// An explicit stack used for following epsilon transitions. 47 stack: Vec<FollowEpsilon>, 48 } 49 50 /// An ordered set of NFA states and their captures. 51 #[derive(Clone, Debug)] 52 struct Threads { 53 /// An ordered set of opcodes (each opcode is an NFA state). 54 set: SparseSet, 55 /// Captures for every NFA state. 56 /// 57 /// It is stored in row-major order, where the columns are the capture 58 /// slots and the rows are the states. 59 caps: Vec<Slot>, 60 /// The number of capture slots stored per thread. (Every capture has 61 /// two slots.) 62 slots_per_thread: usize, 63 } 64 65 /// A representation of an explicit stack frame when following epsilon 66 /// transitions. This is used to avoid recursion. 67 #[derive(Clone, Debug)] 68 enum FollowEpsilon { 69 /// Follow transitions at the given instruction pointer. 70 IP(InstPtr), 71 /// Restore the capture slot with the given position in the input. 72 Capture { slot: usize, pos: Slot }, 73 } 74 75 impl Cache { 76 /// Create a new allocation used by the NFA machine to record execution 77 /// and captures. new(_prog: &Program) -> Self78 pub fn new(_prog: &Program) -> Self { 79 Cache { clist: Threads::new(), nlist: Threads::new(), stack: vec![] } 80 } 81 } 82 83 impl<'r, I: Input> Fsm<'r, I> { 84 /// Execute the NFA matching engine. 85 /// 86 /// If there's a match, `exec` returns `true` and populates the given 87 /// captures accordingly. exec( prog: &'r Program, cache: &ProgramCache, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, input: I, start: usize, end: usize, ) -> bool88 pub fn exec( 89 prog: &'r Program, 90 cache: &ProgramCache, 91 matches: &mut [bool], 92 slots: &mut [Slot], 93 quit_after_match: bool, 94 input: I, 95 start: usize, 96 end: usize, 97 ) -> bool { 98 let mut cache = cache.borrow_mut(); 99 let cache = &mut cache.pikevm; 100 cache.clist.resize(prog.len(), prog.captures.len()); 101 cache.nlist.resize(prog.len(), prog.captures.len()); 102 let at = input.at(start); 103 Fsm { prog: prog, stack: &mut cache.stack, input: input }.exec_( 104 &mut cache.clist, 105 &mut cache.nlist, 106 matches, 107 slots, 108 quit_after_match, 109 at, 110 end, 111 ) 112 } 113 exec_( &mut self, mut clist: &mut Threads, mut nlist: &mut Threads, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, mut at: InputAt, end: usize, ) -> bool114 fn exec_( 115 &mut self, 116 mut clist: &mut Threads, 117 mut nlist: &mut Threads, 118 matches: &mut [bool], 119 slots: &mut [Slot], 120 quit_after_match: bool, 121 mut at: InputAt, 122 end: usize, 123 ) -> bool { 124 let mut matched = false; 125 let mut all_matched = false; 126 clist.set.clear(); 127 nlist.set.clear(); 128 'LOOP: loop { 129 if clist.set.is_empty() { 130 // Three ways to bail out when our current set of threads is 131 // empty. 132 // 133 // 1. We have a match---so we're done exploring any possible 134 // alternatives. Time to quit. (We can't do this if we're 135 // looking for matches for multiple regexes, unless we know 136 // they all matched.) 137 // 138 // 2. If the expression starts with a '^' we can terminate as 139 // soon as the last thread dies. 140 if (matched && matches.len() <= 1) 141 || all_matched 142 || (!at.is_start() && self.prog.is_anchored_start) 143 { 144 break; 145 } 146 147 // 3. If there's a literal prefix for the program, try to 148 // jump ahead quickly. If it can't be found, then we can 149 // bail out early. 150 if !self.prog.prefixes.is_empty() { 151 at = match self.input.prefix_at(&self.prog.prefixes, at) { 152 None => break, 153 Some(at) => at, 154 }; 155 } 156 } 157 158 // This simulates a preceding '.*?' for every regex by adding 159 // a state starting at the current position in the input for the 160 // beginning of the program only if we don't already have a match. 161 if clist.set.is_empty() 162 || (!self.prog.is_anchored_start && !all_matched) 163 { 164 self.add(&mut clist, slots, 0, at); 165 } 166 // The previous call to "add" actually inspects the position just 167 // before the current character. For stepping through the machine, 168 // we can to look at the current character, so we advance the 169 // input. 170 let at_next = self.input.at(at.next_pos()); 171 for i in 0..clist.set.len() { 172 let ip = clist.set[i]; 173 if self.step( 174 &mut nlist, 175 matches, 176 slots, 177 clist.caps(ip), 178 ip, 179 at, 180 at_next, 181 ) { 182 matched = true; 183 all_matched = all_matched || matches.iter().all(|&b| b); 184 if quit_after_match { 185 // If we only care if a match occurs (not its 186 // position), then we can quit right now. 187 break 'LOOP; 188 } 189 if self.prog.matches.len() == 1 { 190 // We don't need to check the rest of the threads 191 // in this set because we've matched something 192 // ("leftmost-first"). However, we still need to check 193 // threads in the next set to support things like 194 // greedy matching. 195 // 196 // This is only true on normal regexes. For regex sets, 197 // we need to mush on to observe other matches. 198 break; 199 } 200 } 201 } 202 if at.pos() >= end { 203 break; 204 } 205 at = at_next; 206 mem::swap(clist, nlist); 207 nlist.set.clear(); 208 } 209 matched 210 } 211 212 /// Step through the input, one token (byte or codepoint) at a time. 213 /// 214 /// nlist is the set of states that will be processed on the next token 215 /// in the input. 216 /// 217 /// caps is the set of captures passed by the caller of the NFA. They are 218 /// written to only when a match state is visited. 219 /// 220 /// thread_caps is the set of captures set for the current NFA state, ip. 221 /// 222 /// at and at_next are the current and next positions in the input. at or 223 /// at_next may be EOF. step( &mut self, nlist: &mut Threads, matches: &mut [bool], slots: &mut [Slot], thread_caps: &mut [Option<usize>], ip: usize, at: InputAt, at_next: InputAt, ) -> bool224 fn step( 225 &mut self, 226 nlist: &mut Threads, 227 matches: &mut [bool], 228 slots: &mut [Slot], 229 thread_caps: &mut [Option<usize>], 230 ip: usize, 231 at: InputAt, 232 at_next: InputAt, 233 ) -> bool { 234 use prog::Inst::*; 235 match self.prog[ip] { 236 Match(match_slot) => { 237 if match_slot < matches.len() { 238 matches[match_slot] = true; 239 } 240 for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) { 241 *slot = *val; 242 } 243 true 244 } 245 Char(ref inst) => { 246 if inst.c == at.char() { 247 self.add(nlist, thread_caps, inst.goto, at_next); 248 } 249 false 250 } 251 Ranges(ref inst) => { 252 if inst.matches(at.char()) { 253 self.add(nlist, thread_caps, inst.goto, at_next); 254 } 255 false 256 } 257 Bytes(ref inst) => { 258 if let Some(b) = at.byte() { 259 if inst.matches(b) { 260 self.add(nlist, thread_caps, inst.goto, at_next); 261 } 262 } 263 false 264 } 265 EmptyLook(_) | Save(_) | Split(_) => false, 266 } 267 } 268 269 /// Follows epsilon transitions and adds them for processing to nlist, 270 /// starting at and including ip. add( &mut self, nlist: &mut Threads, thread_caps: &mut [Option<usize>], ip: usize, at: InputAt, )271 fn add( 272 &mut self, 273 nlist: &mut Threads, 274 thread_caps: &mut [Option<usize>], 275 ip: usize, 276 at: InputAt, 277 ) { 278 self.stack.push(FollowEpsilon::IP(ip)); 279 while let Some(frame) = self.stack.pop() { 280 match frame { 281 FollowEpsilon::IP(ip) => { 282 self.add_step(nlist, thread_caps, ip, at); 283 } 284 FollowEpsilon::Capture { slot, pos } => { 285 thread_caps[slot] = pos; 286 } 287 } 288 } 289 } 290 291 /// A helper function for add that avoids excessive pushing to the stack. add_step( &mut self, nlist: &mut Threads, thread_caps: &mut [Option<usize>], mut ip: usize, at: InputAt, )292 fn add_step( 293 &mut self, 294 nlist: &mut Threads, 295 thread_caps: &mut [Option<usize>], 296 mut ip: usize, 297 at: InputAt, 298 ) { 299 // Instead of pushing and popping to the stack, we mutate ip as we 300 // traverse the set of states. We only push to the stack when we 301 // absolutely need recursion (restoring captures or following a 302 // branch). 303 use prog::Inst::*; 304 loop { 305 // Don't visit states we've already added. 306 if nlist.set.contains(ip) { 307 return; 308 } 309 nlist.set.insert(ip); 310 match self.prog[ip] { 311 EmptyLook(ref inst) => { 312 if self.input.is_empty_match(at, inst) { 313 ip = inst.goto; 314 } 315 } 316 Save(ref inst) => { 317 if inst.slot < thread_caps.len() { 318 self.stack.push(FollowEpsilon::Capture { 319 slot: inst.slot, 320 pos: thread_caps[inst.slot], 321 }); 322 thread_caps[inst.slot] = Some(at.pos()); 323 } 324 ip = inst.goto; 325 } 326 Split(ref inst) => { 327 self.stack.push(FollowEpsilon::IP(inst.goto2)); 328 ip = inst.goto1; 329 } 330 Match(_) | Char(_) | Ranges(_) | Bytes(_) => { 331 let t = &mut nlist.caps(ip); 332 for (slot, val) in t.iter_mut().zip(thread_caps.iter()) { 333 *slot = *val; 334 } 335 return; 336 } 337 } 338 } 339 } 340 } 341 342 impl Threads { new() -> Self343 fn new() -> Self { 344 Threads { set: SparseSet::new(0), caps: vec![], slots_per_thread: 0 } 345 } 346 resize(&mut self, num_insts: usize, ncaps: usize)347 fn resize(&mut self, num_insts: usize, ncaps: usize) { 348 if num_insts == self.set.capacity() { 349 return; 350 } 351 self.slots_per_thread = ncaps * 2; 352 self.set = SparseSet::new(num_insts); 353 self.caps = vec![None; self.slots_per_thread * num_insts]; 354 } 355 caps(&mut self, pc: usize) -> &mut [Option<usize>]356 fn caps(&mut self, pc: usize) -> &mut [Option<usize>] { 357 let i = pc * self.slots_per_thread; 358 &mut self.caps[i..i + self.slots_per_thread] 359 } 360 } 361