• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // This is the backtracking matching engine. It has the same exact capability
2 // as the full NFA simulation, except it is artificially restricted to small
3 // regexes on small inputs because of its memory requirements.
4 //
5 // In particular, this is a *bounded* backtracking engine. It retains worst
6 // case linear time by keeping track of the states that it has visited (using a
7 // bitmap). Namely, once a state is visited, it is never visited again. Since a
8 // state is keyed by `(instruction index, input index)`, we have that its time
9 // complexity is `O(mn)` (i.e., linear in the size of the search text).
10 //
11 // The backtracking engine can beat out the NFA simulation on small
12 // regexes/inputs because it doesn't have to keep track of multiple copies of
13 // the capture groups. In benchmarks, the backtracking engine is roughly twice
14 // as fast as the full NFA simulation. Note though that its performance doesn't
15 // scale, even if you're willing to live with the memory requirements. Namely,
16 // the bitset has to be zeroed on each execution, which becomes quite expensive
17 // on large bitsets.
18 
19 use crate::exec::ProgramCache;
20 use crate::input::{Input, InputAt};
21 use crate::prog::{InstPtr, Program};
22 use crate::re_trait::Slot;
23 
24 type Bits = u32;
25 
26 const BIT_SIZE: usize = 32;
27 const MAX_SIZE_BYTES: usize = 256 * (1 << 10); // 256 KB
28 
29 /// Returns true iff the given regex and input should be executed by this
30 /// engine with reasonable memory usage.
should_exec(num_insts: usize, text_len: usize) -> bool31 pub fn should_exec(num_insts: usize, text_len: usize) -> bool {
32     // Total memory usage in bytes is determined by:
33     //
34     //   ((len(insts) * (len(input) + 1) + bits - 1) / bits) * (size_of(u32))
35     //
36     // The actual limit picked is pretty much a heuristic.
37     // See: https://github.com/rust-lang/regex/issues/215
38     let size = ((num_insts * (text_len + 1) + BIT_SIZE - 1) / BIT_SIZE) * 4;
39     size <= MAX_SIZE_BYTES
40 }
41 
42 /// A backtracking matching engine.
43 #[derive(Debug)]
44 pub struct Bounded<'a, 'm, 'r, 's, I> {
45     prog: &'r Program,
46     input: I,
47     matches: &'m mut [bool],
48     slots: &'s mut [Slot],
49     m: &'a mut Cache,
50 }
51 
52 /// Shared cached state between multiple invocations of a backtracking engine
53 /// in the same thread.
54 #[derive(Clone, Debug)]
55 pub struct Cache {
56     jobs: Vec<Job>,
57     visited: Vec<Bits>,
58 }
59 
60 impl Cache {
61     /// Create new empty cache for the backtracking engine.
new(_prog: &Program) -> Self62     pub fn new(_prog: &Program) -> Self {
63         Cache { jobs: vec![], visited: vec![] }
64     }
65 }
66 
67 /// A job is an explicit unit of stack space in the backtracking engine.
68 ///
69 /// The "normal" representation is a single state transition, which corresponds
70 /// to an NFA state and a character in the input. However, the backtracking
71 /// engine must keep track of old capture group values. We use the explicit
72 /// stack to do it.
73 #[derive(Clone, Copy, Debug)]
74 enum Job {
75     Inst { ip: InstPtr, at: InputAt },
76     SaveRestore { slot: usize, old_pos: Option<usize> },
77 }
78 
79 impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
80     /// Execute the backtracking matching engine.
81     ///
82     /// If there's a match, `exec` returns `true` and populates the given
83     /// captures accordingly.
exec( prog: &'r Program, cache: &ProgramCache, matches: &'m mut [bool], slots: &'s mut [Slot], input: I, start: usize, end: usize, ) -> bool84     pub fn exec(
85         prog: &'r Program,
86         cache: &ProgramCache,
87         matches: &'m mut [bool],
88         slots: &'s mut [Slot],
89         input: I,
90         start: usize,
91         end: usize,
92     ) -> bool {
93         let mut cache = cache.borrow_mut();
94         let cache = &mut cache.backtrack;
95         let start = input.at(start);
96         let mut b = Bounded { prog, input, matches, slots, m: cache };
97         b.exec_(start, end)
98     }
99 
100     /// Clears the cache such that the backtracking engine can be executed
101     /// on some input of fixed length.
clear(&mut self)102     fn clear(&mut self) {
103         // Reset the job memory so that we start fresh.
104         self.m.jobs.clear();
105 
106         // Now we need to clear the bit state set.
107         // We do this by figuring out how much space we need to keep track
108         // of the states we've visited.
109         // Then we reset all existing allocated space to 0.
110         // Finally, we request more space if we need it.
111         //
112         // This is all a little circuitous, but doing this using unchecked
113         // operations doesn't seem to have a measurable impact on performance.
114         // (Probably because backtracking is limited to such small
115         // inputs/regexes in the first place.)
116         let visited_len =
117             (self.prog.len() * (self.input.len() + 1) + BIT_SIZE - 1)
118                 / BIT_SIZE;
119         self.m.visited.truncate(visited_len);
120         for v in &mut self.m.visited {
121             *v = 0;
122         }
123         if visited_len > self.m.visited.len() {
124             let len = self.m.visited.len();
125             self.m.visited.reserve_exact(visited_len - len);
126             for _ in 0..(visited_len - len) {
127                 self.m.visited.push(0);
128             }
129         }
130     }
131 
132     /// Start backtracking at the given position in the input, but also look
133     /// for literal prefixes.
exec_(&mut self, mut at: InputAt, end: usize) -> bool134     fn exec_(&mut self, mut at: InputAt, end: usize) -> bool {
135         self.clear();
136         // If this is an anchored regex at the beginning of the input, then
137         // we're either already done or we only need to try backtracking once.
138         if self.prog.is_anchored_start {
139             return if !at.is_start() { false } else { self.backtrack(at) };
140         }
141         let mut matched = false;
142         loop {
143             if !self.prog.prefixes.is_empty() {
144                 at = match self.input.prefix_at(&self.prog.prefixes, at) {
145                     None => break,
146                     Some(at) => at,
147                 };
148             }
149             matched = self.backtrack(at) || matched;
150             if matched && self.prog.matches.len() == 1 {
151                 return true;
152             }
153             if at.pos() >= end {
154                 break;
155             }
156             at = self.input.at(at.next_pos());
157         }
158         matched
159     }
160 
161     /// The main backtracking loop starting at the given input position.
backtrack(&mut self, start: InputAt) -> bool162     fn backtrack(&mut self, start: InputAt) -> bool {
163         // N.B. We use an explicit stack to avoid recursion.
164         // To avoid excessive pushing and popping, most transitions are handled
165         // in the `step` helper function, which only pushes to the stack when
166         // there's a capture or a branch.
167         let mut matched = false;
168         self.m.jobs.push(Job::Inst { ip: 0, at: start });
169         while let Some(job) = self.m.jobs.pop() {
170             match job {
171                 Job::Inst { ip, at } => {
172                     if self.step(ip, at) {
173                         // Only quit if we're matching one regex.
174                         // If we're matching a regex set, then mush on and
175                         // try to find other matches (if we want them).
176                         if self.prog.matches.len() == 1 {
177                             return true;
178                         }
179                         matched = true;
180                     }
181                 }
182                 Job::SaveRestore { slot, old_pos } => {
183                     if slot < self.slots.len() {
184                         self.slots[slot] = old_pos;
185                     }
186                 }
187             }
188         }
189         matched
190     }
191 
step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool192     fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool {
193         use crate::prog::Inst::*;
194         loop {
195             // This loop is an optimization to avoid constantly pushing/popping
196             // from the stack. Namely, if we're pushing a job only to run it
197             // next, avoid the push and just mutate `ip` (and possibly `at`)
198             // in place.
199             if self.has_visited(ip, at) {
200                 return false;
201             }
202             match self.prog[ip] {
203                 Match(slot) => {
204                     if slot < self.matches.len() {
205                         self.matches[slot] = true;
206                     }
207                     return true;
208                 }
209                 Save(ref inst) => {
210                     if let Some(&old_pos) = self.slots.get(inst.slot) {
211                         // If this path doesn't work out, then we save the old
212                         // capture index (if one exists) in an alternate
213                         // job. If the next path fails, then the alternate
214                         // job is popped and the old capture index is restored.
215                         self.m.jobs.push(Job::SaveRestore {
216                             slot: inst.slot,
217                             old_pos,
218                         });
219                         self.slots[inst.slot] = Some(at.pos());
220                     }
221                     ip = inst.goto;
222                 }
223                 Split(ref inst) => {
224                     self.m.jobs.push(Job::Inst { ip: inst.goto2, at });
225                     ip = inst.goto1;
226                 }
227                 EmptyLook(ref inst) => {
228                     if self.input.is_empty_match(at, inst) {
229                         ip = inst.goto;
230                     } else {
231                         return false;
232                     }
233                 }
234                 Char(ref inst) => {
235                     if inst.c == at.char() {
236                         ip = inst.goto;
237                         at = self.input.at(at.next_pos());
238                     } else {
239                         return false;
240                     }
241                 }
242                 Ranges(ref inst) => {
243                     if inst.matches(at.char()) {
244                         ip = inst.goto;
245                         at = self.input.at(at.next_pos());
246                     } else {
247                         return false;
248                     }
249                 }
250                 Bytes(ref inst) => {
251                     if let Some(b) = at.byte() {
252                         if inst.matches(b) {
253                             ip = inst.goto;
254                             at = self.input.at(at.next_pos());
255                             continue;
256                         }
257                     }
258                     return false;
259                 }
260             }
261         }
262     }
263 
has_visited(&mut self, ip: InstPtr, at: InputAt) -> bool264     fn has_visited(&mut self, ip: InstPtr, at: InputAt) -> bool {
265         let k = ip * (self.input.len() + 1) + at.pos();
266         let k1 = k / BIT_SIZE;
267         let k2 = usize_to_u32(1 << (k & (BIT_SIZE - 1)));
268         if self.m.visited[k1] & k2 == 0 {
269             self.m.visited[k1] |= k2;
270             false
271         } else {
272             true
273         }
274     }
275 }
276 
usize_to_u32(n: usize) -> u32277 fn usize_to_u32(n: usize) -> u32 {
278     if (n as u64) > (::std::u32::MAX as u64) {
279         panic!("BUG: {} is too big to fit into u32", n)
280     }
281     n as u32
282 }
283