• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use state_id::StateID;
2 
3 /// A trait describing the interface of a deterministic finite automaton (DFA).
4 ///
5 /// Every DFA has exactly one start state and at least one dead state (which
6 /// may be the same, as in the case of an empty DFA). In all cases, a state
7 /// identifier of `0` must be a dead state such that `DFA::is_dead_state(0)`
8 /// always returns `true`.
9 ///
10 /// Every DFA also has zero or more match states, such that
11 /// `DFA::is_match_state(id)` returns `true` if and only if `id` corresponds to
12 /// a match state.
13 ///
14 /// In general, users of this trait likely will only need to use the search
15 /// routines such as `is_match`, `shortest_match`, `find` or `rfind`. The other
16 /// methods are lower level and are used for walking the transitions of a DFA
17 /// manually. In particular, the aforementioned search routines are implemented
18 /// generically in terms of the lower level transition walking routines.
19 pub trait DFA {
20     /// The representation used for state identifiers in this DFA.
21     ///
22     /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`.
23     type ID: StateID;
24 
25     /// Return the identifier of this DFA's start state.
start_state(&self) -> Self::ID26     fn start_state(&self) -> Self::ID;
27 
28     /// Returns true if and only if the given identifier corresponds to a match
29     /// state.
is_match_state(&self, id: Self::ID) -> bool30     fn is_match_state(&self, id: Self::ID) -> bool;
31 
32     /// Returns true if and only if the given identifier corresponds to a dead
33     /// state. When a DFA enters a dead state, it is impossible to leave and
34     /// thus can never lead to a match.
is_dead_state(&self, id: Self::ID) -> bool35     fn is_dead_state(&self, id: Self::ID) -> bool;
36 
37     /// Returns true if and only if the given identifier corresponds to either
38     /// a dead state or a match state, such that one of `is_match_state(id)`
39     /// or `is_dead_state(id)` must return true.
40     ///
41     /// Depending on the implementation of the DFA, this routine can be used
42     /// to save a branch in the core matching loop. Nevertheless,
43     /// `is_match_state(id) || is_dead_state(id)` is always a valid
44     /// implementation.
is_match_or_dead_state(&self, id: Self::ID) -> bool45     fn is_match_or_dead_state(&self, id: Self::ID) -> bool;
46 
47     /// Returns true if and only if this DFA is anchored.
48     ///
49     /// When a DFA is anchored, it is only allowed to report matches that
50     /// start at index `0`.
is_anchored(&self) -> bool51     fn is_anchored(&self) -> bool;
52 
53     /// Given the current state that this DFA is in and the next input byte,
54     /// this method returns the identifier of the next state. The identifier
55     /// returned is always valid, but it may correspond to a dead state.
next_state(&self, current: Self::ID, input: u8) -> Self::ID56     fn next_state(&self, current: Self::ID, input: u8) -> Self::ID;
57 
58     /// Like `next_state`, but its implementation may look up the next state
59     /// without memory safety checks such as bounds checks. As such, callers
60     /// must ensure that the given identifier corresponds to a valid DFA
61     /// state. Implementors must, in turn, ensure that this routine is safe
62     /// for all valid state identifiers and for all possible `u8` values.
next_state_unchecked( &self, current: Self::ID, input: u8, ) -> Self::ID63     unsafe fn next_state_unchecked(
64         &self,
65         current: Self::ID,
66         input: u8,
67     ) -> Self::ID;
68 
69     /// Returns true if and only if the given bytes match this DFA.
70     ///
71     /// This routine may short circuit if it knows that scanning future input
72     /// will never lead to a different result. In particular, if a DFA enters
73     /// a match state or a dead state, then this routine will return `true` or
74     /// `false`, respectively, without inspecting any future input.
75     ///
76     /// # Example
77     ///
78     /// This example shows how to use this method with a
79     /// [`DenseDFA`](enum.DenseDFA.html).
80     ///
81     /// ```
82     /// use regex_automata::{DFA, DenseDFA};
83     ///
84     /// # fn example() -> Result<(), regex_automata::Error> {
85     /// let dfa = DenseDFA::new("foo[0-9]+bar")?;
86     /// assert_eq!(true, dfa.is_match(b"foo12345bar"));
87     /// assert_eq!(false, dfa.is_match(b"foobar"));
88     /// # Ok(()) }; example().unwrap()
89     /// ```
90     #[inline]
is_match(&self, bytes: &[u8]) -> bool91     fn is_match(&self, bytes: &[u8]) -> bool {
92         self.is_match_at(bytes, 0)
93     }
94 
95     /// Returns the first position at which a match is found.
96     ///
97     /// This routine stops scanning input in precisely the same circumstances
98     /// as `is_match`. The key difference is that this routine returns the
99     /// position at which it stopped scanning input if and only if a match
100     /// was found. If no match is found, then `None` is returned.
101     ///
102     /// # Example
103     ///
104     /// This example shows how to use this method with a
105     /// [`DenseDFA`](enum.DenseDFA.html).
106     ///
107     /// ```
108     /// use regex_automata::{DFA, DenseDFA};
109     ///
110     /// # fn example() -> Result<(), regex_automata::Error> {
111     /// let dfa = DenseDFA::new("foo[0-9]+")?;
112     /// assert_eq!(Some(4), dfa.shortest_match(b"foo12345"));
113     ///
114     /// // Normally, the end of the leftmost first match here would be 3,
115     /// // but the shortest match semantics detect a match earlier.
116     /// let dfa = DenseDFA::new("abc|a")?;
117     /// assert_eq!(Some(1), dfa.shortest_match(b"abc"));
118     /// # Ok(()) }; example().unwrap()
119     /// ```
120     #[inline]
shortest_match(&self, bytes: &[u8]) -> Option<usize>121     fn shortest_match(&self, bytes: &[u8]) -> Option<usize> {
122         self.shortest_match_at(bytes, 0)
123     }
124 
125     /// Returns the end offset of the longest match. If no match exists,
126     /// then `None` is returned.
127     ///
128     /// Implementors of this trait are not required to implement any particular
129     /// match semantics (such as leftmost-first), which are instead manifest in
130     /// the DFA's topology itself.
131     ///
132     /// In particular, this method must continue searching even after it
133     /// enters a match state. The search should only terminate once it has
134     /// reached the end of the input or when it has entered a dead state. Upon
135     /// termination, the position of the last byte seen while still in a match
136     /// state is returned.
137     ///
138     /// # Example
139     ///
140     /// This example shows how to use this method with a
141     /// [`DenseDFA`](enum.DenseDFA.html). By default, a dense DFA uses
142     /// "leftmost first" match semantics.
143     ///
144     /// Leftmost first match semantics corresponds to the match with the
145     /// smallest starting offset, but where the end offset is determined by
146     /// preferring earlier branches in the original regular expression. For
147     /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam`
148     /// will match `Samwise` in `Samwise`.
149     ///
150     /// Generally speaking, the "leftmost first" match is how most backtracking
151     /// regular expressions tend to work. This is in contrast to POSIX-style
152     /// regular expressions that yield "leftmost longest" matches. Namely,
153     /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
154     /// leftmost longest semantics.
155     ///
156     /// ```
157     /// use regex_automata::{DFA, DenseDFA};
158     ///
159     /// # fn example() -> Result<(), regex_automata::Error> {
160     /// let dfa = DenseDFA::new("foo[0-9]+")?;
161     /// assert_eq!(Some(8), dfa.find(b"foo12345"));
162     ///
163     /// // Even though a match is found after reading the first byte (`a`),
164     /// // the leftmost first match semantics demand that we find the earliest
165     /// // match that prefers earlier parts of the pattern over latter parts.
166     /// let dfa = DenseDFA::new("abc|a")?;
167     /// assert_eq!(Some(3), dfa.find(b"abc"));
168     /// # Ok(()) }; example().unwrap()
169     /// ```
170     #[inline]
find(&self, bytes: &[u8]) -> Option<usize>171     fn find(&self, bytes: &[u8]) -> Option<usize> {
172         self.find_at(bytes, 0)
173     }
174 
175     /// Returns the start offset of the longest match in reverse, by searching
176     /// from the end of the input towards the start of the input. If no match
177     /// exists, then `None` is returned. In other words, this has the same
178     /// match semantics as `find`, but in reverse.
179     ///
180     /// # Example
181     ///
182     /// This example shows how to use this method with a
183     /// [`DenseDFA`](enum.DenseDFA.html). In particular, this routine
184     /// is principally useful when used in conjunction with the
185     /// [`dense::Builder::reverse`](dense/struct.Builder.html#method.reverse)
186     /// configuration knob. In general, it's unlikely to be correct to use both
187     /// `find` and `rfind` with the same DFA since any particular DFA will only
188     /// support searching in one direction.
189     ///
190     /// ```
191     /// use regex_automata::{dense, DFA};
192     ///
193     /// # fn example() -> Result<(), regex_automata::Error> {
194     /// let dfa = dense::Builder::new().reverse(true).build("foo[0-9]+")?;
195     /// assert_eq!(Some(0), dfa.rfind(b"foo12345"));
196     /// # Ok(()) }; example().unwrap()
197     /// ```
198     #[inline]
rfind(&self, bytes: &[u8]) -> Option<usize>199     fn rfind(&self, bytes: &[u8]) -> Option<usize> {
200         self.rfind_at(bytes, bytes.len())
201     }
202 
203     /// Returns the same as `is_match`, but starts the search at the given
204     /// offset.
205     ///
206     /// The significance of the starting point is that it takes the surrounding
207     /// context into consideration. For example, if the DFA is anchored, then
208     /// a match can only occur when `start == 0`.
209     #[inline]
is_match_at(&self, bytes: &[u8], start: usize) -> bool210     fn is_match_at(&self, bytes: &[u8], start: usize) -> bool {
211         if self.is_anchored() && start > 0 {
212             return false;
213         }
214 
215         let mut state = self.start_state();
216         if self.is_match_or_dead_state(state) {
217             return self.is_match_state(state);
218         }
219         for &b in bytes[start..].iter() {
220             state = unsafe { self.next_state_unchecked(state, b) };
221             if self.is_match_or_dead_state(state) {
222                 return self.is_match_state(state);
223             }
224         }
225         false
226     }
227 
228     /// Returns the same as `shortest_match`, but starts the search at the
229     /// given offset.
230     ///
231     /// The significance of the starting point is that it takes the surrounding
232     /// context into consideration. For example, if the DFA is anchored, then
233     /// a match can only occur when `start == 0`.
234     #[inline]
shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize>235     fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
236         if self.is_anchored() && start > 0 {
237             return None;
238         }
239 
240         let mut state = self.start_state();
241         if self.is_match_or_dead_state(state) {
242             return if self.is_dead_state(state) { None } else { Some(start) };
243         }
244         for (i, &b) in bytes[start..].iter().enumerate() {
245             state = unsafe { self.next_state_unchecked(state, b) };
246             if self.is_match_or_dead_state(state) {
247                 return if self.is_dead_state(state) {
248                     None
249                 } else {
250                     Some(start + i + 1)
251                 };
252             }
253         }
254         None
255     }
256 
257     /// Returns the same as `find`, but starts the search at the given
258     /// offset.
259     ///
260     /// The significance of the starting point is that it takes the surrounding
261     /// context into consideration. For example, if the DFA is anchored, then
262     /// a match can only occur when `start == 0`.
263     #[inline]
find_at(&self, bytes: &[u8], start: usize) -> Option<usize>264     fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
265         if self.is_anchored() && start > 0 {
266             return None;
267         }
268 
269         let mut state = self.start_state();
270         let mut last_match = if self.is_dead_state(state) {
271             return None;
272         } else if self.is_match_state(state) {
273             Some(start)
274         } else {
275             None
276         };
277         for (i, &b) in bytes[start..].iter().enumerate() {
278             state = unsafe { self.next_state_unchecked(state, b) };
279             if self.is_match_or_dead_state(state) {
280                 if self.is_dead_state(state) {
281                     return last_match;
282                 }
283                 last_match = Some(start + i + 1);
284             }
285         }
286         last_match
287     }
288 
289     /// Returns the same as `rfind`, but starts the search at the given
290     /// offset.
291     ///
292     /// The significance of the starting point is that it takes the surrounding
293     /// context into consideration. For example, if the DFA is anchored, then
294     /// a match can only occur when `start == bytes.len()`.
295     #[inline(never)]
rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize>296     fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
297         if self.is_anchored() && start < bytes.len() {
298             return None;
299         }
300 
301         let mut state = self.start_state();
302         let mut last_match = if self.is_dead_state(state) {
303             return None;
304         } else if self.is_match_state(state) {
305             Some(start)
306         } else {
307             None
308         };
309         for (i, &b) in bytes[..start].iter().enumerate().rev() {
310             state = unsafe { self.next_state_unchecked(state, b) };
311             if self.is_match_or_dead_state(state) {
312                 if self.is_dead_state(state) {
313                     return last_match;
314                 }
315                 last_match = Some(i);
316             }
317         }
318         last_match
319     }
320 }
321 
322 impl<'a, T: DFA> DFA for &'a T {
323     type ID = T::ID;
324 
325     #[inline]
start_state(&self) -> Self::ID326     fn start_state(&self) -> Self::ID {
327         (**self).start_state()
328     }
329 
330     #[inline]
is_match_state(&self, id: Self::ID) -> bool331     fn is_match_state(&self, id: Self::ID) -> bool {
332         (**self).is_match_state(id)
333     }
334 
335     #[inline]
is_match_or_dead_state(&self, id: Self::ID) -> bool336     fn is_match_or_dead_state(&self, id: Self::ID) -> bool {
337         (**self).is_match_or_dead_state(id)
338     }
339 
340     #[inline]
is_dead_state(&self, id: Self::ID) -> bool341     fn is_dead_state(&self, id: Self::ID) -> bool {
342         (**self).is_dead_state(id)
343     }
344 
345     #[inline]
is_anchored(&self) -> bool346     fn is_anchored(&self) -> bool {
347         (**self).is_anchored()
348     }
349 
350     #[inline]
next_state(&self, current: Self::ID, input: u8) -> Self::ID351     fn next_state(&self, current: Self::ID, input: u8) -> Self::ID {
352         (**self).next_state(current, input)
353     }
354 
355     #[inline]
next_state_unchecked( &self, current: Self::ID, input: u8, ) -> Self::ID356     unsafe fn next_state_unchecked(
357         &self,
358         current: Self::ID,
359         input: u8,
360     ) -> Self::ID {
361         (**self).next_state_unchecked(current, input)
362     }
363 }
364