1 use state_id::StateID; 2 3 /// A trait describing the interface of a deterministic finite automaton (DFA). 4 /// 5 /// Every DFA has exactly one start state and at least one dead state (which 6 /// may be the same, as in the case of an empty DFA). In all cases, a state 7 /// identifier of `0` must be a dead state such that `DFA::is_dead_state(0)` 8 /// always returns `true`. 9 /// 10 /// Every DFA also has zero or more match states, such that 11 /// `DFA::is_match_state(id)` returns `true` if and only if `id` corresponds to 12 /// a match state. 13 /// 14 /// In general, users of this trait likely will only need to use the search 15 /// routines such as `is_match`, `shortest_match`, `find` or `rfind`. The other 16 /// methods are lower level and are used for walking the transitions of a DFA 17 /// manually. In particular, the aforementioned search routines are implemented 18 /// generically in terms of the lower level transition walking routines. 19 pub trait DFA { 20 /// The representation used for state identifiers in this DFA. 21 /// 22 /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`. 23 type ID: StateID; 24 25 /// Return the identifier of this DFA's start state. start_state(&self) -> Self::ID26 fn start_state(&self) -> Self::ID; 27 28 /// Returns true if and only if the given identifier corresponds to a match 29 /// state. is_match_state(&self, id: Self::ID) -> bool30 fn is_match_state(&self, id: Self::ID) -> bool; 31 32 /// Returns true if and only if the given identifier corresponds to a dead 33 /// state. When a DFA enters a dead state, it is impossible to leave and 34 /// thus can never lead to a match. is_dead_state(&self, id: Self::ID) -> bool35 fn is_dead_state(&self, id: Self::ID) -> bool; 36 37 /// Returns true if and only if the given identifier corresponds to either 38 /// a dead state or a match state, such that one of `is_match_state(id)` 39 /// or `is_dead_state(id)` must return true. 40 /// 41 /// Depending on the implementation of the DFA, this routine can be used 42 /// to save a branch in the core matching loop. Nevertheless, 43 /// `is_match_state(id) || is_dead_state(id)` is always a valid 44 /// implementation. is_match_or_dead_state(&self, id: Self::ID) -> bool45 fn is_match_or_dead_state(&self, id: Self::ID) -> bool; 46 47 /// Returns true if and only if this DFA is anchored. 48 /// 49 /// When a DFA is anchored, it is only allowed to report matches that 50 /// start at index `0`. is_anchored(&self) -> bool51 fn is_anchored(&self) -> bool; 52 53 /// Given the current state that this DFA is in and the next input byte, 54 /// this method returns the identifier of the next state. The identifier 55 /// returned is always valid, but it may correspond to a dead state. next_state(&self, current: Self::ID, input: u8) -> Self::ID56 fn next_state(&self, current: Self::ID, input: u8) -> Self::ID; 57 58 /// Like `next_state`, but its implementation may look up the next state 59 /// without memory safety checks such as bounds checks. As such, callers 60 /// must ensure that the given identifier corresponds to a valid DFA 61 /// state. Implementors must, in turn, ensure that this routine is safe 62 /// for all valid state identifiers and for all possible `u8` values. next_state_unchecked( &self, current: Self::ID, input: u8, ) -> Self::ID63 unsafe fn next_state_unchecked( 64 &self, 65 current: Self::ID, 66 input: u8, 67 ) -> Self::ID; 68 69 /// Returns true if and only if the given bytes match this DFA. 70 /// 71 /// This routine may short circuit if it knows that scanning future input 72 /// will never lead to a different result. In particular, if a DFA enters 73 /// a match state or a dead state, then this routine will return `true` or 74 /// `false`, respectively, without inspecting any future input. 75 /// 76 /// # Example 77 /// 78 /// This example shows how to use this method with a 79 /// [`DenseDFA`](enum.DenseDFA.html). 80 /// 81 /// ``` 82 /// use regex_automata::{DFA, DenseDFA}; 83 /// 84 /// # fn example() -> Result<(), regex_automata::Error> { 85 /// let dfa = DenseDFA::new("foo[0-9]+bar")?; 86 /// assert_eq!(true, dfa.is_match(b"foo12345bar")); 87 /// assert_eq!(false, dfa.is_match(b"foobar")); 88 /// # Ok(()) }; example().unwrap() 89 /// ``` 90 #[inline] is_match(&self, bytes: &[u8]) -> bool91 fn is_match(&self, bytes: &[u8]) -> bool { 92 self.is_match_at(bytes, 0) 93 } 94 95 /// Returns the first position at which a match is found. 96 /// 97 /// This routine stops scanning input in precisely the same circumstances 98 /// as `is_match`. The key difference is that this routine returns the 99 /// position at which it stopped scanning input if and only if a match 100 /// was found. If no match is found, then `None` is returned. 101 /// 102 /// # Example 103 /// 104 /// This example shows how to use this method with a 105 /// [`DenseDFA`](enum.DenseDFA.html). 106 /// 107 /// ``` 108 /// use regex_automata::{DFA, DenseDFA}; 109 /// 110 /// # fn example() -> Result<(), regex_automata::Error> { 111 /// let dfa = DenseDFA::new("foo[0-9]+")?; 112 /// assert_eq!(Some(4), dfa.shortest_match(b"foo12345")); 113 /// 114 /// // Normally, the end of the leftmost first match here would be 3, 115 /// // but the shortest match semantics detect a match earlier. 116 /// let dfa = DenseDFA::new("abc|a")?; 117 /// assert_eq!(Some(1), dfa.shortest_match(b"abc")); 118 /// # Ok(()) }; example().unwrap() 119 /// ``` 120 #[inline] shortest_match(&self, bytes: &[u8]) -> Option<usize>121 fn shortest_match(&self, bytes: &[u8]) -> Option<usize> { 122 self.shortest_match_at(bytes, 0) 123 } 124 125 /// Returns the end offset of the longest match. If no match exists, 126 /// then `None` is returned. 127 /// 128 /// Implementors of this trait are not required to implement any particular 129 /// match semantics (such as leftmost-first), which are instead manifest in 130 /// the DFA's topology itself. 131 /// 132 /// In particular, this method must continue searching even after it 133 /// enters a match state. The search should only terminate once it has 134 /// reached the end of the input or when it has entered a dead state. Upon 135 /// termination, the position of the last byte seen while still in a match 136 /// state is returned. 137 /// 138 /// # Example 139 /// 140 /// This example shows how to use this method with a 141 /// [`DenseDFA`](enum.DenseDFA.html). By default, a dense DFA uses 142 /// "leftmost first" match semantics. 143 /// 144 /// Leftmost first match semantics corresponds to the match with the 145 /// smallest starting offset, but where the end offset is determined by 146 /// preferring earlier branches in the original regular expression. For 147 /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` 148 /// will match `Samwise` in `Samwise`. 149 /// 150 /// Generally speaking, the "leftmost first" match is how most backtracking 151 /// regular expressions tend to work. This is in contrast to POSIX-style 152 /// regular expressions that yield "leftmost longest" matches. Namely, 153 /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using 154 /// leftmost longest semantics. 155 /// 156 /// ``` 157 /// use regex_automata::{DFA, DenseDFA}; 158 /// 159 /// # fn example() -> Result<(), regex_automata::Error> { 160 /// let dfa = DenseDFA::new("foo[0-9]+")?; 161 /// assert_eq!(Some(8), dfa.find(b"foo12345")); 162 /// 163 /// // Even though a match is found after reading the first byte (`a`), 164 /// // the leftmost first match semantics demand that we find the earliest 165 /// // match that prefers earlier parts of the pattern over latter parts. 166 /// let dfa = DenseDFA::new("abc|a")?; 167 /// assert_eq!(Some(3), dfa.find(b"abc")); 168 /// # Ok(()) }; example().unwrap() 169 /// ``` 170 #[inline] find(&self, bytes: &[u8]) -> Option<usize>171 fn find(&self, bytes: &[u8]) -> Option<usize> { 172 self.find_at(bytes, 0) 173 } 174 175 /// Returns the start offset of the longest match in reverse, by searching 176 /// from the end of the input towards the start of the input. If no match 177 /// exists, then `None` is returned. In other words, this has the same 178 /// match semantics as `find`, but in reverse. 179 /// 180 /// # Example 181 /// 182 /// This example shows how to use this method with a 183 /// [`DenseDFA`](enum.DenseDFA.html). In particular, this routine 184 /// is principally useful when used in conjunction with the 185 /// [`dense::Builder::reverse`](dense/struct.Builder.html#method.reverse) 186 /// configuration knob. In general, it's unlikely to be correct to use both 187 /// `find` and `rfind` with the same DFA since any particular DFA will only 188 /// support searching in one direction. 189 /// 190 /// ``` 191 /// use regex_automata::{dense, DFA}; 192 /// 193 /// # fn example() -> Result<(), regex_automata::Error> { 194 /// let dfa = dense::Builder::new().reverse(true).build("foo[0-9]+")?; 195 /// assert_eq!(Some(0), dfa.rfind(b"foo12345")); 196 /// # Ok(()) }; example().unwrap() 197 /// ``` 198 #[inline] rfind(&self, bytes: &[u8]) -> Option<usize>199 fn rfind(&self, bytes: &[u8]) -> Option<usize> { 200 self.rfind_at(bytes, bytes.len()) 201 } 202 203 /// Returns the same as `is_match`, but starts the search at the given 204 /// offset. 205 /// 206 /// The significance of the starting point is that it takes the surrounding 207 /// context into consideration. For example, if the DFA is anchored, then 208 /// a match can only occur when `start == 0`. 209 #[inline] is_match_at(&self, bytes: &[u8], start: usize) -> bool210 fn is_match_at(&self, bytes: &[u8], start: usize) -> bool { 211 if self.is_anchored() && start > 0 { 212 return false; 213 } 214 215 let mut state = self.start_state(); 216 if self.is_match_or_dead_state(state) { 217 return self.is_match_state(state); 218 } 219 for &b in bytes[start..].iter() { 220 state = unsafe { self.next_state_unchecked(state, b) }; 221 if self.is_match_or_dead_state(state) { 222 return self.is_match_state(state); 223 } 224 } 225 false 226 } 227 228 /// Returns the same as `shortest_match`, but starts the search at the 229 /// given offset. 230 /// 231 /// The significance of the starting point is that it takes the surrounding 232 /// context into consideration. For example, if the DFA is anchored, then 233 /// a match can only occur when `start == 0`. 234 #[inline] shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize>235 fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> { 236 if self.is_anchored() && start > 0 { 237 return None; 238 } 239 240 let mut state = self.start_state(); 241 if self.is_match_or_dead_state(state) { 242 return if self.is_dead_state(state) { None } else { Some(start) }; 243 } 244 for (i, &b) in bytes[start..].iter().enumerate() { 245 state = unsafe { self.next_state_unchecked(state, b) }; 246 if self.is_match_or_dead_state(state) { 247 return if self.is_dead_state(state) { 248 None 249 } else { 250 Some(start + i + 1) 251 }; 252 } 253 } 254 None 255 } 256 257 /// Returns the same as `find`, but starts the search at the given 258 /// offset. 259 /// 260 /// The significance of the starting point is that it takes the surrounding 261 /// context into consideration. For example, if the DFA is anchored, then 262 /// a match can only occur when `start == 0`. 263 #[inline] find_at(&self, bytes: &[u8], start: usize) -> Option<usize>264 fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> { 265 if self.is_anchored() && start > 0 { 266 return None; 267 } 268 269 let mut state = self.start_state(); 270 let mut last_match = if self.is_dead_state(state) { 271 return None; 272 } else if self.is_match_state(state) { 273 Some(start) 274 } else { 275 None 276 }; 277 for (i, &b) in bytes[start..].iter().enumerate() { 278 state = unsafe { self.next_state_unchecked(state, b) }; 279 if self.is_match_or_dead_state(state) { 280 if self.is_dead_state(state) { 281 return last_match; 282 } 283 last_match = Some(start + i + 1); 284 } 285 } 286 last_match 287 } 288 289 /// Returns the same as `rfind`, but starts the search at the given 290 /// offset. 291 /// 292 /// The significance of the starting point is that it takes the surrounding 293 /// context into consideration. For example, if the DFA is anchored, then 294 /// a match can only occur when `start == bytes.len()`. 295 #[inline(never)] rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize>296 fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> { 297 if self.is_anchored() && start < bytes.len() { 298 return None; 299 } 300 301 let mut state = self.start_state(); 302 let mut last_match = if self.is_dead_state(state) { 303 return None; 304 } else if self.is_match_state(state) { 305 Some(start) 306 } else { 307 None 308 }; 309 for (i, &b) in bytes[..start].iter().enumerate().rev() { 310 state = unsafe { self.next_state_unchecked(state, b) }; 311 if self.is_match_or_dead_state(state) { 312 if self.is_dead_state(state) { 313 return last_match; 314 } 315 last_match = Some(i); 316 } 317 } 318 last_match 319 } 320 } 321 322 impl<'a, T: DFA> DFA for &'a T { 323 type ID = T::ID; 324 325 #[inline] start_state(&self) -> Self::ID326 fn start_state(&self) -> Self::ID { 327 (**self).start_state() 328 } 329 330 #[inline] is_match_state(&self, id: Self::ID) -> bool331 fn is_match_state(&self, id: Self::ID) -> bool { 332 (**self).is_match_state(id) 333 } 334 335 #[inline] is_match_or_dead_state(&self, id: Self::ID) -> bool336 fn is_match_or_dead_state(&self, id: Self::ID) -> bool { 337 (**self).is_match_or_dead_state(id) 338 } 339 340 #[inline] is_dead_state(&self, id: Self::ID) -> bool341 fn is_dead_state(&self, id: Self::ID) -> bool { 342 (**self).is_dead_state(id) 343 } 344 345 #[inline] is_anchored(&self) -> bool346 fn is_anchored(&self) -> bool { 347 (**self).is_anchored() 348 } 349 350 #[inline] next_state(&self, current: Self::ID, input: u8) -> Self::ID351 fn next_state(&self, current: Self::ID, input: u8) -> Self::ID { 352 (**self).next_state(current, input) 353 } 354 355 #[inline] next_state_unchecked( &self, current: Self::ID, input: u8, ) -> Self::ID356 unsafe fn next_state_unchecked( 357 &self, 358 current: Self::ID, 359 input: u8, 360 ) -> Self::ID { 361 (**self).next_state_unchecked(current, input) 362 } 363 } 364