1 use std::cmp::Ordering; 2 use std::collections::HashMap; 3 use std::fmt; 4 use std::mem; 5 use std::ops::Deref; 6 use std::slice; 7 use std::sync::Arc; 8 9 use crate::input::Char; 10 use crate::literal::LiteralSearcher; 11 12 /// `InstPtr` represents the index of an instruction in a regex program. 13 pub type InstPtr = usize; 14 15 /// Program is a sequence of instructions and various facts about thos 16 /// instructions. 17 #[derive(Clone)] 18 pub struct Program { 19 /// A sequence of instructions that represents an NFA. 20 pub insts: Vec<Inst>, 21 /// Pointers to each Match instruction in the sequence. 22 /// 23 /// This is always length 1 unless this program represents a regex set. 24 pub matches: Vec<InstPtr>, 25 /// The ordered sequence of all capture groups extracted from the AST. 26 /// Unnamed groups are `None`. 27 pub captures: Vec<Option<String>>, 28 /// Pointers to all named capture groups into `captures`. 29 pub capture_name_idx: Arc<HashMap<String, usize>>, 30 /// A pointer to the start instruction. This can vary depending on how 31 /// the program was compiled. For example, programs for use with the DFA 32 /// engine have a `.*?` inserted at the beginning of unanchored regular 33 /// expressions. The actual starting point of the program is after the 34 /// `.*?`. 35 pub start: InstPtr, 36 /// A set of equivalence classes for discriminating bytes in the compiled 37 /// program. 38 pub byte_classes: Vec<u8>, 39 /// When true, this program can only match valid UTF-8. 40 pub only_utf8: bool, 41 /// When true, this program uses byte range instructions instead of Unicode 42 /// range instructions. 43 pub is_bytes: bool, 44 /// When true, the program is compiled for DFA matching. For example, this 45 /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored 46 /// regexes. 47 pub is_dfa: bool, 48 /// When true, the program matches text in reverse (for use only in the 49 /// DFA). 50 pub is_reverse: bool, 51 /// Whether the regex must match from the start of the input. 52 pub is_anchored_start: bool, 53 /// Whether the regex must match at the end of the input. 54 pub is_anchored_end: bool, 55 /// Whether this program contains a Unicode word boundary instruction. 56 pub has_unicode_word_boundary: bool, 57 /// A possibly empty machine for very quickly matching prefix literals. 58 pub prefixes: LiteralSearcher, 59 /// A limit on the size of the cache that the DFA is allowed to use while 60 /// matching. 61 /// 62 /// The cache limit specifies approximately how much space we're willing to 63 /// give to the state cache. Once the state cache exceeds the size, it is 64 /// wiped and all states must be re-computed. 65 /// 66 /// Note that this value does not impact correctness. It can be set to 0 67 /// and the DFA will run just fine. (It will only ever store exactly one 68 /// state in the cache, and will likely run very slowly, but it will work.) 69 /// 70 /// Also note that this limit is *per thread of execution*. That is, 71 /// if the same regex is used to search text across multiple threads 72 /// simultaneously, then the DFA cache is not shared. Instead, copies are 73 /// made. 74 pub dfa_size_limit: usize, 75 } 76 77 impl Program { 78 /// Creates an empty instruction sequence. Fields are given default 79 /// values. new() -> Self80 pub fn new() -> Self { 81 Program { 82 insts: vec![], 83 matches: vec![], 84 captures: vec![], 85 capture_name_idx: Arc::new(HashMap::new()), 86 start: 0, 87 byte_classes: vec![0; 256], 88 only_utf8: true, 89 is_bytes: false, 90 is_dfa: false, 91 is_reverse: false, 92 is_anchored_start: false, 93 is_anchored_end: false, 94 has_unicode_word_boundary: false, 95 prefixes: LiteralSearcher::empty(), 96 dfa_size_limit: 2 * (1 << 20), 97 } 98 } 99 100 /// If pc is an index to a no-op instruction (like Save), then return the 101 /// next pc that is not a no-op instruction. skip(&self, mut pc: usize) -> usize102 pub fn skip(&self, mut pc: usize) -> usize { 103 loop { 104 match self[pc] { 105 Inst::Save(ref i) => pc = i.goto, 106 _ => return pc, 107 } 108 } 109 } 110 111 /// Return true if and only if an execution engine at instruction `pc` will 112 /// always lead to a match. leads_to_match(&self, pc: usize) -> bool113 pub fn leads_to_match(&self, pc: usize) -> bool { 114 if self.matches.len() > 1 { 115 // If we have a regex set, then we have more than one ending 116 // state, so leading to one of those states is generally 117 // meaningless. 118 return false; 119 } 120 match self[self.skip(pc)] { 121 Inst::Match(_) => true, 122 _ => false, 123 } 124 } 125 126 /// Returns true if the current configuration demands that an implicit 127 /// `.*?` be prepended to the instruction sequence. needs_dotstar(&self) -> bool128 pub fn needs_dotstar(&self) -> bool { 129 self.is_dfa && !self.is_reverse && !self.is_anchored_start 130 } 131 132 /// Returns true if this program uses Byte instructions instead of 133 /// Char/Range instructions. uses_bytes(&self) -> bool134 pub fn uses_bytes(&self) -> bool { 135 self.is_bytes || self.is_dfa 136 } 137 138 /// Returns true if this program exclusively matches valid UTF-8 bytes. 139 /// 140 /// That is, if an invalid UTF-8 byte is seen, then no match is possible. only_utf8(&self) -> bool141 pub fn only_utf8(&self) -> bool { 142 self.only_utf8 143 } 144 145 /// Return the approximate heap usage of this instruction sequence in 146 /// bytes. approximate_size(&self) -> usize147 pub fn approximate_size(&self) -> usize { 148 // The only instruction that uses heap space is Ranges (for 149 // Unicode codepoint programs) to store non-overlapping codepoint 150 // ranges. To keep this operation constant time, we ignore them. 151 (self.len() * mem::size_of::<Inst>()) 152 + (self.matches.len() * mem::size_of::<InstPtr>()) 153 + (self.captures.len() * mem::size_of::<Option<String>>()) 154 + (self.capture_name_idx.len() 155 * (mem::size_of::<String>() + mem::size_of::<usize>())) 156 + (self.byte_classes.len() * mem::size_of::<u8>()) 157 + self.prefixes.approximate_size() 158 } 159 } 160 161 impl Deref for Program { 162 type Target = [Inst]; 163 164 #[cfg_attr(feature = "perf-inline", inline(always))] deref(&self) -> &Self::Target165 fn deref(&self) -> &Self::Target { 166 &*self.insts 167 } 168 } 169 170 impl fmt::Debug for Program { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result171 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 172 use self::Inst::*; 173 174 fn with_goto(cur: usize, goto: usize, fmtd: String) -> String { 175 if goto == cur + 1 { 176 fmtd 177 } else { 178 format!("{} (goto: {})", fmtd, goto) 179 } 180 } 181 182 fn visible_byte(b: u8) -> String { 183 use std::ascii::escape_default; 184 let escaped = escape_default(b).collect::<Vec<u8>>(); 185 String::from_utf8_lossy(&escaped).into_owned() 186 } 187 188 for (pc, inst) in self.iter().enumerate() { 189 match *inst { 190 Match(slot) => write!(f, "{:04} Match({:?})", pc, slot)?, 191 Save(ref inst) => { 192 let s = format!("{:04} Save({})", pc, inst.slot); 193 write!(f, "{}", with_goto(pc, inst.goto, s))?; 194 } 195 Split(ref inst) => { 196 write!( 197 f, 198 "{:04} Split({}, {})", 199 pc, inst.goto1, inst.goto2 200 )?; 201 } 202 EmptyLook(ref inst) => { 203 let s = format!("{:?}", inst.look); 204 write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; 205 } 206 Char(ref inst) => { 207 let s = format!("{:?}", inst.c); 208 write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; 209 } 210 Ranges(ref inst) => { 211 let ranges = inst 212 .ranges 213 .iter() 214 .map(|r| format!("{:?}-{:?}", r.0, r.1)) 215 .collect::<Vec<String>>() 216 .join(", "); 217 write!( 218 f, 219 "{:04} {}", 220 pc, 221 with_goto(pc, inst.goto, ranges) 222 )?; 223 } 224 Bytes(ref inst) => { 225 let s = format!( 226 "Bytes({}, {})", 227 visible_byte(inst.start), 228 visible_byte(inst.end) 229 ); 230 write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; 231 } 232 } 233 if pc == self.start { 234 write!(f, " (start)")?; 235 } 236 writeln!(f)?; 237 } 238 Ok(()) 239 } 240 } 241 242 impl<'a> IntoIterator for &'a Program { 243 type Item = &'a Inst; 244 type IntoIter = slice::Iter<'a, Inst>; into_iter(self) -> Self::IntoIter245 fn into_iter(self) -> Self::IntoIter { 246 self.iter() 247 } 248 } 249 250 /// Inst is an instruction code in a Regex program. 251 /// 252 /// Regrettably, a regex program either contains Unicode codepoint 253 /// instructions (Char and Ranges) or it contains byte instructions (Bytes). 254 /// A regex program can never contain both. 255 /// 256 /// It would be worth investigating splitting this into two distinct types and 257 /// then figuring out how to make the matching engines polymorphic over those 258 /// types without sacrificing performance. 259 /// 260 /// Other than the benefit of moving invariants into the type system, another 261 /// benefit is the decreased size. If we remove the `Char` and `Ranges` 262 /// instructions from the `Inst` enum, then its size shrinks from 32 bytes to 263 /// 24 bytes. (This is because of the removal of a `Box<[]>` in the `Ranges` 264 /// variant.) Given that byte based machines are typically much bigger than 265 /// their Unicode analogues (because they can decode UTF-8 directly), this ends 266 /// up being a pretty significant savings. 267 #[derive(Clone, Debug)] 268 pub enum Inst { 269 /// Match indicates that the program has reached a match state. 270 /// 271 /// The number in the match corresponds to the Nth logical regular 272 /// expression in this program. This index is always 0 for normal regex 273 /// programs. Values greater than 0 appear when compiling regex sets, and 274 /// each match instruction gets its own unique value. The value corresponds 275 /// to the Nth regex in the set. 276 Match(usize), 277 /// Save causes the program to save the current location of the input in 278 /// the slot indicated by InstSave. 279 Save(InstSave), 280 /// Split causes the program to diverge to one of two paths in the 281 /// program, preferring goto1 in InstSplit. 282 Split(InstSplit), 283 /// EmptyLook represents a zero-width assertion in a regex program. A 284 /// zero-width assertion does not consume any of the input text. 285 EmptyLook(InstEmptyLook), 286 /// Char requires the regex program to match the character in InstChar at 287 /// the current position in the input. 288 Char(InstChar), 289 /// Ranges requires the regex program to match the character at the current 290 /// position in the input with one of the ranges specified in InstRanges. 291 Ranges(InstRanges), 292 /// Bytes is like Ranges, except it expresses a single byte range. It is 293 /// used in conjunction with Split instructions to implement multi-byte 294 /// character classes. 295 Bytes(InstBytes), 296 } 297 298 impl Inst { 299 /// Returns true if and only if this is a match instruction. is_match(&self) -> bool300 pub fn is_match(&self) -> bool { 301 match *self { 302 Inst::Match(_) => true, 303 _ => false, 304 } 305 } 306 } 307 308 /// Representation of the Save instruction. 309 #[derive(Clone, Debug)] 310 pub struct InstSave { 311 /// The next location to execute in the program. 312 pub goto: InstPtr, 313 /// The capture slot (there are two slots for every capture in a regex, 314 /// including the zeroth capture for the entire match). 315 pub slot: usize, 316 } 317 318 /// Representation of the Split instruction. 319 #[derive(Clone, Debug)] 320 pub struct InstSplit { 321 /// The first instruction to try. A match resulting from following goto1 322 /// has precedence over a match resulting from following goto2. 323 pub goto1: InstPtr, 324 /// The second instruction to try. A match resulting from following goto1 325 /// has precedence over a match resulting from following goto2. 326 pub goto2: InstPtr, 327 } 328 329 /// Representation of the `EmptyLook` instruction. 330 #[derive(Clone, Debug)] 331 pub struct InstEmptyLook { 332 /// The next location to execute in the program if this instruction 333 /// succeeds. 334 pub goto: InstPtr, 335 /// The type of zero-width assertion to check. 336 pub look: EmptyLook, 337 } 338 339 /// The set of zero-width match instructions. 340 #[derive(Clone, Copy, Debug, PartialEq, Eq)] 341 pub enum EmptyLook { 342 /// Start of line or input. 343 StartLine, 344 /// End of line or input. 345 EndLine, 346 /// Start of input. 347 StartText, 348 /// End of input. 349 EndText, 350 /// Word character on one side and non-word character on other. 351 WordBoundary, 352 /// Word character on both sides or non-word character on both sides. 353 NotWordBoundary, 354 /// ASCII word boundary. 355 WordBoundaryAscii, 356 /// Not ASCII word boundary. 357 NotWordBoundaryAscii, 358 } 359 360 /// Representation of the Char instruction. 361 #[derive(Clone, Debug)] 362 pub struct InstChar { 363 /// The next location to execute in the program if this instruction 364 /// succeeds. 365 pub goto: InstPtr, 366 /// The character to test. 367 pub c: char, 368 } 369 370 /// Representation of the Ranges instruction. 371 #[derive(Clone, Debug)] 372 pub struct InstRanges { 373 /// The next location to execute in the program if this instruction 374 /// succeeds. 375 pub goto: InstPtr, 376 /// The set of Unicode scalar value ranges to test. 377 pub ranges: Box<[(char, char)]>, 378 } 379 380 impl InstRanges { 381 /// Tests whether the given input character matches this instruction. matches(&self, c: Char) -> bool382 pub fn matches(&self, c: Char) -> bool { 383 // This speeds up the `match_class_unicode` benchmark by checking 384 // some common cases quickly without binary search. e.g., Matching 385 // a Unicode class on predominantly ASCII text. 386 for r in self.ranges.iter().take(4) { 387 if c < r.0 { 388 return false; 389 } 390 if c <= r.1 { 391 return true; 392 } 393 } 394 self.ranges 395 .binary_search_by(|r| { 396 if r.1 < c { 397 Ordering::Less 398 } else if r.0 > c { 399 Ordering::Greater 400 } else { 401 Ordering::Equal 402 } 403 }) 404 .is_ok() 405 } 406 407 /// Return the number of distinct characters represented by all of the 408 /// ranges. num_chars(&self) -> usize409 pub fn num_chars(&self) -> usize { 410 self.ranges 411 .iter() 412 .map(|&(s, e)| 1 + (e as u32) - (s as u32)) 413 .sum::<u32>() as usize 414 } 415 } 416 417 /// Representation of the Bytes instruction. 418 #[derive(Clone, Debug)] 419 pub struct InstBytes { 420 /// The next location to execute in the program if this instruction 421 /// succeeds. 422 pub goto: InstPtr, 423 /// The start (inclusive) of this byte range. 424 pub start: u8, 425 /// The end (inclusive) of this byte range. 426 pub end: u8, 427 } 428 429 impl InstBytes { 430 /// Returns true if and only if the given byte is in this range. matches(&self, byte: u8) -> bool431 pub fn matches(&self, byte: u8) -> bool { 432 self.start <= byte && byte <= self.end 433 } 434 } 435 436 #[cfg(test)] 437 mod test { 438 #[test] 439 #[cfg(target_pointer_width = "64")] test_size_of_inst()440 fn test_size_of_inst() { 441 use std::mem::size_of; 442 443 use super::Inst; 444 445 assert_eq!(32, size_of::<Inst>()); 446 } 447 } 448