• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use std::collections::HashMap;
2 use std::fmt;
3 use std::iter;
4 use std::result;
5 use std::sync::Arc;
6 
7 use regex_syntax::hir::{self, Hir};
8 use regex_syntax::is_word_byte;
9 use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
10 
11 use crate::prog::{
12     EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
13     InstSave, InstSplit, Program,
14 };
15 
16 use crate::Error;
17 
18 type Result = result::Result<Patch, Error>;
19 type ResultOrEmpty = result::Result<Option<Patch>, Error>;
20 
21 #[derive(Debug)]
22 struct Patch {
23     hole: Hole,
24     entry: InstPtr,
25 }
26 
27 /// A compiler translates a regular expression AST to a sequence of
28 /// instructions. The sequence of instructions represents an NFA.
29 // `Compiler` is only public via the `internal` module, so avoid deriving
30 // `Debug`.
31 #[allow(missing_debug_implementations)]
32 pub struct Compiler {
33     insts: Vec<MaybeInst>,
34     compiled: Program,
35     capture_name_idx: HashMap<String, usize>,
36     num_exprs: usize,
37     size_limit: usize,
38     suffix_cache: SuffixCache,
39     utf8_seqs: Option<Utf8Sequences>,
40     byte_classes: ByteClassSet,
41     // This keeps track of extra bytes allocated while compiling the regex
42     // program. Currently, this corresponds to two things. First is the heap
43     // memory allocated by Unicode character classes ('InstRanges'). Second is
44     // a "fake" amount of memory used by empty sub-expressions, so that enough
45     // empty sub-expressions will ultimately trigger the compiler to bail
46     // because of a size limit restriction. (That empty sub-expressions don't
47     // add to heap memory usage is more-or-less an implementation detail.) In
48     // the second case, if we don't bail, then an excessively large repetition
49     // on an empty sub-expression can result in the compiler using a very large
50     // amount of CPU time.
51     extra_inst_bytes: usize,
52 }
53 
54 impl Compiler {
55     /// Create a new regular expression compiler.
56     ///
57     /// Various options can be set before calling `compile` on an expression.
new() -> Self58     pub fn new() -> Self {
59         Compiler {
60             insts: vec![],
61             compiled: Program::new(),
62             capture_name_idx: HashMap::new(),
63             num_exprs: 0,
64             size_limit: 10 * (1 << 20),
65             suffix_cache: SuffixCache::new(1000),
66             utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
67             byte_classes: ByteClassSet::new(),
68             extra_inst_bytes: 0,
69         }
70     }
71 
72     /// The size of the resulting program is limited by size_limit. If
73     /// the program approximately exceeds the given size (in bytes), then
74     /// compilation will stop and return an error.
size_limit(mut self, size_limit: usize) -> Self75     pub fn size_limit(mut self, size_limit: usize) -> Self {
76         self.size_limit = size_limit;
77         self
78     }
79 
80     /// If bytes is true, then the program is compiled as a byte based
81     /// automaton, which incorporates UTF-8 decoding into the machine. If it's
82     /// false, then the automaton is Unicode scalar value based, e.g., an
83     /// engine utilizing such an automaton is responsible for UTF-8 decoding.
84     ///
85     /// The specific invariant is that when returning a byte based machine,
86     /// the neither the `Char` nor `Ranges` instructions are produced.
87     /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
88     /// instruction is never produced.
89     ///
90     /// Note that `dfa(true)` implies `bytes(true)`.
bytes(mut self, yes: bool) -> Self91     pub fn bytes(mut self, yes: bool) -> Self {
92         self.compiled.is_bytes = yes;
93         self
94     }
95 
96     /// When disabled, the program compiled may match arbitrary bytes.
97     ///
98     /// When enabled (the default), all compiled programs exclusively match
99     /// valid UTF-8 bytes.
only_utf8(mut self, yes: bool) -> Self100     pub fn only_utf8(mut self, yes: bool) -> Self {
101         self.compiled.only_utf8 = yes;
102         self
103     }
104 
105     /// When set, the machine returned is suitable for use in the DFA matching
106     /// engine.
107     ///
108     /// In particular, this ensures that if the regex is not anchored in the
109     /// beginning, then a preceding `.*?` is included in the program. (The NFA
110     /// based engines handle the preceding `.*?` explicitly, which is difficult
111     /// or impossible in the DFA engine.)
dfa(mut self, yes: bool) -> Self112     pub fn dfa(mut self, yes: bool) -> Self {
113         self.compiled.is_dfa = yes;
114         self
115     }
116 
117     /// When set, the machine returned is suitable for matching text in
118     /// reverse. In particular, all concatenations are flipped.
reverse(mut self, yes: bool) -> Self119     pub fn reverse(mut self, yes: bool) -> Self {
120         self.compiled.is_reverse = yes;
121         self
122     }
123 
124     /// Compile a regular expression given its AST.
125     ///
126     /// The compiler is guaranteed to succeed unless the program exceeds the
127     /// specified size limit. If the size limit is exceeded, then compilation
128     /// stops and returns an error.
compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error>129     pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
130         debug_assert!(!exprs.is_empty());
131         self.num_exprs = exprs.len();
132         if exprs.len() == 1 {
133             self.compile_one(&exprs[0])
134         } else {
135             self.compile_many(exprs)
136         }
137     }
138 
compile_one(mut self, expr: &Hir) -> result::Result<Program, Error>139     fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
140         // If we're compiling a forward DFA and we aren't anchored, then
141         // add a `.*?` before the first capture group.
142         // Other matching engines handle this by baking the logic into the
143         // matching engine itself.
144         let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
145         self.compiled.is_anchored_start = expr.is_anchored_start();
146         self.compiled.is_anchored_end = expr.is_anchored_end();
147         if self.compiled.needs_dotstar() {
148             dotstar_patch = self.c_dotstar()?;
149             self.compiled.start = dotstar_patch.entry;
150         }
151         self.compiled.captures = vec![None];
152         let patch =
153             self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
154         if self.compiled.needs_dotstar() {
155             self.fill(dotstar_patch.hole, patch.entry);
156         } else {
157             self.compiled.start = patch.entry;
158         }
159         self.fill_to_next(patch.hole);
160         self.compiled.matches = vec![self.insts.len()];
161         self.push_compiled(Inst::Match(0));
162         self.compile_finish()
163     }
164 
compile_many( mut self, exprs: &[Hir], ) -> result::Result<Program, Error>165     fn compile_many(
166         mut self,
167         exprs: &[Hir],
168     ) -> result::Result<Program, Error> {
169         debug_assert!(exprs.len() > 1);
170 
171         self.compiled.is_anchored_start =
172             exprs.iter().all(|e| e.is_anchored_start());
173         self.compiled.is_anchored_end =
174             exprs.iter().all(|e| e.is_anchored_end());
175         let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
176         if self.compiled.needs_dotstar() {
177             dotstar_patch = self.c_dotstar()?;
178             self.compiled.start = dotstar_patch.entry;
179         } else {
180             self.compiled.start = 0; // first instruction is always split
181         }
182         self.fill_to_next(dotstar_patch.hole);
183 
184         let mut prev_hole = Hole::None;
185         for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
186             self.fill_to_next(prev_hole);
187             let split = self.push_split_hole();
188             let Patch { hole, entry } =
189                 self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
190             self.fill_to_next(hole);
191             self.compiled.matches.push(self.insts.len());
192             self.push_compiled(Inst::Match(i));
193             prev_hole = self.fill_split(split, Some(entry), None);
194         }
195         let i = exprs.len() - 1;
196         let Patch { hole, entry } =
197             self.c_capture(0, &exprs[i])?.unwrap_or_else(|| self.next_inst());
198         self.fill(prev_hole, entry);
199         self.fill_to_next(hole);
200         self.compiled.matches.push(self.insts.len());
201         self.push_compiled(Inst::Match(i));
202         self.compile_finish()
203     }
204 
compile_finish(mut self) -> result::Result<Program, Error>205     fn compile_finish(mut self) -> result::Result<Program, Error> {
206         self.compiled.insts =
207             self.insts.into_iter().map(|inst| inst.unwrap()).collect();
208         self.compiled.byte_classes = self.byte_classes.byte_classes();
209         self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
210         Ok(self.compiled)
211     }
212 
213     /// Compile expr into self.insts, returning a patch on success,
214     /// or an error if we run out of memory.
215     ///
216     /// All of the c_* methods of the compiler share the contract outlined
217     /// here.
218     ///
219     /// The main thing that a c_* method does is mutate `self.insts`
220     /// to add a list of mostly compiled instructions required to execute
221     /// the given expression. `self.insts` contains MaybeInsts rather than
222     /// Insts because there is some backpatching required.
223     ///
224     /// The `Patch` value returned by each c_* method provides metadata
225     /// about the compiled instructions emitted to `self.insts`. The
226     /// `entry` member of the patch refers to the first instruction
227     /// (the entry point), while the `hole` member contains zero or
228     /// more offsets to partial instructions that need to be backpatched.
229     /// The c_* routine can't know where its list of instructions are going to
230     /// jump to after execution, so it is up to the caller to patch
231     /// these jumps to point to the right place. So compiling some
232     /// expression, e, we would end up with a situation that looked like:
233     ///
234     /// ```text
235     /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
236     ///                     ^              ^             ^
237     ///                     |                \         /
238     ///                   entry                \     /
239     ///                                         hole
240     /// ```
241     ///
242     /// To compile two expressions, e1 and e2, concatenated together we
243     /// would do:
244     ///
245     /// ```ignore
246     /// let patch1 = self.c(e1);
247     /// let patch2 = self.c(e2);
248     /// ```
249     ///
250     /// while leaves us with a situation that looks like
251     ///
252     /// ```text
253     /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
254     ///                     ^        ^            ^        ^
255     ///                     |        |            |        |
256     ///                entry1        hole1   entry2        hole2
257     /// ```
258     ///
259     /// Then to merge the two patches together into one we would backpatch
260     /// hole1 with entry2 and return a new patch that enters at entry1
261     /// and has hole2 for a hole. In fact, if you look at the c_concat
262     /// method you will see that it does exactly this, though it handles
263     /// a list of expressions rather than just the two that we use for
264     /// an example.
265     ///
266     /// Ok(None) is returned when an expression is compiled to no
267     /// instruction, and so no patch.entry value makes sense.
c(&mut self, expr: &Hir) -> ResultOrEmpty268     fn c(&mut self, expr: &Hir) -> ResultOrEmpty {
269         use crate::prog;
270         use regex_syntax::hir::HirKind::*;
271 
272         self.check_size()?;
273         match *expr.kind() {
274             Empty => self.c_empty(),
275             Literal(hir::Literal::Unicode(c)) => self.c_char(c),
276             Literal(hir::Literal::Byte(b)) => {
277                 assert!(self.compiled.uses_bytes());
278                 self.c_byte(b)
279             }
280             Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
281             Class(hir::Class::Bytes(ref cls)) => {
282                 if self.compiled.uses_bytes() {
283                     self.c_class_bytes(cls.ranges())
284                 } else {
285                     assert!(cls.is_all_ascii());
286                     let mut char_ranges = vec![];
287                     for r in cls.iter() {
288                         let (s, e) = (r.start() as char, r.end() as char);
289                         char_ranges.push(hir::ClassUnicodeRange::new(s, e));
290                     }
291                     self.c_class(&char_ranges)
292                 }
293             }
294             Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
295                 self.byte_classes.set_range(b'\n', b'\n');
296                 self.c_empty_look(prog::EmptyLook::EndLine)
297             }
298             Anchor(hir::Anchor::StartLine) => {
299                 self.byte_classes.set_range(b'\n', b'\n');
300                 self.c_empty_look(prog::EmptyLook::StartLine)
301             }
302             Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
303                 self.byte_classes.set_range(b'\n', b'\n');
304                 self.c_empty_look(prog::EmptyLook::StartLine)
305             }
306             Anchor(hir::Anchor::EndLine) => {
307                 self.byte_classes.set_range(b'\n', b'\n');
308                 self.c_empty_look(prog::EmptyLook::EndLine)
309             }
310             Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
311                 self.c_empty_look(prog::EmptyLook::EndText)
312             }
313             Anchor(hir::Anchor::StartText) => {
314                 self.c_empty_look(prog::EmptyLook::StartText)
315             }
316             Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
317                 self.c_empty_look(prog::EmptyLook::StartText)
318             }
319             Anchor(hir::Anchor::EndText) => {
320                 self.c_empty_look(prog::EmptyLook::EndText)
321             }
322             WordBoundary(hir::WordBoundary::Unicode) => {
323                 if !cfg!(feature = "unicode-perl") {
324                     return Err(Error::Syntax(
325                         "Unicode word boundaries are unavailable when \
326                          the unicode-perl feature is disabled"
327                             .to_string(),
328                     ));
329                 }
330                 self.compiled.has_unicode_word_boundary = true;
331                 self.byte_classes.set_word_boundary();
332                 // We also make sure that all ASCII bytes are in a different
333                 // class from non-ASCII bytes. Otherwise, it's possible for
334                 // ASCII bytes to get lumped into the same class as non-ASCII
335                 // bytes. This in turn may cause the lazy DFA to falsely start
336                 // when it sees an ASCII byte that maps to a byte class with
337                 // non-ASCII bytes. This ensures that never happens.
338                 self.byte_classes.set_range(0, 0x7F);
339                 self.c_empty_look(prog::EmptyLook::WordBoundary)
340             }
341             WordBoundary(hir::WordBoundary::UnicodeNegate) => {
342                 if !cfg!(feature = "unicode-perl") {
343                     return Err(Error::Syntax(
344                         "Unicode word boundaries are unavailable when \
345                          the unicode-perl feature is disabled"
346                             .to_string(),
347                     ));
348                 }
349                 self.compiled.has_unicode_word_boundary = true;
350                 self.byte_classes.set_word_boundary();
351                 // See comments above for why we set the ASCII range here.
352                 self.byte_classes.set_range(0, 0x7F);
353                 self.c_empty_look(prog::EmptyLook::NotWordBoundary)
354             }
355             WordBoundary(hir::WordBoundary::Ascii) => {
356                 self.byte_classes.set_word_boundary();
357                 self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
358             }
359             WordBoundary(hir::WordBoundary::AsciiNegate) => {
360                 self.byte_classes.set_word_boundary();
361                 self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
362             }
363             Group(ref g) => match g.kind {
364                 hir::GroupKind::NonCapturing => self.c(&g.hir),
365                 hir::GroupKind::CaptureIndex(index) => {
366                     if index as usize >= self.compiled.captures.len() {
367                         self.compiled.captures.push(None);
368                     }
369                     self.c_capture(2 * index as usize, &g.hir)
370                 }
371                 hir::GroupKind::CaptureName { index, ref name } => {
372                     if index as usize >= self.compiled.captures.len() {
373                         let n = name.to_string();
374                         self.compiled.captures.push(Some(n.clone()));
375                         self.capture_name_idx.insert(n, index as usize);
376                     }
377                     self.c_capture(2 * index as usize, &g.hir)
378                 }
379             },
380             Concat(ref es) => {
381                 if self.compiled.is_reverse {
382                     self.c_concat(es.iter().rev())
383                 } else {
384                     self.c_concat(es)
385                 }
386             }
387             Alternation(ref es) => self.c_alternate(&**es),
388             Repetition(ref rep) => self.c_repeat(rep),
389         }
390     }
391 
c_empty(&mut self) -> ResultOrEmpty392     fn c_empty(&mut self) -> ResultOrEmpty {
393         // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
394         // See: CVE-2022-24713
395         //
396         // Since 'empty' sub-expressions don't increase the size of
397         // the actual compiled object, we "fake" an increase in its
398         // size so that our 'check_size_limit' routine will eventually
399         // stop compilation if there are too many empty sub-expressions
400         // (e.g., via a large repetition).
401         self.extra_inst_bytes += std::mem::size_of::<Inst>();
402         Ok(None)
403     }
404 
c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty405     fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
406         if self.num_exprs > 1 || self.compiled.is_dfa {
407             // Don't ever compile Save instructions for regex sets because
408             // they are never used. They are also never used in DFA programs
409             // because DFAs can't handle captures.
410             self.c(expr)
411         } else {
412             let entry = self.insts.len();
413             let hole = self.push_hole(InstHole::Save { slot: first_slot });
414             let patch = self.c(expr)?.unwrap_or_else(|| self.next_inst());
415             self.fill(hole, patch.entry);
416             self.fill_to_next(patch.hole);
417             let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
418             Ok(Some(Patch { hole, entry }))
419         }
420     }
421 
c_dotstar(&mut self) -> Result422     fn c_dotstar(&mut self) -> Result {
423         Ok(if !self.compiled.only_utf8() {
424             self.c(&Hir::repetition(hir::Repetition {
425                 kind: hir::RepetitionKind::ZeroOrMore,
426                 greedy: false,
427                 hir: Box::new(Hir::any(true)),
428             }))?
429             .unwrap()
430         } else {
431             self.c(&Hir::repetition(hir::Repetition {
432                 kind: hir::RepetitionKind::ZeroOrMore,
433                 greedy: false,
434                 hir: Box::new(Hir::any(false)),
435             }))?
436             .unwrap()
437         })
438     }
439 
c_char(&mut self, c: char) -> ResultOrEmpty440     fn c_char(&mut self, c: char) -> ResultOrEmpty {
441         if self.compiled.uses_bytes() {
442             if c.is_ascii() {
443                 let b = c as u8;
444                 let hole =
445                     self.push_hole(InstHole::Bytes { start: b, end: b });
446                 self.byte_classes.set_range(b, b);
447                 Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
448             } else {
449                 self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
450             }
451         } else {
452             let hole = self.push_hole(InstHole::Char { c });
453             Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
454         }
455     }
456 
c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty457     fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
458         use std::mem::size_of;
459 
460         assert!(!ranges.is_empty());
461         if self.compiled.uses_bytes() {
462             Ok(Some(CompileClass { c: self, ranges }.compile()?))
463         } else {
464             let ranges: Vec<(char, char)> =
465                 ranges.iter().map(|r| (r.start(), r.end())).collect();
466             let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
467                 self.push_hole(InstHole::Char { c: ranges[0].0 })
468             } else {
469                 self.extra_inst_bytes +=
470                     ranges.len() * (size_of::<char>() * 2);
471                 self.push_hole(InstHole::Ranges { ranges })
472             };
473             Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
474         }
475     }
476 
c_byte(&mut self, b: u8) -> ResultOrEmpty477     fn c_byte(&mut self, b: u8) -> ResultOrEmpty {
478         self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
479     }
480 
c_class_bytes( &mut self, ranges: &[hir::ClassBytesRange], ) -> ResultOrEmpty481     fn c_class_bytes(
482         &mut self,
483         ranges: &[hir::ClassBytesRange],
484     ) -> ResultOrEmpty {
485         debug_assert!(!ranges.is_empty());
486 
487         let first_split_entry = self.insts.len();
488         let mut holes = vec![];
489         let mut prev_hole = Hole::None;
490         for r in &ranges[0..ranges.len() - 1] {
491             self.fill_to_next(prev_hole);
492             let split = self.push_split_hole();
493             let next = self.insts.len();
494             self.byte_classes.set_range(r.start(), r.end());
495             holes.push(self.push_hole(InstHole::Bytes {
496                 start: r.start(),
497                 end: r.end(),
498             }));
499             prev_hole = self.fill_split(split, Some(next), None);
500         }
501         let next = self.insts.len();
502         let r = &ranges[ranges.len() - 1];
503         self.byte_classes.set_range(r.start(), r.end());
504         holes.push(
505             self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
506         );
507         self.fill(prev_hole, next);
508         Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
509     }
510 
c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty511     fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
512         let hole = self.push_hole(InstHole::EmptyLook { look });
513         Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
514     }
515 
c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty where I: IntoIterator<Item = &'a Hir>,516     fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
517     where
518         I: IntoIterator<Item = &'a Hir>,
519     {
520         let mut exprs = exprs.into_iter();
521         let Patch { mut hole, entry } = loop {
522             match exprs.next() {
523                 None => return self.c_empty(),
524                 Some(e) => {
525                     if let Some(p) = self.c(e)? {
526                         break p;
527                     }
528                 }
529             }
530         };
531         for e in exprs {
532             if let Some(p) = self.c(e)? {
533                 self.fill(hole, p.entry);
534                 hole = p.hole;
535             }
536         }
537         Ok(Some(Patch { hole, entry }))
538     }
539 
c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty540     fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
541         debug_assert!(
542             exprs.len() >= 2,
543             "alternates must have at least 2 exprs"
544         );
545 
546         // Initial entry point is always the first split.
547         let first_split_entry = self.insts.len();
548 
549         // Save up all of the holes from each alternate. They will all get
550         // patched to point to the same location.
551         let mut holes = vec![];
552 
553         // true indicates that the hole is a split where we want to fill
554         // the second branch.
555         let mut prev_hole = (Hole::None, false);
556         for e in &exprs[0..exprs.len() - 1] {
557             if prev_hole.1 {
558                 let next = self.insts.len();
559                 self.fill_split(prev_hole.0, None, Some(next));
560             } else {
561                 self.fill_to_next(prev_hole.0);
562             }
563             let split = self.push_split_hole();
564             if let Some(Patch { hole, entry }) = self.c(e)? {
565                 holes.push(hole);
566                 prev_hole = (self.fill_split(split, Some(entry), None), false);
567             } else {
568                 let (split1, split2) = split.dup_one();
569                 holes.push(split1);
570                 prev_hole = (split2, true);
571             }
572         }
573         if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? {
574             holes.push(hole);
575             if prev_hole.1 {
576                 self.fill_split(prev_hole.0, None, Some(entry));
577             } else {
578                 self.fill(prev_hole.0, entry);
579             }
580         } else {
581             // We ignore prev_hole.1. When it's true, it means we have two
582             // empty branches both pushing prev_hole.0 into holes, so both
583             // branches will go to the same place anyway.
584             holes.push(prev_hole.0);
585         }
586         Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
587     }
588 
c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty589     fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
590         use regex_syntax::hir::RepetitionKind::*;
591         match rep.kind {
592             ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
593             ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
594             OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
595             Range(hir::RepetitionRange::Exactly(min_max)) => {
596                 self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
597             }
598             Range(hir::RepetitionRange::AtLeast(min)) => {
599                 self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
600             }
601             Range(hir::RepetitionRange::Bounded(min, max)) => {
602                 self.c_repeat_range(&rep.hir, rep.greedy, min, max)
603             }
604         }
605     }
606 
c_repeat_zero_or_one( &mut self, expr: &Hir, greedy: bool, ) -> ResultOrEmpty607     fn c_repeat_zero_or_one(
608         &mut self,
609         expr: &Hir,
610         greedy: bool,
611     ) -> ResultOrEmpty {
612         let split_entry = self.insts.len();
613         let split = self.push_split_hole();
614         let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
615             Some(p) => p,
616             None => return self.pop_split_hole(),
617         };
618         let split_hole = if greedy {
619             self.fill_split(split, Some(entry_rep), None)
620         } else {
621             self.fill_split(split, None, Some(entry_rep))
622         };
623         let holes = vec![hole_rep, split_hole];
624         Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry }))
625     }
626 
c_repeat_zero_or_more( &mut self, expr: &Hir, greedy: bool, ) -> ResultOrEmpty627     fn c_repeat_zero_or_more(
628         &mut self,
629         expr: &Hir,
630         greedy: bool,
631     ) -> ResultOrEmpty {
632         let split_entry = self.insts.len();
633         let split = self.push_split_hole();
634         let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
635             Some(p) => p,
636             None => return self.pop_split_hole(),
637         };
638 
639         self.fill(hole_rep, split_entry);
640         let split_hole = if greedy {
641             self.fill_split(split, Some(entry_rep), None)
642         } else {
643             self.fill_split(split, None, Some(entry_rep))
644         };
645         Ok(Some(Patch { hole: split_hole, entry: split_entry }))
646     }
647 
c_repeat_one_or_more( &mut self, expr: &Hir, greedy: bool, ) -> ResultOrEmpty648     fn c_repeat_one_or_more(
649         &mut self,
650         expr: &Hir,
651         greedy: bool,
652     ) -> ResultOrEmpty {
653         let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
654             Some(p) => p,
655             None => return Ok(None),
656         };
657         self.fill_to_next(hole_rep);
658         let split = self.push_split_hole();
659 
660         let split_hole = if greedy {
661             self.fill_split(split, Some(entry_rep), None)
662         } else {
663             self.fill_split(split, None, Some(entry_rep))
664         };
665         Ok(Some(Patch { hole: split_hole, entry: entry_rep }))
666     }
667 
c_repeat_range_min_or_more( &mut self, expr: &Hir, greedy: bool, min: u32, ) -> ResultOrEmpty668     fn c_repeat_range_min_or_more(
669         &mut self,
670         expr: &Hir,
671         greedy: bool,
672         min: u32,
673     ) -> ResultOrEmpty {
674         let min = u32_to_usize(min);
675         // Using next_inst() is ok, because we can't return it (concat would
676         // have to return Some(_) while c_repeat_range_min_or_more returns
677         // None).
678         let patch_concat = self
679             .c_concat(iter::repeat(expr).take(min))?
680             .unwrap_or_else(|| self.next_inst());
681         if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? {
682             self.fill(patch_concat.hole, patch_rep.entry);
683             Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }))
684         } else {
685             Ok(None)
686         }
687     }
688 
c_repeat_range( &mut self, expr: &Hir, greedy: bool, min: u32, max: u32, ) -> ResultOrEmpty689     fn c_repeat_range(
690         &mut self,
691         expr: &Hir,
692         greedy: bool,
693         min: u32,
694         max: u32,
695     ) -> ResultOrEmpty {
696         let (min, max) = (u32_to_usize(min), u32_to_usize(max));
697         debug_assert!(min <= max);
698         let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
699         if min == max {
700             return Ok(patch_concat);
701         }
702         // Same reasoning as in c_repeat_range_min_or_more (we know that min <
703         // max at this point).
704         let patch_concat = patch_concat.unwrap_or_else(|| self.next_inst());
705         let initial_entry = patch_concat.entry;
706         // It is much simpler to compile, e.g., `a{2,5}` as:
707         //
708         //     aaa?a?a?
709         //
710         // But you end up with a sequence of instructions like this:
711         //
712         //     0: 'a'
713         //     1: 'a',
714         //     2: split(3, 4)
715         //     3: 'a'
716         //     4: split(5, 6)
717         //     5: 'a'
718         //     6: split(7, 8)
719         //     7: 'a'
720         //     8: MATCH
721         //
722         // This is *incredibly* inefficient because the splits end
723         // up forming a chain, which has to be resolved everything a
724         // transition is followed.
725         let mut holes = vec![];
726         let mut prev_hole = patch_concat.hole;
727         for _ in min..max {
728             self.fill_to_next(prev_hole);
729             let split = self.push_split_hole();
730             let Patch { hole, entry } = match self.c(expr)? {
731                 Some(p) => p,
732                 None => return self.pop_split_hole(),
733             };
734             prev_hole = hole;
735             if greedy {
736                 holes.push(self.fill_split(split, Some(entry), None));
737             } else {
738                 holes.push(self.fill_split(split, None, Some(entry)));
739             }
740         }
741         holes.push(prev_hole);
742         Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry }))
743     }
744 
745     /// Can be used as a default value for the c_* functions when the call to
746     /// c_function is followed by inserting at least one instruction that is
747     /// always executed after the ones written by the c* function.
next_inst(&self) -> Patch748     fn next_inst(&self) -> Patch {
749         Patch { hole: Hole::None, entry: self.insts.len() }
750     }
751 
fill(&mut self, hole: Hole, goto: InstPtr)752     fn fill(&mut self, hole: Hole, goto: InstPtr) {
753         match hole {
754             Hole::None => {}
755             Hole::One(pc) => {
756                 self.insts[pc].fill(goto);
757             }
758             Hole::Many(holes) => {
759                 for hole in holes {
760                     self.fill(hole, goto);
761                 }
762             }
763         }
764     }
765 
fill_to_next(&mut self, hole: Hole)766     fn fill_to_next(&mut self, hole: Hole) {
767         let next = self.insts.len();
768         self.fill(hole, next);
769     }
770 
fill_split( &mut self, hole: Hole, goto1: Option<InstPtr>, goto2: Option<InstPtr>, ) -> Hole771     fn fill_split(
772         &mut self,
773         hole: Hole,
774         goto1: Option<InstPtr>,
775         goto2: Option<InstPtr>,
776     ) -> Hole {
777         match hole {
778             Hole::None => Hole::None,
779             Hole::One(pc) => match (goto1, goto2) {
780                 (Some(goto1), Some(goto2)) => {
781                     self.insts[pc].fill_split(goto1, goto2);
782                     Hole::None
783                 }
784                 (Some(goto1), None) => {
785                     self.insts[pc].half_fill_split_goto1(goto1);
786                     Hole::One(pc)
787                 }
788                 (None, Some(goto2)) => {
789                     self.insts[pc].half_fill_split_goto2(goto2);
790                     Hole::One(pc)
791                 }
792                 (None, None) => unreachable!(
793                     "at least one of the split \
794                      holes must be filled"
795                 ),
796             },
797             Hole::Many(holes) => {
798                 let mut new_holes = vec![];
799                 for hole in holes {
800                     new_holes.push(self.fill_split(hole, goto1, goto2));
801                 }
802                 if new_holes.is_empty() {
803                     Hole::None
804                 } else if new_holes.len() == 1 {
805                     new_holes.pop().unwrap()
806                 } else {
807                     Hole::Many(new_holes)
808                 }
809             }
810         }
811     }
812 
push_compiled(&mut self, inst: Inst)813     fn push_compiled(&mut self, inst: Inst) {
814         self.insts.push(MaybeInst::Compiled(inst));
815     }
816 
push_hole(&mut self, inst: InstHole) -> Hole817     fn push_hole(&mut self, inst: InstHole) -> Hole {
818         let hole = self.insts.len();
819         self.insts.push(MaybeInst::Uncompiled(inst));
820         Hole::One(hole)
821     }
822 
push_split_hole(&mut self) -> Hole823     fn push_split_hole(&mut self) -> Hole {
824         let hole = self.insts.len();
825         self.insts.push(MaybeInst::Split);
826         Hole::One(hole)
827     }
828 
pop_split_hole(&mut self) -> ResultOrEmpty829     fn pop_split_hole(&mut self) -> ResultOrEmpty {
830         self.insts.pop();
831         Ok(None)
832     }
833 
check_size(&self) -> result::Result<(), Error>834     fn check_size(&self) -> result::Result<(), Error> {
835         use std::mem::size_of;
836 
837         let size =
838             self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>());
839         if size > self.size_limit {
840             Err(Error::CompiledTooBig(self.size_limit))
841         } else {
842             Ok(())
843         }
844     }
845 }
846 
847 #[derive(Debug)]
848 enum Hole {
849     None,
850     One(InstPtr),
851     Many(Vec<Hole>),
852 }
853 
854 impl Hole {
dup_one(self) -> (Self, Self)855     fn dup_one(self) -> (Self, Self) {
856         match self {
857             Hole::One(pc) => (Hole::One(pc), Hole::One(pc)),
858             Hole::None | Hole::Many(_) => {
859                 unreachable!("must be called on single hole")
860             }
861         }
862     }
863 }
864 
865 #[derive(Clone, Debug)]
866 enum MaybeInst {
867     Compiled(Inst),
868     Uncompiled(InstHole),
869     Split,
870     Split1(InstPtr),
871     Split2(InstPtr),
872 }
873 
874 impl MaybeInst {
fill(&mut self, goto: InstPtr)875     fn fill(&mut self, goto: InstPtr) {
876         let maybeinst = match *self {
877             MaybeInst::Split => MaybeInst::Split1(goto),
878             MaybeInst::Uncompiled(ref inst) => {
879                 MaybeInst::Compiled(inst.fill(goto))
880             }
881             MaybeInst::Split1(goto1) => {
882                 MaybeInst::Compiled(Inst::Split(InstSplit {
883                     goto1,
884                     goto2: goto,
885                 }))
886             }
887             MaybeInst::Split2(goto2) => {
888                 MaybeInst::Compiled(Inst::Split(InstSplit {
889                     goto1: goto,
890                     goto2,
891                 }))
892             }
893             _ => unreachable!(
894                 "not all instructions were compiled! \
895                  found uncompiled instruction: {:?}",
896                 self
897             ),
898         };
899         *self = maybeinst;
900     }
901 
fill_split(&mut self, goto1: InstPtr, goto2: InstPtr)902     fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
903         let filled = match *self {
904             MaybeInst::Split => Inst::Split(InstSplit { goto1, goto2 }),
905             _ => unreachable!(
906                 "must be called on Split instruction, \
907                  instead it was called on: {:?}",
908                 self
909             ),
910         };
911         *self = MaybeInst::Compiled(filled);
912     }
913 
half_fill_split_goto1(&mut self, goto1: InstPtr)914     fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
915         let half_filled = match *self {
916             MaybeInst::Split => goto1,
917             _ => unreachable!(
918                 "must be called on Split instruction, \
919                  instead it was called on: {:?}",
920                 self
921             ),
922         };
923         *self = MaybeInst::Split1(half_filled);
924     }
925 
half_fill_split_goto2(&mut self, goto2: InstPtr)926     fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
927         let half_filled = match *self {
928             MaybeInst::Split => goto2,
929             _ => unreachable!(
930                 "must be called on Split instruction, \
931                  instead it was called on: {:?}",
932                 self
933             ),
934         };
935         *self = MaybeInst::Split2(half_filled);
936     }
937 
unwrap(self) -> Inst938     fn unwrap(self) -> Inst {
939         match self {
940             MaybeInst::Compiled(inst) => inst,
941             _ => unreachable!(
942                 "must be called on a compiled instruction, \
943                  instead it was called on: {:?}",
944                 self
945             ),
946         }
947     }
948 }
949 
950 #[derive(Clone, Debug)]
951 enum InstHole {
952     Save { slot: usize },
953     EmptyLook { look: EmptyLook },
954     Char { c: char },
955     Ranges { ranges: Vec<(char, char)> },
956     Bytes { start: u8, end: u8 },
957 }
958 
959 impl InstHole {
fill(&self, goto: InstPtr) -> Inst960     fn fill(&self, goto: InstPtr) -> Inst {
961         match *self {
962             InstHole::Save { slot } => Inst::Save(InstSave { goto, slot }),
963             InstHole::EmptyLook { look } => {
964                 Inst::EmptyLook(InstEmptyLook { goto, look })
965             }
966             InstHole::Char { c } => Inst::Char(InstChar { goto, c }),
967             InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
968                 goto,
969                 ranges: ranges.clone().into_boxed_slice(),
970             }),
971             InstHole::Bytes { start, end } => {
972                 Inst::Bytes(InstBytes { goto, start, end })
973             }
974         }
975     }
976 }
977 
978 struct CompileClass<'a, 'b> {
979     c: &'a mut Compiler,
980     ranges: &'b [hir::ClassUnicodeRange],
981 }
982 
983 impl<'a, 'b> CompileClass<'a, 'b> {
compile(mut self) -> Result984     fn compile(mut self) -> Result {
985         let mut holes = vec![];
986         let mut initial_entry = None;
987         let mut last_split = Hole::None;
988         let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
989         self.c.suffix_cache.clear();
990 
991         for (i, range) in self.ranges.iter().enumerate() {
992             let is_last_range = i + 1 == self.ranges.len();
993             utf8_seqs.reset(range.start(), range.end());
994             let mut it = (&mut utf8_seqs).peekable();
995             loop {
996                 let utf8_seq = match it.next() {
997                     None => break,
998                     Some(utf8_seq) => utf8_seq,
999                 };
1000                 if is_last_range && it.peek().is_none() {
1001                     let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
1002                     holes.push(hole);
1003                     self.c.fill(last_split, entry);
1004                     last_split = Hole::None;
1005                     if initial_entry.is_none() {
1006                         initial_entry = Some(entry);
1007                     }
1008                 } else {
1009                     if initial_entry.is_none() {
1010                         initial_entry = Some(self.c.insts.len());
1011                     }
1012                     self.c.fill_to_next(last_split);
1013                     last_split = self.c.push_split_hole();
1014                     let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
1015                     holes.push(hole);
1016                     last_split =
1017                         self.c.fill_split(last_split, Some(entry), None);
1018                 }
1019             }
1020         }
1021         self.c.utf8_seqs = Some(utf8_seqs);
1022         Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
1023     }
1024 
c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result1025     fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
1026         if self.c.compiled.is_reverse {
1027             self.c_utf8_seq_(seq)
1028         } else {
1029             self.c_utf8_seq_(seq.into_iter().rev())
1030         }
1031     }
1032 
c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result where I: IntoIterator<Item = &'r Utf8Range>,1033     fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
1034     where
1035         I: IntoIterator<Item = &'r Utf8Range>,
1036     {
1037         // The initial instruction for each UTF-8 sequence should be the same.
1038         let mut from_inst = ::std::usize::MAX;
1039         let mut last_hole = Hole::None;
1040         for byte_range in seq {
1041             let key = SuffixCacheKey {
1042                 from_inst,
1043                 start: byte_range.start,
1044                 end: byte_range.end,
1045             };
1046             {
1047                 let pc = self.c.insts.len();
1048                 if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
1049                     from_inst = cached_pc;
1050                     continue;
1051                 }
1052             }
1053             self.c.byte_classes.set_range(byte_range.start, byte_range.end);
1054             if from_inst == ::std::usize::MAX {
1055                 last_hole = self.c.push_hole(InstHole::Bytes {
1056                     start: byte_range.start,
1057                     end: byte_range.end,
1058                 });
1059             } else {
1060                 self.c.push_compiled(Inst::Bytes(InstBytes {
1061                     goto: from_inst,
1062                     start: byte_range.start,
1063                     end: byte_range.end,
1064                 }));
1065             }
1066             from_inst = self.c.insts.len().checked_sub(1).unwrap();
1067             debug_assert!(from_inst < ::std::usize::MAX);
1068         }
1069         debug_assert!(from_inst < ::std::usize::MAX);
1070         Ok(Patch { hole: last_hole, entry: from_inst })
1071     }
1072 }
1073 
1074 /// `SuffixCache` is a simple bounded hash map for caching suffix entries in
1075 /// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
1076 /// The set of byte ranges looks like this:
1077 ///
1078 /// [0-7F]
1079 /// [C2-DF][80-BF]
1080 /// [E0][A0-BF][80-BF]
1081 /// [E1-EC][80-BF][80-BF]
1082 /// [ED][80-9F][80-BF]
1083 /// [EE-EF][80-BF][80-BF]
1084 ///
1085 /// Each line above translates to one alternate in the compiled regex program.
1086 /// However, all but one of the alternates end in the same suffix, which is
1087 /// a waste of an instruction. The suffix cache facilitates reusing them across
1088 /// alternates.
1089 ///
1090 /// Note that a HashMap could be trivially used for this, but we don't need its
1091 /// overhead. Some small bounded space (LRU style) is more than enough.
1092 ///
1093 /// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
1094 /// except it uses hashes as original indices and then compares full keys for
1095 /// validation against `dense` array.
1096 #[derive(Debug)]
1097 struct SuffixCache {
1098     sparse: Box<[usize]>,
1099     dense: Vec<SuffixCacheEntry>,
1100 }
1101 
1102 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1103 struct SuffixCacheEntry {
1104     key: SuffixCacheKey,
1105     pc: InstPtr,
1106 }
1107 
1108 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1109 struct SuffixCacheKey {
1110     from_inst: InstPtr,
1111     start: u8,
1112     end: u8,
1113 }
1114 
1115 impl SuffixCache {
new(size: usize) -> Self1116     fn new(size: usize) -> Self {
1117         SuffixCache {
1118             sparse: vec![0usize; size].into(),
1119             dense: Vec::with_capacity(size),
1120         }
1121     }
1122 
get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr>1123     fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
1124         let hash = self.hash(&key);
1125         let pos = &mut self.sparse[hash];
1126         if let Some(entry) = self.dense.get(*pos) {
1127             if entry.key == key {
1128                 return Some(entry.pc);
1129             }
1130         }
1131         *pos = self.dense.len();
1132         self.dense.push(SuffixCacheEntry { key, pc });
1133         None
1134     }
1135 
clear(&mut self)1136     fn clear(&mut self) {
1137         self.dense.clear();
1138     }
1139 
hash(&self, suffix: &SuffixCacheKey) -> usize1140     fn hash(&self, suffix: &SuffixCacheKey) -> usize {
1141         // Basic FNV-1a hash as described:
1142         // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
1143         const FNV_PRIME: u64 = 1_099_511_628_211;
1144         let mut h = 14_695_981_039_346_656_037;
1145         h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
1146         h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
1147         h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
1148         (h as usize) % self.sparse.len()
1149     }
1150 }
1151 
1152 struct ByteClassSet([bool; 256]);
1153 
1154 impl ByteClassSet {
new() -> Self1155     fn new() -> Self {
1156         ByteClassSet([false; 256])
1157     }
1158 
set_range(&mut self, start: u8, end: u8)1159     fn set_range(&mut self, start: u8, end: u8) {
1160         debug_assert!(start <= end);
1161         if start > 0 {
1162             self.0[start as usize - 1] = true;
1163         }
1164         self.0[end as usize] = true;
1165     }
1166 
set_word_boundary(&mut self)1167     fn set_word_boundary(&mut self) {
1168         // We need to mark all ranges of bytes whose pairs result in
1169         // evaluating \b differently.
1170         let iswb = is_word_byte;
1171         let mut b1: u16 = 0;
1172         let mut b2: u16;
1173         while b1 <= 255 {
1174             b2 = b1 + 1;
1175             while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
1176                 b2 += 1;
1177             }
1178             self.set_range(b1 as u8, (b2 - 1) as u8);
1179             b1 = b2;
1180         }
1181     }
1182 
byte_classes(&self) -> Vec<u8>1183     fn byte_classes(&self) -> Vec<u8> {
1184         // N.B. If you're debugging the DFA, it's useful to simply return
1185         // `(0..256).collect()`, which effectively removes the byte classes
1186         // and makes the transitions easier to read.
1187         // (0usize..256).map(|x| x as u8).collect()
1188         let mut byte_classes = vec![0; 256];
1189         let mut class = 0u8;
1190         let mut i = 0;
1191         loop {
1192             byte_classes[i] = class as u8;
1193             if i >= 255 {
1194                 break;
1195             }
1196             if self.0[i] {
1197                 class = class.checked_add(1).unwrap();
1198             }
1199             i += 1;
1200         }
1201         byte_classes
1202     }
1203 }
1204 
1205 impl fmt::Debug for ByteClassSet {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result1206     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1207         f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish()
1208     }
1209 }
1210 
u32_to_usize(n: u32) -> usize1211 fn u32_to_usize(n: u32) -> usize {
1212     // In case usize is less than 32 bits, we need to guard against overflow.
1213     // On most platforms this compiles to nothing.
1214     // TODO Use `std::convert::TryFrom` once it's stable.
1215     if (n as u64) > (::std::usize::MAX as u64) {
1216         panic!("BUG: {} is too big to be pointer sized", n)
1217     }
1218     n as usize
1219 }
1220 
1221 #[cfg(test)]
1222 mod tests {
1223     use super::ByteClassSet;
1224 
1225     #[test]
byte_classes()1226     fn byte_classes() {
1227         let mut set = ByteClassSet::new();
1228         set.set_range(b'a', b'z');
1229         let classes = set.byte_classes();
1230         assert_eq!(classes[0], 0);
1231         assert_eq!(classes[1], 0);
1232         assert_eq!(classes[2], 0);
1233         assert_eq!(classes[b'a' as usize - 1], 0);
1234         assert_eq!(classes[b'a' as usize], 1);
1235         assert_eq!(classes[b'm' as usize], 1);
1236         assert_eq!(classes[b'z' as usize], 1);
1237         assert_eq!(classes[b'z' as usize + 1], 2);
1238         assert_eq!(classes[254], 2);
1239         assert_eq!(classes[255], 2);
1240 
1241         let mut set = ByteClassSet::new();
1242         set.set_range(0, 2);
1243         set.set_range(4, 6);
1244         let classes = set.byte_classes();
1245         assert_eq!(classes[0], 0);
1246         assert_eq!(classes[1], 0);
1247         assert_eq!(classes[2], 0);
1248         assert_eq!(classes[3], 1);
1249         assert_eq!(classes[4], 2);
1250         assert_eq!(classes[5], 2);
1251         assert_eq!(classes[6], 2);
1252         assert_eq!(classes[7], 3);
1253         assert_eq!(classes[255], 3);
1254     }
1255 
1256     #[test]
full_byte_classes()1257     fn full_byte_classes() {
1258         let mut set = ByteClassSet::new();
1259         for i in 0..256u16 {
1260             set.set_range(i as u8, i as u8);
1261         }
1262         assert_eq!(set.byte_classes().len(), 256);
1263     }
1264 }
1265