1 use std::collections::HashMap;
2 use std::fmt;
3 use std::iter;
4 use std::result;
5 use std::sync::Arc;
6
7 use regex_syntax::hir::{self, Hir};
8 use regex_syntax::is_word_byte;
9 use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
10
11 use crate::prog::{
12 EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
13 InstSave, InstSplit, Program,
14 };
15
16 use crate::Error;
17
/// Result of compiling an expression that always emits instructions.
type Result = result::Result<Patch, Error>;
/// Result of compiling an expression that may emit no instructions at all
/// (e.g., an empty sub-expression), in which case `Ok(None)` is returned.
type ResultOrEmpty = result::Result<Option<Patch>, Error>;
20
/// Metadata about the instructions emitted for one compiled sub-expression.
#[derive(Debug)]
struct Patch {
    // Zero or more unfilled jump targets that the caller must backpatch.
    hole: Hole,
    // Index of the first instruction emitted for the sub-expression.
    entry: InstPtr,
}
26
/// A compiler translates a regular expression AST to a sequence of
/// instructions. The sequence of instructions represents an NFA.
// `Compiler` is only public via the `internal` module, so avoid deriving
// `Debug`.
#[allow(missing_debug_implementations)]
pub struct Compiler {
    // Partially compiled instructions; holes are backpatched as their jump
    // targets become known, then everything is unwrapped into
    // `compiled.insts` by `compile_finish`.
    insts: Vec<MaybeInst>,
    // The program being built.
    compiled: Program,
    // Maps capture group names to their capture index.
    capture_name_idx: HashMap<String, usize>,
    // Number of top-level expressions compiled (>1 for regex sets).
    num_exprs: usize,
    // Approximate limit, in bytes, on the size of the compiled program.
    size_limit: usize,
    // Cache used to share common suffix instructions when compiling
    // Unicode classes into UTF-8 automata.
    suffix_cache: SuffixCache,
    // Reusable scratch space for generating UTF-8 byte sequences.
    utf8_seqs: Option<Utf8Sequences>,
    // Accumulates the byte class partition used by the lazy DFA.
    byte_classes: ByteClassSet,
    // This keeps track of extra bytes allocated while compiling the regex
    // program. Currently, this corresponds to two things. First is the heap
    // memory allocated by Unicode character classes ('InstRanges'). Second is
    // a "fake" amount of memory used by empty sub-expressions, so that enough
    // empty sub-expressions will ultimately trigger the compiler to bail
    // because of a size limit restriction. (That empty sub-expressions don't
    // add to heap memory usage is more-or-less an implementation detail.) In
    // the second case, if we don't bail, then an excessively large repetition
    // on an empty sub-expression can result in the compiler using a very large
    // amount of CPU time.
    extra_inst_bytes: usize,
}
53
54 impl Compiler {
    /// Create a new regular expression compiler.
    ///
    /// Various options can be set before calling `compile` on an expression.
    pub fn new() -> Self {
        Compiler {
            insts: vec![],
            compiled: Program::new(),
            capture_name_idx: HashMap::new(),
            num_exprs: 0,
            // Default size limit: 10 MB of compiled instructions.
            size_limit: 10 * (1 << 20),
            suffix_cache: SuffixCache::new(1000),
            // Placeholder sequence; it is reset before each use.
            utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
            byte_classes: ByteClassSet::new(),
            extra_inst_bytes: 0,
        }
    }
71
    /// The size of the resulting program is limited by size_limit. If
    /// the program approximately exceeds the given size (in bytes), then
    /// compilation will stop and return an error.
    ///
    /// Returns `self` for builder-style chaining.
    pub fn size_limit(mut self, size_limit: usize) -> Self {
        self.size_limit = size_limit;
        self
    }
79
    /// If bytes is true, then the program is compiled as a byte based
    /// automaton, which incorporates UTF-8 decoding into the machine. If it's
    /// false, then the automaton is Unicode scalar value based, e.g., an
    /// engine utilizing such an automaton is responsible for UTF-8 decoding.
    ///
    /// The specific invariant is that when returning a byte based machine,
    /// neither the `Char` nor `Ranges` instructions are produced.
    /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
    /// instruction is never produced.
    ///
    /// Note that `dfa(true)` implies `bytes(true)`.
    pub fn bytes(mut self, yes: bool) -> Self {
        self.compiled.is_bytes = yes;
        self
    }
95
    /// When disabled, the program compiled may match arbitrary bytes.
    ///
    /// When enabled (the default), all compiled programs exclusively match
    /// valid UTF-8 bytes.
    ///
    /// Returns `self` for builder-style chaining.
    pub fn only_utf8(mut self, yes: bool) -> Self {
        self.compiled.only_utf8 = yes;
        self
    }
104
    /// When set, the machine returned is suitable for use in the DFA matching
    /// engine.
    ///
    /// In particular, this ensures that if the regex is not anchored in the
    /// beginning, then a preceding `.*?` is included in the program. (The NFA
    /// based engines handle the preceding `.*?` explicitly, which is difficult
    /// or impossible in the DFA engine.)
    pub fn dfa(mut self, yes: bool) -> Self {
        self.compiled.is_dfa = yes;
        self
    }
116
    /// When set, the machine returned is suitable for matching text in
    /// reverse. In particular, all concatenations are flipped.
    pub fn reverse(mut self, yes: bool) -> Self {
        self.compiled.is_reverse = yes;
        self
    }
123
124 /// Compile a regular expression given its AST.
125 ///
126 /// The compiler is guaranteed to succeed unless the program exceeds the
127 /// specified size limit. If the size limit is exceeded, then compilation
128 /// stops and returns an error.
compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error>129 pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
130 debug_assert!(!exprs.is_empty());
131 self.num_exprs = exprs.len();
132 if exprs.len() == 1 {
133 self.compile_one(&exprs[0])
134 } else {
135 self.compile_many(exprs)
136 }
137 }
138
    /// Compile a single expression into a complete program, adding the
    /// unanchored `.*?` prefix when the DFA engine needs it.
    fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
        // If we're compiling a forward DFA and we aren't anchored, then
        // add a `.*?` before the first capture group.
        // Other matching engines handle this by baking the logic into the
        // matching engine itself.
        let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
        self.compiled.is_anchored_start = expr.is_anchored_start();
        self.compiled.is_anchored_end = expr.is_anchored_end();
        if self.compiled.needs_dotstar() {
            dotstar_patch = self.c_dotstar()?;
            self.compiled.start = dotstar_patch.entry;
        }
        // Slot 0 is the implicit capture group for the whole match.
        self.compiled.captures = vec![None];
        let patch =
            self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
        if self.compiled.needs_dotstar() {
            // Jump from the `.*?` prefix into the expression proper.
            self.fill(dotstar_patch.hole, patch.entry);
        } else {
            self.compiled.start = patch.entry;
        }
        self.fill_to_next(patch.hole);
        self.compiled.matches = vec![self.insts.len()];
        self.push_compiled(Inst::Match(0));
        self.compile_finish()
    }
164
    /// Compile a set of expressions into one program with a distinct
    /// `Match(i)` instruction per expression (used for regex sets).
    fn compile_many(
        mut self,
        exprs: &[Hir],
    ) -> result::Result<Program, Error> {
        debug_assert!(exprs.len() > 1);

        // The set is only anchored if every member is anchored.
        self.compiled.is_anchored_start =
            exprs.iter().all(|e| e.is_anchored_start());
        self.compiled.is_anchored_end =
            exprs.iter().all(|e| e.is_anchored_end());
        let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
        if self.compiled.needs_dotstar() {
            dotstar_patch = self.c_dotstar()?;
            self.compiled.start = dotstar_patch.entry;
        } else {
            self.compiled.start = 0; // first instruction is always split
        }
        self.fill_to_next(dotstar_patch.hole);

        // Compile `e1|e2|...|en` as a chain of splits, where each branch
        // terminates in its own match instruction.
        let mut prev_hole = Hole::None;
        for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
            self.fill_to_next(prev_hole);
            let split = self.push_split_hole();
            let Patch { hole, entry } =
                self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
            self.fill_to_next(hole);
            self.compiled.matches.push(self.insts.len());
            self.push_compiled(Inst::Match(i));
            prev_hole = self.fill_split(split, Some(entry), None);
        }
        // The last alternative needs no split of its own.
        let i = exprs.len() - 1;
        let Patch { hole, entry } =
            self.c_capture(0, &exprs[i])?.unwrap_or_else(|| self.next_inst());
        self.fill(prev_hole, entry);
        self.fill_to_next(hole);
        self.compiled.matches.push(self.insts.len());
        self.push_compiled(Inst::Match(i));
        self.compile_finish()
    }
204
compile_finish(mut self) -> result::Result<Program, Error>205 fn compile_finish(mut self) -> result::Result<Program, Error> {
206 self.compiled.insts =
207 self.insts.into_iter().map(|inst| inst.unwrap()).collect();
208 self.compiled.byte_classes = self.byte_classes.byte_classes();
209 self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
210 Ok(self.compiled)
211 }
212
    /// Compile expr into self.insts, returning a patch on success,
    /// or an error if we run out of memory.
    ///
    /// All of the c_* methods of the compiler share the contract outlined
    /// here.
    ///
    /// The main thing that a c_* method does is mutate `self.insts`
    /// to add a list of mostly compiled instructions required to execute
    /// the given expression. `self.insts` contains MaybeInsts rather than
    /// Insts because there is some backpatching required.
    ///
    /// The `Patch` value returned by each c_* method provides metadata
    /// about the compiled instructions emitted to `self.insts`. The
    /// `entry` member of the patch refers to the first instruction
    /// (the entry point), while the `hole` member contains zero or
    /// more offsets to partial instructions that need to be backpatched.
    /// The c_* routine can't know where its list of instructions are going to
    /// jump to after execution, so it is up to the caller to patch
    /// these jumps to point to the right place. So compiling some
    /// expression, e, we would end up with a situation that looked like:
    ///
    /// ```text
    /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
    ///                     ^              ^             ^
    ///                     |               \           /
    ///                   entry               \        /
    ///                                         hole
    /// ```
    ///
    /// To compile two expressions, e1 and e2, concatenated together we
    /// would do:
    ///
    /// ```ignore
    /// let patch1 = self.c(e1);
    /// let patch2 = self.c(e2);
    /// ```
    ///
    /// which leaves us with a situation that looks like
    ///
    /// ```text
    /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
    ///                     ^        ^            ^        ^
    ///                     |        |            |        |
    ///                entry1        hole1   entry2        hole2
    /// ```
    ///
    /// Then to merge the two patches together into one we would backpatch
    /// hole1 with entry2 and return a new patch that enters at entry1
    /// and has hole2 for a hole. In fact, if you look at the c_concat
    /// method you will see that it does exactly this, though it handles
    /// a list of expressions rather than just the two that we use for
    /// an example.
    ///
    /// Ok(None) is returned when an expression is compiled to no
    /// instruction, and so no patch.entry value makes sense.
    fn c(&mut self, expr: &Hir) -> ResultOrEmpty {
        use crate::prog;
        use regex_syntax::hir::HirKind::*;

        // Bail early if the program has already grown too large.
        self.check_size()?;
        match *expr.kind() {
            Empty => self.c_empty(),
            Literal(hir::Literal::Unicode(c)) => self.c_char(c),
            Literal(hir::Literal::Byte(b)) => {
                assert!(self.compiled.uses_bytes());
                self.c_byte(b)
            }
            Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
            Class(hir::Class::Bytes(ref cls)) => {
                if self.compiled.uses_bytes() {
                    self.c_class_bytes(cls.ranges())
                } else {
                    // A Unicode program can only accept a byte class if it
                    // is entirely ASCII, in which case it can be converted
                    // to an equivalent char class.
                    assert!(cls.is_all_ascii());
                    let mut char_ranges = vec![];
                    for r in cls.iter() {
                        let (s, e) = (r.start() as char, r.end() as char);
                        char_ranges.push(hir::ClassUnicodeRange::new(s, e));
                    }
                    self.c_class(&char_ranges)
                }
            }
            // In reverse mode, start/end assertions swap roles.
            Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
                self.byte_classes.set_range(b'\n', b'\n');
                self.c_empty_look(prog::EmptyLook::EndLine)
            }
            Anchor(hir::Anchor::StartLine) => {
                self.byte_classes.set_range(b'\n', b'\n');
                self.c_empty_look(prog::EmptyLook::StartLine)
            }
            Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
                self.byte_classes.set_range(b'\n', b'\n');
                self.c_empty_look(prog::EmptyLook::StartLine)
            }
            Anchor(hir::Anchor::EndLine) => {
                self.byte_classes.set_range(b'\n', b'\n');
                self.c_empty_look(prog::EmptyLook::EndLine)
            }
            Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
                self.c_empty_look(prog::EmptyLook::EndText)
            }
            Anchor(hir::Anchor::StartText) => {
                self.c_empty_look(prog::EmptyLook::StartText)
            }
            Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
                self.c_empty_look(prog::EmptyLook::StartText)
            }
            Anchor(hir::Anchor::EndText) => {
                self.c_empty_look(prog::EmptyLook::EndText)
            }
            WordBoundary(hir::WordBoundary::Unicode) => {
                if !cfg!(feature = "unicode-perl") {
                    return Err(Error::Syntax(
                        "Unicode word boundaries are unavailable when \
                         the unicode-perl feature is disabled"
                            .to_string(),
                    ));
                }
                self.compiled.has_unicode_word_boundary = true;
                self.byte_classes.set_word_boundary();
                // We also make sure that all ASCII bytes are in a different
                // class from non-ASCII bytes. Otherwise, it's possible for
                // ASCII bytes to get lumped into the same class as non-ASCII
                // bytes. This in turn may cause the lazy DFA to falsely start
                // when it sees an ASCII byte that maps to a byte class with
                // non-ASCII bytes. This ensures that never happens.
                self.byte_classes.set_range(0, 0x7F);
                self.c_empty_look(prog::EmptyLook::WordBoundary)
            }
            WordBoundary(hir::WordBoundary::UnicodeNegate) => {
                if !cfg!(feature = "unicode-perl") {
                    return Err(Error::Syntax(
                        "Unicode word boundaries are unavailable when \
                         the unicode-perl feature is disabled"
                            .to_string(),
                    ));
                }
                self.compiled.has_unicode_word_boundary = true;
                self.byte_classes.set_word_boundary();
                // See comments above for why we set the ASCII range here.
                self.byte_classes.set_range(0, 0x7F);
                self.c_empty_look(prog::EmptyLook::NotWordBoundary)
            }
            WordBoundary(hir::WordBoundary::Ascii) => {
                self.byte_classes.set_word_boundary();
                self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
            }
            WordBoundary(hir::WordBoundary::AsciiNegate) => {
                self.byte_classes.set_word_boundary();
                self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
            }
            Group(ref g) => match g.kind {
                hir::GroupKind::NonCapturing => self.c(&g.hir),
                hir::GroupKind::CaptureIndex(index) => {
                    if index as usize >= self.compiled.captures.len() {
                        self.compiled.captures.push(None);
                    }
                    // Each capture group uses two slots: start and end.
                    self.c_capture(2 * index as usize, &g.hir)
                }
                hir::GroupKind::CaptureName { index, ref name } => {
                    if index as usize >= self.compiled.captures.len() {
                        let n = name.to_string();
                        self.compiled.captures.push(Some(n.clone()));
                        self.capture_name_idx.insert(n, index as usize);
                    }
                    self.c_capture(2 * index as usize, &g.hir)
                }
            },
            Concat(ref es) => {
                if self.compiled.is_reverse {
                    // Reverse programs compile concatenations back-to-front.
                    self.c_concat(es.iter().rev())
                } else {
                    self.c_concat(es)
                }
            }
            Alternation(ref es) => self.c_alternate(&**es),
            Repetition(ref rep) => self.c_repeat(rep),
        }
    }
391
    /// Compile an empty sub-expression: emits no instructions, but still
    /// charges the size limit to defend against pathological inputs.
    fn c_empty(&mut self) -> ResultOrEmpty {
        // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
        // See: CVE-2022-24713
        //
        // Since 'empty' sub-expressions don't increase the size of
        // the actual compiled object, we "fake" an increase in its
        // size so that our 'check_size_limit' routine will eventually
        // stop compilation if there are too many empty sub-expressions
        // (e.g., via a large repetition).
        self.extra_inst_bytes += std::mem::size_of::<Inst>();
        Ok(None)
    }
404
    /// Compile `expr` bracketed by a pair of `Save` instructions using
    /// slots `first_slot` and `first_slot + 1`, except for regex sets and
    /// DFA programs where captures are never used.
    fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
        if self.num_exprs > 1 || self.compiled.is_dfa {
            // Don't ever compile Save instructions for regex sets because
            // they are never used. They are also never used in DFA programs
            // because DFAs can't handle captures.
            self.c(expr)
        } else {
            let entry = self.insts.len();
            let hole = self.push_hole(InstHole::Save { slot: first_slot });
            // An empty expression still gets both Save instructions.
            let patch = self.c(expr)?.unwrap_or_else(|| self.next_inst());
            self.fill(hole, patch.entry);
            self.fill_to_next(patch.hole);
            let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
            Ok(Some(Patch { hole, entry }))
        }
    }
421
c_dotstar(&mut self) -> Result422 fn c_dotstar(&mut self) -> Result {
423 Ok(if !self.compiled.only_utf8() {
424 self.c(&Hir::repetition(hir::Repetition {
425 kind: hir::RepetitionKind::ZeroOrMore,
426 greedy: false,
427 hir: Box::new(Hir::any(true)),
428 }))?
429 .unwrap()
430 } else {
431 self.c(&Hir::repetition(hir::Repetition {
432 kind: hir::RepetitionKind::ZeroOrMore,
433 greedy: false,
434 hir: Box::new(Hir::any(false)),
435 }))?
436 .unwrap()
437 })
438 }
439
    /// Compile a single Unicode scalar value.
    fn c_char(&mut self, c: char) -> ResultOrEmpty {
        if self.compiled.uses_bytes() {
            if c.is_ascii() {
                // ASCII is a single byte in UTF-8, so emit it directly.
                let b = c as u8;
                let hole =
                    self.push_hole(InstHole::Bytes { start: b, end: b });
                self.byte_classes.set_range(b, b);
                Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
            } else {
                // Non-ASCII chars are multi-byte in UTF-8; reuse the class
                // compiler to expand them into byte sequences.
                self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
            }
        } else {
            let hole = self.push_hole(InstHole::Char { c });
            Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
        }
    }
456
    /// Compile a class of Unicode scalar value ranges.
    fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
        use std::mem::size_of;

        assert!(!ranges.is_empty());
        if self.compiled.uses_bytes() {
            // Byte programs expand each range into UTF-8 byte sequences.
            Ok(Some(CompileClass { c: self, ranges }.compile()?))
        } else {
            let ranges: Vec<(char, char)> =
                ranges.iter().map(|r| (r.start(), r.end())).collect();
            let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
                // A singleton range degenerates to a plain char match.
                self.push_hole(InstHole::Char { c: ranges[0].0 })
            } else {
                // Charge the heap memory used by the ranges against the
                // size limit.
                self.extra_inst_bytes +=
                    ranges.len() * (size_of::<char>() * 2);
                self.push_hole(InstHole::Ranges { ranges })
            };
            Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
        }
    }
476
    /// Compile a single byte as a one-byte class.
    fn c_byte(&mut self, b: u8) -> ResultOrEmpty {
        self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
    }
480
    /// Compile a class of byte ranges as a chain of splits, one branch per
    /// range.
    fn c_class_bytes(
        &mut self,
        ranges: &[hir::ClassBytesRange],
    ) -> ResultOrEmpty {
        debug_assert!(!ranges.is_empty());

        let first_split_entry = self.insts.len();
        let mut holes = vec![];
        // The unfilled second branch of the previous split, if any.
        let mut prev_hole = Hole::None;
        for r in &ranges[0..ranges.len() - 1] {
            self.fill_to_next(prev_hole);
            let split = self.push_split_hole();
            let next = self.insts.len();
            self.byte_classes.set_range(r.start(), r.end());
            holes.push(self.push_hole(InstHole::Bytes {
                start: r.start(),
                end: r.end(),
            }));
            prev_hole = self.fill_split(split, Some(next), None);
        }
        // The final range needs no split of its own; the last split's
        // second branch jumps straight to it.
        let next = self.insts.len();
        let r = &ranges[ranges.len() - 1];
        self.byte_classes.set_range(r.start(), r.end());
        holes.push(
            self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
        );
        self.fill(prev_hole, next);
        Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
    }
510
    /// Compile a zero-width assertion.
    fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
        let hole = self.push_hole(InstHole::EmptyLook { look });
        Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
    }
515
    /// Compile a concatenation of expressions, chaining each sub-patch's
    /// hole to the next sub-patch's entry point.
    fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
    where
        I: IntoIterator<Item = &'a Hir>,
    {
        let mut exprs = exprs.into_iter();
        // Skip leading sub-expressions that compile to nothing until we
        // find the first one that actually emits instructions.
        let Patch { mut hole, entry } = loop {
            match exprs.next() {
                // Everything compiled to nothing: the whole concatenation
                // is effectively empty.
                None => return self.c_empty(),
                Some(e) => {
                    if let Some(p) = self.c(e)? {
                        break p;
                    }
                }
            }
        };
        for e in exprs {
            if let Some(p) = self.c(e)? {
                self.fill(hole, p.entry);
                hole = p.hole;
            }
        }
        Ok(Some(Patch { hole, entry }))
    }
539
    /// Compile an alternation `e1|e2|...|en` as a chain of splits, where
    /// every branch's exit hole is patched to the same continuation.
    fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
        debug_assert!(
            exprs.len() >= 2,
            "alternates must have at least 2 exprs"
        );

        // Initial entry point is always the first split.
        let first_split_entry = self.insts.len();

        // Save up all of the holes from each alternate. They will all get
        // patched to point to the same location.
        let mut holes = vec![];

        // true indicates that the hole is a split where we want to fill
        // the second branch.
        let mut prev_hole = (Hole::None, false);
        for e in &exprs[0..exprs.len() - 1] {
            if prev_hole.1 {
                let next = self.insts.len();
                self.fill_split(prev_hole.0, None, Some(next));
            } else {
                self.fill_to_next(prev_hole.0);
            }
            let split = self.push_split_hole();
            if let Some(Patch { hole, entry }) = self.c(e)? {
                holes.push(hole);
                prev_hole = (self.fill_split(split, Some(entry), None), false);
            } else {
                // An empty branch: the split's first arm is itself an exit,
                // so duplicate the hole — one copy exits, one continues.
                let (split1, split2) = split.dup_one();
                holes.push(split1);
                prev_hole = (split2, true);
            }
        }
        if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? {
            holes.push(hole);
            if prev_hole.1 {
                self.fill_split(prev_hole.0, None, Some(entry));
            } else {
                self.fill(prev_hole.0, entry);
            }
        } else {
            // We ignore prev_hole.1. When it's true, it means we have two
            // empty branches both pushing prev_hole.0 into holes, so both
            // branches will go to the same place anyway.
            holes.push(prev_hole.0);
        }
        Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
    }
588
    /// Compile a repetition by dispatching on its kind.
    fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
        use regex_syntax::hir::RepetitionKind::*;
        match rep.kind {
            ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
            ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
            OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
            // `e{n}` is just `e{n,n}`.
            Range(hir::RepetitionRange::Exactly(min_max)) => {
                self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
            }
            Range(hir::RepetitionRange::AtLeast(min)) => {
                self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
            }
            Range(hir::RepetitionRange::Bounded(min, max)) => {
                self.c_repeat_range(&rep.hir, rep.greedy, min, max)
            }
        }
    }
606
    /// Compile `e?` as a split between `e` and the continuation.
    fn c_repeat_zero_or_one(
        &mut self,
        expr: &Hir,
        greedy: bool,
    ) -> ResultOrEmpty {
        let split_entry = self.insts.len();
        let split = self.push_split_hole();
        let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
            Some(p) => p,
            // `e` compiled to nothing, so remove the now-useless split.
            None => return self.pop_split_hole(),
        };
        // Greediness decides which branch of the split is preferred.
        let split_hole = if greedy {
            self.fill_split(split, Some(entry_rep), None)
        } else {
            self.fill_split(split, None, Some(entry_rep))
        };
        let holes = vec![hole_rep, split_hole];
        Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry }))
    }
626
    /// Compile `e*` as a split followed by `e`, with `e` looping back to
    /// the split.
    fn c_repeat_zero_or_more(
        &mut self,
        expr: &Hir,
        greedy: bool,
    ) -> ResultOrEmpty {
        let split_entry = self.insts.len();
        let split = self.push_split_hole();
        let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
            Some(p) => p,
            // `e` compiled to nothing, so remove the now-useless split.
            None => return self.pop_split_hole(),
        };

        // Loop back to the split after each iteration of `e`.
        self.fill(hole_rep, split_entry);
        let split_hole = if greedy {
            self.fill_split(split, Some(entry_rep), None)
        } else {
            self.fill_split(split, None, Some(entry_rep))
        };
        Ok(Some(Patch { hole: split_hole, entry: split_entry }))
    }
647
    /// Compile `e+` as `e` followed by a split that loops back to `e`.
    fn c_repeat_one_or_more(
        &mut self,
        expr: &Hir,
        greedy: bool,
    ) -> ResultOrEmpty {
        let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
            Some(p) => p,
            // `e` compiled to nothing, so `e+` is empty too.
            None => return Ok(None),
        };
        self.fill_to_next(hole_rep);
        let split = self.push_split_hole();

        let split_hole = if greedy {
            self.fill_split(split, Some(entry_rep), None)
        } else {
            self.fill_split(split, None, Some(entry_rep))
        };
        Ok(Some(Patch { hole: split_hole, entry: entry_rep }))
    }
667
    /// Compile `e{min,}` as `min` copies of `e` followed by `e*`.
    fn c_repeat_range_min_or_more(
        &mut self,
        expr: &Hir,
        greedy: bool,
        min: u32,
    ) -> ResultOrEmpty {
        let min = u32_to_usize(min);
        // Using next_inst() is ok, because we can't return it (concat would
        // have to return Some(_) while c_repeat_range_min_or_more returns
        // None).
        let patch_concat = self
            .c_concat(iter::repeat(expr).take(min))?
            .unwrap_or_else(|| self.next_inst());
        if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? {
            self.fill(patch_concat.hole, patch_rep.entry);
            Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }))
        } else {
            Ok(None)
        }
    }
688
    /// Compile `e{min,max}` as `min` copies of `e` followed by `max - min`
    /// optional copies, each guarded by its own split.
    fn c_repeat_range(
        &mut self,
        expr: &Hir,
        greedy: bool,
        min: u32,
        max: u32,
    ) -> ResultOrEmpty {
        let (min, max) = (u32_to_usize(min), u32_to_usize(max));
        debug_assert!(min <= max);
        let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
        if min == max {
            // `e{n,n}` is just the concatenation; no splits needed.
            return Ok(patch_concat);
        }
        // Same reasoning as in c_repeat_range_min_or_more (we know that min <
        // max at this point).
        let patch_concat = patch_concat.unwrap_or_else(|| self.next_inst());
        let initial_entry = patch_concat.entry;
        // It is much simpler to compile, e.g., `a{2,5}` as:
        //
        //     aaa?a?a?
        //
        // But you end up with a sequence of instructions like this:
        //
        //     0: 'a'
        //     1: 'a',
        //     2: split(3, 4)
        //     3: 'a'
        //     4: split(5, 6)
        //     5: 'a'
        //     6: split(7, 8)
        //     7: 'a'
        //     8: MATCH
        //
        // This is *incredibly* inefficient because the splits end
        // up forming a chain, which has to be resolved every time a
        // transition is followed.
        let mut holes = vec![];
        let mut prev_hole = patch_concat.hole;
        for _ in min..max {
            self.fill_to_next(prev_hole);
            let split = self.push_split_hole();
            let Patch { hole, entry } = match self.c(expr)? {
                Some(p) => p,
                // `e` compiled to nothing; drop the useless split.
                None => return self.pop_split_hole(),
            };
            prev_hole = hole;
            if greedy {
                holes.push(self.fill_split(split, Some(entry), None));
            } else {
                holes.push(self.fill_split(split, None, Some(entry)));
            }
        }
        holes.push(prev_hole);
        Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry }))
    }
744
    /// Can be used as a default value for the c_* functions when the call to
    /// c_function is followed by inserting at least one instruction that is
    /// always executed after the ones written by the c* function.
    fn next_inst(&self) -> Patch {
        Patch { hole: Hole::None, entry: self.insts.len() }
    }
751
fill(&mut self, hole: Hole, goto: InstPtr)752 fn fill(&mut self, hole: Hole, goto: InstPtr) {
753 match hole {
754 Hole::None => {}
755 Hole::One(pc) => {
756 self.insts[pc].fill(goto);
757 }
758 Hole::Many(holes) => {
759 for hole in holes {
760 self.fill(hole, goto);
761 }
762 }
763 }
764 }
765
    /// Backpatch `hole` to point at the next instruction to be emitted.
    fn fill_to_next(&mut self, hole: Hole) {
        let next = self.insts.len();
        self.fill(hole, next);
    }
770
    /// Fill one or both branches of the split instruction(s) referenced by
    /// `hole`. Returns whatever hole remains when only one branch was
    /// supplied (or `Hole::None` when the split is now complete).
    fn fill_split(
        &mut self,
        hole: Hole,
        goto1: Option<InstPtr>,
        goto2: Option<InstPtr>,
    ) -> Hole {
        match hole {
            Hole::None => Hole::None,
            Hole::One(pc) => match (goto1, goto2) {
                (Some(goto1), Some(goto2)) => {
                    self.insts[pc].fill_split(goto1, goto2);
                    Hole::None
                }
                (Some(goto1), None) => {
                    self.insts[pc].half_fill_split_goto1(goto1);
                    Hole::One(pc)
                }
                (None, Some(goto2)) => {
                    self.insts[pc].half_fill_split_goto2(goto2);
                    Hole::One(pc)
                }
                (None, None) => unreachable!(
                    "at least one of the split \
                     holes must be filled"
                ),
            },
            Hole::Many(holes) => {
                // Recurse into each nested hole, then collapse the result
                // into the simplest representation.
                let mut new_holes = vec![];
                for hole in holes {
                    new_holes.push(self.fill_split(hole, goto1, goto2));
                }
                if new_holes.is_empty() {
                    Hole::None
                } else if new_holes.len() == 1 {
                    new_holes.pop().unwrap()
                } else {
                    Hole::Many(new_holes)
                }
            }
        }
    }
812
    /// Append a fully compiled instruction (no holes to backpatch).
    fn push_compiled(&mut self, inst: Inst) {
        self.insts.push(MaybeInst::Compiled(inst));
    }
816
    /// Append a partially compiled instruction and return a hole for its
    /// as-yet-unknown goto target.
    fn push_hole(&mut self, inst: InstHole) -> Hole {
        let hole = self.insts.len();
        self.insts.push(MaybeInst::Uncompiled(inst));
        Hole::One(hole)
    }
822
    /// Append a split instruction with both branches unfilled and return a
    /// hole referring to it.
    fn push_split_hole(&mut self) -> Hole {
        let hole = self.insts.len();
        self.insts.push(MaybeInst::Split);
        Hole::One(hole)
    }
828
    /// Discard the most recently pushed split (used when the expression it
    /// guarded turned out to compile to nothing).
    fn pop_split_hole(&mut self) -> ResultOrEmpty {
        self.insts.pop();
        Ok(None)
    }
833
check_size(&self) -> result::Result<(), Error>834 fn check_size(&self) -> result::Result<(), Error> {
835 use std::mem::size_of;
836
837 let size =
838 self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>());
839 if size > self.size_limit {
840 Err(Error::CompiledTooBig(self.size_limit))
841 } else {
842 Ok(())
843 }
844 }
845 }
846
/// A reference to zero or more unfilled jump targets in the instruction
/// list, to be backpatched once the destination is known.
#[derive(Debug)]
enum Hole {
    // Nothing to patch.
    None,
    // A single instruction whose goto needs filling.
    One(InstPtr),
    // A collection of holes that all get filled with the same target.
    Many(Vec<Hole>),
}
853
854 impl Hole {
dup_one(self) -> (Self, Self)855 fn dup_one(self) -> (Self, Self) {
856 match self {
857 Hole::One(pc) => (Hole::One(pc), Hole::One(pc)),
858 Hole::None | Hole::Many(_) => {
859 unreachable!("must be called on single hole")
860 }
861 }
862 }
863 }
864
/// An instruction that may still have unfilled jump targets.
#[derive(Clone, Debug)]
enum MaybeInst {
    // A fully resolved instruction.
    Compiled(Inst),
    // An instruction whose single goto is not yet known.
    Uncompiled(InstHole),
    // A split with neither branch filled.
    Split,
    // A split with only its first branch filled.
    Split1(InstPtr),
    // A split with only its second branch filled.
    Split2(InstPtr),
}
873
impl MaybeInst {
    /// Fill this instruction's next unfilled goto with `goto`.
    ///
    /// A `Split` becomes a `Split1` (second branch still open); a half
    /// filled split becomes fully compiled. Panics if called on an already
    /// compiled instruction.
    fn fill(&mut self, goto: InstPtr) {
        let maybeinst = match *self {
            MaybeInst::Split => MaybeInst::Split1(goto),
            MaybeInst::Uncompiled(ref inst) => {
                MaybeInst::Compiled(inst.fill(goto))
            }
            MaybeInst::Split1(goto1) => {
                MaybeInst::Compiled(Inst::Split(InstSplit {
                    goto1,
                    goto2: goto,
                }))
            }
            MaybeInst::Split2(goto2) => {
                MaybeInst::Compiled(Inst::Split(InstSplit {
                    goto1: goto,
                    goto2,
                }))
            }
            _ => unreachable!(
                "not all instructions were compiled! \
                 found uncompiled instruction: {:?}",
                self
            ),
        };
        *self = maybeinst;
    }

    /// Fill both branches of a `Split` at once. Panics on any other state.
    fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
        let filled = match *self {
            MaybeInst::Split => Inst::Split(InstSplit { goto1, goto2 }),
            _ => unreachable!(
                "must be called on Split instruction, \
                 instead it was called on: {:?}",
                self
            ),
        };
        *self = MaybeInst::Compiled(filled);
    }

    /// Fill only the first branch of a `Split`. Panics on any other state.
    fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
        let half_filled = match *self {
            MaybeInst::Split => goto1,
            _ => unreachable!(
                "must be called on Split instruction, \
                 instead it was called on: {:?}",
                self
            ),
        };
        *self = MaybeInst::Split1(half_filled);
    }

    /// Fill only the second branch of a `Split`. Panics on any other state.
    fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
        let half_filled = match *self {
            MaybeInst::Split => goto2,
            _ => unreachable!(
                "must be called on Split instruction, \
                 instead it was called on: {:?}",
                self
            ),
        };
        *self = MaybeInst::Split2(half_filled);
    }

    /// Extract the finished instruction. Panics if any hole remains.
    fn unwrap(self) -> Inst {
        match self {
            MaybeInst::Compiled(inst) => inst,
            _ => unreachable!(
                "must be called on a compiled instruction, \
                 instead it was called on: {:?}",
                self
            ),
        }
    }
}
949
/// The data of an instruction whose `goto` target has not yet been
/// determined. Once the target is known, `InstHole::fill` converts it into
/// the corresponding concrete `Inst`.
#[derive(Clone, Debug)]
enum InstHole {
    // A capture-slot save instruction.
    Save { slot: usize },
    // A zero-width assertion (e.g. `^`, `$`, `\b`).
    EmptyLook { look: EmptyLook },
    // A single character match.
    Char { c: char },
    // A set of non-overlapping character ranges.
    Ranges { ranges: Vec<(char, char)> },
    // A single inclusive byte range match.
    Bytes { start: u8, end: u8 },
}
958
959 impl InstHole {
fill(&self, goto: InstPtr) -> Inst960 fn fill(&self, goto: InstPtr) -> Inst {
961 match *self {
962 InstHole::Save { slot } => Inst::Save(InstSave { goto, slot }),
963 InstHole::EmptyLook { look } => {
964 Inst::EmptyLook(InstEmptyLook { goto, look })
965 }
966 InstHole::Char { c } => Inst::Char(InstChar { goto, c }),
967 InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
968 goto,
969 ranges: ranges.clone().into_boxed_slice(),
970 }),
971 InstHole::Bytes { start, end } => {
972 Inst::Bytes(InstBytes { goto, start, end })
973 }
974 }
975 }
976 }
977
/// A helper that compiles a Unicode character class into a UTF-8 byte
/// automaton, borrowing the compiler it emits instructions into.
struct CompileClass<'a, 'b> {
    // The compiler receiving the emitted instructions.
    c: &'a mut Compiler,
    // The (sorted, non-overlapping) scalar-value ranges of the class.
    ranges: &'b [hir::ClassUnicodeRange],
}
982
impl<'a, 'b> CompileClass<'a, 'b> {
    /// Compile the entire class into byte-level instructions, returning a
    /// patch whose entry points at the first alternate and whose holes are
    /// the unfilled exits of every alternate.
    ///
    /// Each Unicode range expands into one or more UTF-8 byte sequences;
    /// each sequence becomes one alternate joined by split instructions.
    /// The suffix cache is cleared up front so that common suffixes are
    /// shared only within this class.
    fn compile(mut self) -> Result {
        let mut holes = vec![];
        let mut initial_entry = None;
        let mut last_split = Hole::None;
        // Temporarily take the compiler's scratch sequence iterator to
        // avoid borrowing `self.c` twice; it is restored before returning.
        let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
        self.c.suffix_cache.clear();

        for (i, range) in self.ranges.iter().enumerate() {
            let is_last_range = i + 1 == self.ranges.len();
            utf8_seqs.reset(range.start(), range.end());
            let mut it = (&mut utf8_seqs).peekable();
            loop {
                let utf8_seq = match it.next() {
                    None => break,
                    Some(utf8_seq) => utf8_seq,
                };
                if is_last_range && it.peek().is_none() {
                    // The final alternate needs no trailing split: wire the
                    // previous split (if any) directly to its entry.
                    let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
                    holes.push(hole);
                    self.c.fill(last_split, entry);
                    last_split = Hole::None;
                    if initial_entry.is_none() {
                        initial_entry = Some(entry);
                    }
                } else {
                    // A non-final alternate: emit a split whose first branch
                    // runs this sequence and whose second branch is left as
                    // a hole for the next alternate.
                    if initial_entry.is_none() {
                        initial_entry = Some(self.c.insts.len());
                    }
                    self.c.fill_to_next(last_split);
                    last_split = self.c.push_split_hole();
                    let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
                    holes.push(hole);
                    last_split =
                        self.c.fill_split(last_split, Some(entry), None);
                }
            }
        }
        // Give the scratch iterator back for reuse by the next class.
        self.c.utf8_seqs = Some(utf8_seqs);
        Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
    }

    /// Compile one UTF-8 byte sequence, reversing the byte order for
    /// forward programs (instructions are emitted last-byte first so that
    /// suffixes can be shared via the cache).
    fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
        if self.c.compiled.is_reverse {
            self.c_utf8_seq_(seq)
        } else {
            self.c_utf8_seq_(seq.into_iter().rev())
        }
    }

    /// Emit instructions for a sequence of byte ranges, reusing cached
    /// suffix instructions when an identical (target, range) pair was
    /// already emitted for this class.
    fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
    where
        I: IntoIterator<Item = &'r Utf8Range>,
    {
        // The initial instruction for each UTF-8 sequence should be the same.
        // `usize::MAX` is a sentinel meaning "no instruction emitted yet".
        let mut from_inst = ::std::usize::MAX;
        let mut last_hole = Hole::None;
        for byte_range in seq {
            let key = SuffixCacheKey {
                from_inst,
                start: byte_range.start,
                end: byte_range.end,
            };
            {
                let pc = self.c.insts.len();
                if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
                    // An identical suffix already exists; jump into it.
                    from_inst = cached_pc;
                    continue;
                }
            }
            self.c.byte_classes.set_range(byte_range.start, byte_range.end);
            if from_inst == ::std::usize::MAX {
                // First (i.e., last-matched) byte of the sequence: its goto
                // is unknown, so emit a hole.
                last_hole = self.c.push_hole(InstHole::Bytes {
                    start: byte_range.start,
                    end: byte_range.end,
                });
            } else {
                // Subsequent bytes point at the previously emitted
                // instruction.
                self.c.push_compiled(Inst::Bytes(InstBytes {
                    goto: from_inst,
                    start: byte_range.start,
                    end: byte_range.end,
                }));
            }
            from_inst = self.c.insts.len().checked_sub(1).unwrap();
            debug_assert!(from_inst < ::std::usize::MAX);
        }
        debug_assert!(from_inst < ::std::usize::MAX);
        Ok(Patch { hole: last_hole, entry: from_inst })
    }
}
1073
/// `SuffixCache` is a simple bounded hash map for caching suffix entries in
/// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
/// The set of byte ranges looks like this:
///
/// [0-7F]
/// [C2-DF][80-BF]
/// [E0][A0-BF][80-BF]
/// [E1-EC][80-BF][80-BF]
/// [ED][80-9F][80-BF]
/// [EE-EF][80-BF][80-BF]
///
/// Each line above translates to one alternate in the compiled regex program.
/// However, all but one of the alternates end in the same suffix, which is
/// a waste of an instruction. The suffix cache facilitates reusing them across
/// alternates.
///
/// Note that a HashMap could be trivially used for this, but we don't need its
/// overhead. Some small bounded space (LRU style) is more than enough.
///
/// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
/// except it uses hashes as original indices and then compares full keys for
/// validation against `dense` array.
#[derive(Debug)]
struct SuffixCache {
    // Maps a key's hash to an index into `dense`. Entries may be stale;
    // `get` validates them by comparing the full key.
    sparse: Box<[usize]>,
    // The actual cache entries, in insertion order. Cleared per class.
    dense: Vec<SuffixCacheEntry>,
}
1101
/// A single cache entry: the full key plus the program counter of the
/// instruction that key maps to.
#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
struct SuffixCacheEntry {
    key: SuffixCacheKey,
    pc: InstPtr,
}
1107
/// Identifies a suffix instruction: the byte range it matches plus the
/// instruction it jumps to. Two identical keys denote interchangeable
/// instructions.
#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
struct SuffixCacheKey {
    from_inst: InstPtr,
    start: u8,
    end: u8,
}
1114
1115 impl SuffixCache {
new(size: usize) -> Self1116 fn new(size: usize) -> Self {
1117 SuffixCache {
1118 sparse: vec![0usize; size].into(),
1119 dense: Vec::with_capacity(size),
1120 }
1121 }
1122
get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr>1123 fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
1124 let hash = self.hash(&key);
1125 let pos = &mut self.sparse[hash];
1126 if let Some(entry) = self.dense.get(*pos) {
1127 if entry.key == key {
1128 return Some(entry.pc);
1129 }
1130 }
1131 *pos = self.dense.len();
1132 self.dense.push(SuffixCacheEntry { key, pc });
1133 None
1134 }
1135
clear(&mut self)1136 fn clear(&mut self) {
1137 self.dense.clear();
1138 }
1139
hash(&self, suffix: &SuffixCacheKey) -> usize1140 fn hash(&self, suffix: &SuffixCacheKey) -> usize {
1141 // Basic FNV-1a hash as described:
1142 // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
1143 const FNV_PRIME: u64 = 1_099_511_628_211;
1144 let mut h = 14_695_981_039_346_656_037;
1145 h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
1146 h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
1147 h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
1148 (h as usize) % self.sparse.len()
1149 }
1150 }
1151
1152 struct ByteClassSet([bool; 256]);
1153
1154 impl ByteClassSet {
new() -> Self1155 fn new() -> Self {
1156 ByteClassSet([false; 256])
1157 }
1158
set_range(&mut self, start: u8, end: u8)1159 fn set_range(&mut self, start: u8, end: u8) {
1160 debug_assert!(start <= end);
1161 if start > 0 {
1162 self.0[start as usize - 1] = true;
1163 }
1164 self.0[end as usize] = true;
1165 }
1166
set_word_boundary(&mut self)1167 fn set_word_boundary(&mut self) {
1168 // We need to mark all ranges of bytes whose pairs result in
1169 // evaluating \b differently.
1170 let iswb = is_word_byte;
1171 let mut b1: u16 = 0;
1172 let mut b2: u16;
1173 while b1 <= 255 {
1174 b2 = b1 + 1;
1175 while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
1176 b2 += 1;
1177 }
1178 self.set_range(b1 as u8, (b2 - 1) as u8);
1179 b1 = b2;
1180 }
1181 }
1182
byte_classes(&self) -> Vec<u8>1183 fn byte_classes(&self) -> Vec<u8> {
1184 // N.B. If you're debugging the DFA, it's useful to simply return
1185 // `(0..256).collect()`, which effectively removes the byte classes
1186 // and makes the transitions easier to read.
1187 // (0usize..256).map(|x| x as u8).collect()
1188 let mut byte_classes = vec![0; 256];
1189 let mut class = 0u8;
1190 let mut i = 0;
1191 loop {
1192 byte_classes[i] = class as u8;
1193 if i >= 255 {
1194 break;
1195 }
1196 if self.0[i] {
1197 class = class.checked_add(1).unwrap();
1198 }
1199 i += 1;
1200 }
1201 byte_classes
1202 }
1203 }
1204
impl fmt::Debug for ByteClassSet {
    // Formats as `ByteClassSet([...])`. The inner array is viewed through a
    // slice (`&&self.0[..]`) — presumably to sidestep the historical lack
    // of a `Debug` impl for arrays longer than 32 elements; TODO confirm
    // against the crate's minimum supported Rust version.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish()
    }
}
1210
u32_to_usize(n: u32) -> usize1211 fn u32_to_usize(n: u32) -> usize {
1212 // In case usize is less than 32 bits, we need to guard against overflow.
1213 // On most platforms this compiles to nothing.
1214 // TODO Use `std::convert::TryFrom` once it's stable.
1215 if (n as u64) > (::std::usize::MAX as u64) {
1216 panic!("BUG: {} is too big to be pointer sized", n)
1217 }
1218 n as usize
1219 }
1220
#[cfg(test)]
mod tests {
    use super::ByteClassSet;

    // A single range splits the byte space into three classes:
    // bytes below it, the range itself, and bytes above it. Two disjoint
    // ranges yield four classes.
    #[test]
    fn byte_classes() {
        let mut set = ByteClassSet::new();
        set.set_range(b'a', b'z');
        let classes = set.byte_classes();
        assert_eq!(classes[0], 0);
        assert_eq!(classes[1], 0);
        assert_eq!(classes[2], 0);
        assert_eq!(classes[b'a' as usize - 1], 0);
        assert_eq!(classes[b'a' as usize], 1);
        assert_eq!(classes[b'm' as usize], 1);
        assert_eq!(classes[b'z' as usize], 1);
        assert_eq!(classes[b'z' as usize + 1], 2);
        assert_eq!(classes[254], 2);
        assert_eq!(classes[255], 2);

        let mut set = ByteClassSet::new();
        set.set_range(0, 2);
        set.set_range(4, 6);
        let classes = set.byte_classes();
        assert_eq!(classes[0], 0);
        assert_eq!(classes[1], 0);
        assert_eq!(classes[2], 0);
        assert_eq!(classes[3], 1);
        assert_eq!(classes[4], 2);
        assert_eq!(classes[5], 2);
        assert_eq!(classes[6], 2);
        assert_eq!(classes[7], 3);
        assert_eq!(classes[255], 3);
    }

    // Setting every byte as its own range exercises the maximum number of
    // classes without overflowing the u8 class counter.
    #[test]
    fn full_byte_classes() {
        let mut set = ByteClassSet::new();
        for i in 0..256u16 {
            set.set_range(i as u8, i as u8);
        }
        assert_eq!(set.byte_classes().len(), 256);
    }
}
1265