1 /*!
2 Converts ranges of Unicode scalar values to equivalent ranges of UTF-8 bytes.
3
4 This is sub-module is useful for constructing byte based automatons that need
5 to embed UTF-8 decoding. The most common use of this module is in conjunction
6 with the [`hir::ClassUnicodeRange`](../hir/struct.ClassUnicodeRange.html) type.
7
8 See the documentation on the `Utf8Sequences` iterator for more details and
9 an example.
10
11 # Wait, what is this?
12
13 This is simplest to explain with an example. Let's say you wanted to test
14 whether a particular byte sequence was a Cyrillic character. One possible
15 scalar value range is `[0400-04FF]`. The set of allowed bytes for this
16 range can be expressed as a sequence of byte ranges:
17
18 ```text
19 [D0-D3][80-BF]
20 ```
21
22 This is simple enough: simply encode the boundaries, `0400` encodes to
23 `D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each
24 corresponding pair of bytes: `D0` to `D3` and `80` to `BF`.
25
26 However, what if you wanted to add the Cyrillic Supplementary characters to
27 your range? Your range might then become `[0400-052F]`. The same procedure
28 as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges
29 you'd get from the previous transformation would be `[D0-D4][80-AF]`. However,
30 this isn't quite correct because this range doesn't capture many characters,
31 for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`).
32
33 Instead, you need multiple sequences of byte ranges:
34
35 ```text
36 [D0-D3][80-BF] # matches codepoints 0400-04FF
37 [D4][80-AF] # matches codepoints 0500-052F
38 ```
39
40 This gets even more complicated if you want bigger ranges, particularly if
41 they naively contain surrogate codepoints. For example, the sequence of byte
42 ranges for the basic multilingual plane (`[0000-FFFF]`) look like this:
43
44 ```text
45 [0-7F]
46 [C2-DF][80-BF]
47 [E0][A0-BF][80-BF]
48 [E1-EC][80-BF][80-BF]
49 [ED][80-9F][80-BF]
50 [EE-EF][80-BF][80-BF]
51 ```
52
53 Note that the byte ranges above will *not* match any erroneous encoding of
54 UTF-8, including encodings of surrogate codepoints.
55
56 And, of course, for all of Unicode (`[000000-10FFFF]`):
57
58 ```text
59 [0-7F]
60 [C2-DF][80-BF]
61 [E0][A0-BF][80-BF]
62 [E1-EC][80-BF][80-BF]
63 [ED][80-9F][80-BF]
64 [EE-EF][80-BF][80-BF]
65 [F0][90-BF][80-BF][80-BF]
66 [F1-F3][80-BF][80-BF][80-BF]
67 [F4][80-8F][80-BF][80-BF]
68 ```
69
70 This module automates the process of creating these byte ranges from ranges of
71 Unicode scalar values.
72
73 # Lineage
74
75 I got the idea and general implementation strategy from Russ Cox in his
76 [article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2.
77 Russ Cox got it from Ken Thompson's `grep` (no source, folk lore?).
78 I also got the idea from
79 [Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java),
80 which uses it for executing automata on their term index.
81 */
82
83 #![deny(missing_docs)]
84
85 use std::char;
86 use std::fmt;
87 use std::iter::FusedIterator;
88 use std::slice;
89
90 const MAX_UTF8_BYTES: usize = 4;
91
92 /// Utf8Sequence represents a sequence of byte ranges.
93 ///
94 /// To match a Utf8Sequence, a candidate byte sequence must match each
95 /// successive range.
96 ///
97 /// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte
98 /// sequence `\xDD\x61` would not match because `0x61 < 0x80`.
99 #[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
100 pub enum Utf8Sequence {
101 /// One byte range.
102 One(Utf8Range),
103 /// Two successive byte ranges.
104 Two([Utf8Range; 2]),
105 /// Three successive byte ranges.
106 Three([Utf8Range; 3]),
107 /// Four successive byte ranges.
108 Four([Utf8Range; 4]),
109 }
110
111 impl Utf8Sequence {
112 /// Creates a new UTF-8 sequence from the encoded bytes of a scalar value
113 /// range.
114 ///
115 /// This assumes that `start` and `end` have the same length.
from_encoded_range(start: &[u8], end: &[u8]) -> Self116 fn from_encoded_range(start: &[u8], end: &[u8]) -> Self {
117 assert_eq!(start.len(), end.len());
118 match start.len() {
119 2 => Utf8Sequence::Two([
120 Utf8Range::new(start[0], end[0]),
121 Utf8Range::new(start[1], end[1]),
122 ]),
123 3 => Utf8Sequence::Three([
124 Utf8Range::new(start[0], end[0]),
125 Utf8Range::new(start[1], end[1]),
126 Utf8Range::new(start[2], end[2]),
127 ]),
128 4 => Utf8Sequence::Four([
129 Utf8Range::new(start[0], end[0]),
130 Utf8Range::new(start[1], end[1]),
131 Utf8Range::new(start[2], end[2]),
132 Utf8Range::new(start[3], end[3]),
133 ]),
134 n => unreachable!("invalid encoded length: {}", n),
135 }
136 }
137
138 /// Returns the underlying sequence of byte ranges as a slice.
as_slice(&self) -> &[Utf8Range]139 pub fn as_slice(&self) -> &[Utf8Range] {
140 use self::Utf8Sequence::*;
141 match *self {
142 One(ref r) => slice::from_ref(r),
143 Two(ref r) => &r[..],
144 Three(ref r) => &r[..],
145 Four(ref r) => &r[..],
146 }
147 }
148
149 /// Returns the number of byte ranges in this sequence.
150 ///
151 /// The length is guaranteed to be in the closed interval `[1, 4]`.
len(&self) -> usize152 pub fn len(&self) -> usize {
153 self.as_slice().len()
154 }
155
156 /// Reverses the ranges in this sequence.
157 ///
158 /// For example, if this corresponds to the following sequence:
159 ///
160 /// ```text
161 /// [D0-D3][80-BF]
162 /// ```
163 ///
164 /// Then after reversal, it will be
165 ///
166 /// ```text
167 /// [80-BF][D0-D3]
168 /// ```
169 ///
170 /// This is useful when one is constructing a UTF-8 automaton to match
171 /// character classes in reverse.
reverse(&mut self)172 pub fn reverse(&mut self) {
173 match *self {
174 Utf8Sequence::One(_) => {}
175 Utf8Sequence::Two(ref mut x) => x.reverse(),
176 Utf8Sequence::Three(ref mut x) => x.reverse(),
177 Utf8Sequence::Four(ref mut x) => x.reverse(),
178 }
179 }
180
181 /// Returns true if and only if a prefix of `bytes` matches this sequence
182 /// of byte ranges.
matches(&self, bytes: &[u8]) -> bool183 pub fn matches(&self, bytes: &[u8]) -> bool {
184 if bytes.len() < self.len() {
185 return false;
186 }
187 for (&b, r) in bytes.iter().zip(self) {
188 if !r.matches(b) {
189 return false;
190 }
191 }
192 true
193 }
194 }
195
196 impl<'a> IntoIterator for &'a Utf8Sequence {
197 type IntoIter = slice::Iter<'a, Utf8Range>;
198 type Item = &'a Utf8Range;
199
into_iter(self) -> Self::IntoIter200 fn into_iter(self) -> Self::IntoIter {
201 self.as_slice().iter()
202 }
203 }
204
205 impl fmt::Debug for Utf8Sequence {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result206 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
207 use self::Utf8Sequence::*;
208 match *self {
209 One(ref r) => write!(f, "{:?}", r),
210 Two(ref r) => write!(f, "{:?}{:?}", r[0], r[1]),
211 Three(ref r) => write!(f, "{:?}{:?}{:?}", r[0], r[1], r[2]),
212 Four(ref r) => {
213 write!(f, "{:?}{:?}{:?}{:?}", r[0], r[1], r[2], r[3])
214 }
215 }
216 }
217 }
218
219 /// A single inclusive range of UTF-8 bytes.
220 #[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
221 pub struct Utf8Range {
222 /// Start of byte range (inclusive).
223 pub start: u8,
224 /// End of byte range (inclusive).
225 pub end: u8,
226 }
227
228 impl Utf8Range {
new(start: u8, end: u8) -> Self229 fn new(start: u8, end: u8) -> Self {
230 Utf8Range { start, end }
231 }
232
233 /// Returns true if and only if the given byte is in this range.
matches(&self, b: u8) -> bool234 pub fn matches(&self, b: u8) -> bool {
235 self.start <= b && b <= self.end
236 }
237 }
238
239 impl fmt::Debug for Utf8Range {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result240 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
241 if self.start == self.end {
242 write!(f, "[{:X}]", self.start)
243 } else {
244 write!(f, "[{:X}-{:X}]", self.start, self.end)
245 }
246 }
247 }
248
249 /// An iterator over ranges of matching UTF-8 byte sequences.
250 ///
251 /// The iteration represents an alternation of comprehensive byte sequences
252 /// that match precisely the set of UTF-8 encoded scalar values.
253 ///
254 /// A byte sequence corresponds to one of the scalar values in the range given
255 /// if and only if it completely matches exactly one of the sequences of byte
256 /// ranges produced by this iterator.
257 ///
258 /// Each sequence of byte ranges matches a unique set of bytes. That is, no two
259 /// sequences will match the same bytes.
260 ///
261 /// # Example
262 ///
263 /// This shows how to match an arbitrary byte sequence against a range of
264 /// scalar values.
265 ///
266 /// ```rust
267 /// use regex_syntax::utf8::{Utf8Sequences, Utf8Sequence};
268 ///
269 /// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool {
270 /// for range in seqs {
271 /// if range.matches(bytes) {
272 /// return true;
273 /// }
274 /// }
275 /// false
276 /// }
277 ///
278 /// // Test the basic multilingual plane.
279 /// let seqs: Vec<_> = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect();
280 ///
281 /// // UTF-8 encoding of 'a'.
282 /// assert!(matches(&seqs, &[0x61]));
283 /// // UTF-8 encoding of '☃' (`\u{2603}`).
284 /// assert!(matches(&seqs, &[0xE2, 0x98, 0x83]));
285 /// // UTF-8 encoding of `\u{10348}` (outside the BMP).
286 /// assert!(!matches(&seqs, &[0xF0, 0x90, 0x8D, 0x88]));
287 /// // Tries to match against a UTF-8 encoding of a surrogate codepoint,
288 /// // which is invalid UTF-8, and therefore fails, despite the fact that
289 /// // the corresponding codepoint (0xD800) falls in the range given.
290 /// assert!(!matches(&seqs, &[0xED, 0xA0, 0x80]));
291 /// // And fails against plain old invalid UTF-8.
292 /// assert!(!matches(&seqs, &[0xFF, 0xFF]));
293 /// ```
294 ///
295 /// If this example seems circuitous, that's because it is! It's meant to be
296 /// illustrative. In practice, you could just try to decode your byte sequence
297 /// and compare it with the scalar value range directly. However, this is not
298 /// always possible (for example, in a byte based automaton).
299 #[derive(Debug)]
300 pub struct Utf8Sequences {
301 range_stack: Vec<ScalarRange>,
302 }
303
304 impl Utf8Sequences {
305 /// Create a new iterator over UTF-8 byte ranges for the scalar value range
306 /// given.
new(start: char, end: char) -> Self307 pub fn new(start: char, end: char) -> Self {
308 let mut it = Utf8Sequences { range_stack: vec![] };
309 it.push(start as u32, end as u32);
310 it
311 }
312
313 /// reset resets the scalar value range.
314 /// Any existing state is cleared, but resources may be reused.
315 ///
316 /// N.B. Benchmarks say that this method is dubious.
317 #[doc(hidden)]
reset(&mut self, start: char, end: char)318 pub fn reset(&mut self, start: char, end: char) {
319 self.range_stack.clear();
320 self.push(start as u32, end as u32);
321 }
322
push(&mut self, start: u32, end: u32)323 fn push(&mut self, start: u32, end: u32) {
324 self.range_stack.push(ScalarRange { start, end });
325 }
326 }
327
328 struct ScalarRange {
329 start: u32,
330 end: u32,
331 }
332
333 impl fmt::Debug for ScalarRange {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result334 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
335 write!(f, "ScalarRange({:X}, {:X})", self.start, self.end)
336 }
337 }
338
339 impl Iterator for Utf8Sequences {
340 type Item = Utf8Sequence;
341
next(&mut self) -> Option<Self::Item>342 fn next(&mut self) -> Option<Self::Item> {
343 'TOP: while let Some(mut r) = self.range_stack.pop() {
344 'INNER: loop {
345 if let Some((r1, r2)) = r.split() {
346 self.push(r2.start, r2.end);
347 r.start = r1.start;
348 r.end = r1.end;
349 continue 'INNER;
350 }
351 if !r.is_valid() {
352 continue 'TOP;
353 }
354 for i in 1..MAX_UTF8_BYTES {
355 let max = max_scalar_value(i);
356 if r.start <= max && max < r.end {
357 self.push(max + 1, r.end);
358 r.end = max;
359 continue 'INNER;
360 }
361 }
362 if let Some(ascii_range) = r.as_ascii() {
363 return Some(Utf8Sequence::One(ascii_range));
364 }
365 for i in 1..MAX_UTF8_BYTES {
366 let m = (1 << (6 * i)) - 1;
367 if (r.start & !m) != (r.end & !m) {
368 if (r.start & m) != 0 {
369 self.push((r.start | m) + 1, r.end);
370 r.end = r.start | m;
371 continue 'INNER;
372 }
373 if (r.end & m) != m {
374 self.push(r.end & !m, r.end);
375 r.end = (r.end & !m) - 1;
376 continue 'INNER;
377 }
378 }
379 }
380 let mut start = [0; MAX_UTF8_BYTES];
381 let mut end = [0; MAX_UTF8_BYTES];
382 let n = r.encode(&mut start, &mut end);
383 return Some(Utf8Sequence::from_encoded_range(
384 &start[0..n],
385 &end[0..n],
386 ));
387 }
388 }
389 None
390 }
391 }
392
393 impl FusedIterator for Utf8Sequences {}
394
395 impl ScalarRange {
396 /// split splits this range if it overlaps with a surrogate codepoint.
397 ///
398 /// Either or both ranges may be invalid.
split(&self) -> Option<(ScalarRange, ScalarRange)>399 fn split(&self) -> Option<(ScalarRange, ScalarRange)> {
400 if self.start < 0xE000 && self.end > 0xD7FF {
401 Some((
402 ScalarRange { start: self.start, end: 0xD7FF },
403 ScalarRange { start: 0xE000, end: self.end },
404 ))
405 } else {
406 None
407 }
408 }
409
410 /// is_valid returns true if and only if start <= end.
is_valid(&self) -> bool411 fn is_valid(&self) -> bool {
412 self.start <= self.end
413 }
414
415 /// as_ascii returns this range as a Utf8Range if and only if all scalar
416 /// values in this range can be encoded as a single byte.
as_ascii(&self) -> Option<Utf8Range>417 fn as_ascii(&self) -> Option<Utf8Range> {
418 if self.is_ascii() {
419 Some(Utf8Range::new(self.start as u8, self.end as u8))
420 } else {
421 None
422 }
423 }
424
425 /// is_ascii returns true if the range is ASCII only (i.e., takes a single
426 /// byte to encode any scalar value).
is_ascii(&self) -> bool427 fn is_ascii(&self) -> bool {
428 self.is_valid() && self.end <= 0x7f
429 }
430
431 /// encode writes the UTF-8 encoding of the start and end of this range
432 /// to the corresponding destination slices, and returns the number of
433 /// bytes written.
434 ///
435 /// The slices should have room for at least `MAX_UTF8_BYTES`.
encode(&self, start: &mut [u8], end: &mut [u8]) -> usize436 fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize {
437 let cs = char::from_u32(self.start).unwrap();
438 let ce = char::from_u32(self.end).unwrap();
439 let ss = cs.encode_utf8(start);
440 let se = ce.encode_utf8(end);
441 assert_eq!(ss.len(), se.len());
442 ss.len()
443 }
444 }
445
max_scalar_value(nbytes: usize) -> u32446 fn max_scalar_value(nbytes: usize) -> u32 {
447 match nbytes {
448 1 => 0x007F,
449 2 => 0x07FF,
450 3 => 0xFFFF,
451 4 => 0x0010_FFFF,
452 _ => unreachable!("invalid UTF-8 byte sequence size"),
453 }
454 }
455
456 #[cfg(test)]
457 mod tests {
458 use std::char;
459
460 use crate::utf8::{Utf8Range, Utf8Sequences};
461
rutf8(s: u8, e: u8) -> Utf8Range462 fn rutf8(s: u8, e: u8) -> Utf8Range {
463 Utf8Range::new(s, e)
464 }
465
never_accepts_surrogate_codepoints(start: char, end: char)466 fn never_accepts_surrogate_codepoints(start: char, end: char) {
467 for cp in 0xD800..0xE000 {
468 let buf = encode_surrogate(cp);
469 for r in Utf8Sequences::new(start, end) {
470 if r.matches(&buf) {
471 panic!(
472 "Sequence ({:X}, {:X}) contains range {:?}, \
473 which matches surrogate code point {:X} \
474 with encoded bytes {:?}",
475 start as u32, end as u32, r, cp, buf,
476 );
477 }
478 }
479 }
480 }
481
482 #[test]
codepoints_no_surrogates()483 fn codepoints_no_surrogates() {
484 never_accepts_surrogate_codepoints('\u{0}', '\u{FFFF}');
485 never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFF}');
486 never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFE}');
487 never_accepts_surrogate_codepoints('\u{80}', '\u{10FFFF}');
488 never_accepts_surrogate_codepoints('\u{D7FF}', '\u{E000}');
489 }
490
491 #[test]
single_codepoint_one_sequence()492 fn single_codepoint_one_sequence() {
493 // Tests that every range of scalar values that contains a single
494 // scalar value is recognized by one sequence of byte ranges.
495 for i in 0x0..=0x0010_FFFF {
496 let c = match char::from_u32(i) {
497 None => continue,
498 Some(c) => c,
499 };
500 let seqs: Vec<_> = Utf8Sequences::new(c, c).collect();
501 assert_eq!(seqs.len(), 1);
502 }
503 }
504
505 #[test]
bmp()506 fn bmp() {
507 use crate::utf8::Utf8Sequence::*;
508
509 let seqs = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect::<Vec<_>>();
510 assert_eq!(
511 seqs,
512 vec![
513 One(rutf8(0x0, 0x7F)),
514 Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]),
515 Three([
516 rutf8(0xE0, 0xE0),
517 rutf8(0xA0, 0xBF),
518 rutf8(0x80, 0xBF)
519 ]),
520 Three([
521 rutf8(0xE1, 0xEC),
522 rutf8(0x80, 0xBF),
523 rutf8(0x80, 0xBF)
524 ]),
525 Three([
526 rutf8(0xED, 0xED),
527 rutf8(0x80, 0x9F),
528 rutf8(0x80, 0xBF)
529 ]),
530 Three([
531 rutf8(0xEE, 0xEF),
532 rutf8(0x80, 0xBF),
533 rutf8(0x80, 0xBF)
534 ]),
535 ]
536 );
537 }
538
539 #[test]
reverse()540 fn reverse() {
541 use crate::utf8::Utf8Sequence::*;
542
543 let mut s = One(rutf8(0xA, 0xB));
544 s.reverse();
545 assert_eq!(s.as_slice(), &[rutf8(0xA, 0xB)]);
546
547 let mut s = Two([rutf8(0xA, 0xB), rutf8(0xB, 0xC)]);
548 s.reverse();
549 assert_eq!(s.as_slice(), &[rutf8(0xB, 0xC), rutf8(0xA, 0xB)]);
550
551 let mut s = Three([rutf8(0xA, 0xB), rutf8(0xB, 0xC), rutf8(0xC, 0xD)]);
552 s.reverse();
553 assert_eq!(
554 s.as_slice(),
555 &[rutf8(0xC, 0xD), rutf8(0xB, 0xC), rutf8(0xA, 0xB)]
556 );
557
558 let mut s = Four([
559 rutf8(0xA, 0xB),
560 rutf8(0xB, 0xC),
561 rutf8(0xC, 0xD),
562 rutf8(0xD, 0xE),
563 ]);
564 s.reverse();
565 assert_eq!(
566 s.as_slice(),
567 &[
568 rutf8(0xD, 0xE),
569 rutf8(0xC, 0xD),
570 rutf8(0xB, 0xC),
571 rutf8(0xA, 0xB)
572 ]
573 );
574 }
575
encode_surrogate(cp: u32) -> [u8; 3]576 fn encode_surrogate(cp: u32) -> [u8; 3] {
577 const TAG_CONT: u8 = 0b1000_0000;
578 const TAG_THREE_B: u8 = 0b1110_0000;
579
580 assert!(0xD800 <= cp && cp < 0xE000);
581 let mut dst = [0; 3];
582 dst[0] = (cp >> 12 & 0x0F) as u8 | TAG_THREE_B;
583 dst[1] = (cp >> 6 & 0x3F) as u8 | TAG_CONT;
584 dst[2] = (cp & 0x3F) as u8 | TAG_CONT;
585 dst
586 }
587 }
588