1 /// A few elementary UTF-8 encoding and decoding functions used by the matching
2 /// engines.
3 ///
4 /// In an ideal world, the matching engines operate on `&str` and we can just
5 /// lean on the standard library for all our UTF-8 needs. However, to support
6 /// byte based regexes (that can match on arbitrary bytes which may contain
7 /// UTF-8), we need to be capable of searching and decoding UTF-8 on a `&[u8]`.
8 /// The standard library doesn't really recognize this use case, so we have
9 /// to build it out ourselves.
10 ///
11 /// Should this be factored out into a separate crate? It seems independently
12 /// useful. There are other crates that already exist (e.g., `utf-8`) that have
13 /// overlapping use cases. Not sure what to do.
14 use std::char;
15
16 const TAG_CONT: u8 = 0b1000_0000;
17 const TAG_TWO: u8 = 0b1100_0000;
18 const TAG_THREE: u8 = 0b1110_0000;
19 const TAG_FOUR: u8 = 0b1111_0000;
20
21 /// Returns the smallest possible index of the next valid UTF-8 sequence
22 /// starting after `i`.
next_utf8(text: &[u8], i: usize) -> usize23 pub fn next_utf8(text: &[u8], i: usize) -> usize {
24 let b = match text.get(i) {
25 None => return i + 1,
26 Some(&b) => b,
27 };
28 let inc = if b <= 0x7F {
29 1
30 } else if b <= 0b110_11111 {
31 2
32 } else if b <= 0b1110_1111 {
33 3
34 } else {
35 4
36 };
37 i + inc
38 }
39
40 /// Decode a single UTF-8 sequence into a single Unicode codepoint from `src`.
41 ///
42 /// If no valid UTF-8 sequence could be found, then `None` is returned.
43 /// Otherwise, the decoded codepoint and the number of bytes read is returned.
44 /// The number of bytes read (for a valid UTF-8 sequence) is guaranteed to be
45 /// 1, 2, 3 or 4.
46 ///
47 /// Note that a UTF-8 sequence is invalid if it is incorrect UTF-8, encodes a
48 /// codepoint that is out of range (surrogate codepoints are out of range) or
49 /// is not the shortest possible UTF-8 sequence for that codepoint.
50 #[inline]
decode_utf8(src: &[u8]) -> Option<(char, usize)>51 pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
52 let b0 = match src.get(0) {
53 None => return None,
54 Some(&b) if b <= 0x7F => return Some((b as char, 1)),
55 Some(&b) => b,
56 };
57 match b0 {
58 0b110_00000..=0b110_11111 => {
59 if src.len() < 2 {
60 return None;
61 }
62 let b1 = src[1];
63 if 0b11_000000 & b1 != TAG_CONT {
64 return None;
65 }
66 let cp = ((b0 & !TAG_TWO) as u32) << 6 | ((b1 & !TAG_CONT) as u32);
67 match cp {
68 0x80..=0x7FF => char::from_u32(cp).map(|cp| (cp, 2)),
69 _ => None,
70 }
71 }
72 0b1110_0000..=0b1110_1111 => {
73 if src.len() < 3 {
74 return None;
75 }
76 let (b1, b2) = (src[1], src[2]);
77 if 0b11_000000 & b1 != TAG_CONT {
78 return None;
79 }
80 if 0b11_000000 & b2 != TAG_CONT {
81 return None;
82 }
83 let cp = ((b0 & !TAG_THREE) as u32) << 12
84 | ((b1 & !TAG_CONT) as u32) << 6
85 | ((b2 & !TAG_CONT) as u32);
86 match cp {
87 // char::from_u32 will disallow surrogate codepoints.
88 0x800..=0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)),
89 _ => None,
90 }
91 }
92 0b11110_000..=0b11110_111 => {
93 if src.len() < 4 {
94 return None;
95 }
96 let (b1, b2, b3) = (src[1], src[2], src[3]);
97 if 0b11_000000 & b1 != TAG_CONT {
98 return None;
99 }
100 if 0b11_000000 & b2 != TAG_CONT {
101 return None;
102 }
103 if 0b11_000000 & b3 != TAG_CONT {
104 return None;
105 }
106 let cp = ((b0 & !TAG_FOUR) as u32) << 18
107 | ((b1 & !TAG_CONT) as u32) << 12
108 | ((b2 & !TAG_CONT) as u32) << 6
109 | ((b3 & !TAG_CONT) as u32);
110 match cp {
111 0x10000..=0x0010_FFFF => char::from_u32(cp).map(|cp| (cp, 4)),
112 _ => None,
113 }
114 }
115 _ => None,
116 }
117 }
118
119 /// Like `decode_utf8`, but decodes the last UTF-8 sequence in `src` instead
120 /// of the first.
decode_last_utf8(src: &[u8]) -> Option<(char, usize)>121 pub fn decode_last_utf8(src: &[u8]) -> Option<(char, usize)> {
122 if src.is_empty() {
123 return None;
124 }
125 let mut start = src.len() - 1;
126 if src[start] <= 0x7F {
127 return Some((src[start] as char, 1));
128 }
129 while start > src.len().saturating_sub(4) {
130 start -= 1;
131 if is_start_byte(src[start]) {
132 break;
133 }
134 }
135 match decode_utf8(&src[start..]) {
136 None => None,
137 Some((_, n)) if n < src.len() - start => None,
138 Some((cp, n)) => Some((cp, n)),
139 }
140 }
141
is_start_byte(b: u8) -> bool142 fn is_start_byte(b: u8) -> bool {
143 b & 0b11_000000 != 0b1_0000000
144 }
145
146 #[cfg(test)]
147 mod tests {
148 use std::str;
149
150 use quickcheck::quickcheck;
151
152 use super::{
153 decode_last_utf8, decode_utf8, TAG_CONT, TAG_FOUR, TAG_THREE, TAG_TWO,
154 };
155
156 #[test]
prop_roundtrip()157 fn prop_roundtrip() {
158 fn p(given_cp: char) -> bool {
159 let mut tmp = [0; 4];
160 let encoded_len = given_cp.encode_utf8(&mut tmp).len();
161 let (got_cp, got_len) = decode_utf8(&tmp[..encoded_len]).unwrap();
162 encoded_len == got_len && given_cp == got_cp
163 }
164 quickcheck(p as fn(char) -> bool)
165 }
166
167 #[test]
prop_roundtrip_last()168 fn prop_roundtrip_last() {
169 fn p(given_cp: char) -> bool {
170 let mut tmp = [0; 4];
171 let encoded_len = given_cp.encode_utf8(&mut tmp).len();
172 let (got_cp, got_len) =
173 decode_last_utf8(&tmp[..encoded_len]).unwrap();
174 encoded_len == got_len && given_cp == got_cp
175 }
176 quickcheck(p as fn(char) -> bool)
177 }
178
179 #[test]
prop_encode_matches_std()180 fn prop_encode_matches_std() {
181 fn p(cp: char) -> bool {
182 let mut got = [0; 4];
183 let n = cp.encode_utf8(&mut got).len();
184 let expected = cp.to_string();
185 &got[..n] == expected.as_bytes()
186 }
187 quickcheck(p as fn(char) -> bool)
188 }
189
190 #[test]
prop_decode_matches_std()191 fn prop_decode_matches_std() {
192 fn p(given_cp: char) -> bool {
193 let mut tmp = [0; 4];
194 let n = given_cp.encode_utf8(&mut tmp).len();
195 let (got_cp, _) = decode_utf8(&tmp[..n]).unwrap();
196 let expected_cp =
197 str::from_utf8(&tmp[..n]).unwrap().chars().next().unwrap();
198 got_cp == expected_cp
199 }
200 quickcheck(p as fn(char) -> bool)
201 }
202
203 #[test]
prop_decode_last_matches_std()204 fn prop_decode_last_matches_std() {
205 fn p(given_cp: char) -> bool {
206 let mut tmp = [0; 4];
207 let n = given_cp.encode_utf8(&mut tmp).len();
208 let (got_cp, _) = decode_last_utf8(&tmp[..n]).unwrap();
209 let expected_cp = str::from_utf8(&tmp[..n])
210 .unwrap()
211 .chars()
212 .rev()
213 .next()
214 .unwrap();
215 got_cp == expected_cp
216 }
217 quickcheck(p as fn(char) -> bool)
218 }
219
220 #[test]
reject_invalid()221 fn reject_invalid() {
222 // Invalid start byte
223 assert_eq!(decode_utf8(&[0xFF]), None);
224 // Surrogate pair
225 assert_eq!(decode_utf8(&[0xED, 0xA0, 0x81]), None);
226 // Invalid continuation byte.
227 assert_eq!(decode_utf8(&[0xD4, 0xC2]), None);
228 // Bad lengths
229 assert_eq!(decode_utf8(&[0xC3]), None); // 2 bytes
230 assert_eq!(decode_utf8(&[0xEF, 0xBF]), None); // 3 bytes
231 assert_eq!(decode_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes
232 // Not a minimal UTF-8 sequence
233 assert_eq!(decode_utf8(&[TAG_TWO, TAG_CONT | b'a']), None);
234 assert_eq!(decode_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a']), None);
235 assert_eq!(
236 decode_utf8(&[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',]),
237 None
238 );
239 }
240
241 #[test]
reject_invalid_last()242 fn reject_invalid_last() {
243 // Invalid start byte
244 assert_eq!(decode_last_utf8(&[0xFF]), None);
245 // Surrogate pair
246 assert_eq!(decode_last_utf8(&[0xED, 0xA0, 0x81]), None);
247 // Bad lengths
248 assert_eq!(decode_last_utf8(&[0xC3]), None); // 2 bytes
249 assert_eq!(decode_last_utf8(&[0xEF, 0xBF]), None); // 3 bytes
250 assert_eq!(decode_last_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes
251 // Not a minimal UTF-8 sequence
252 assert_eq!(decode_last_utf8(&[TAG_TWO, TAG_CONT | b'a']), None);
253 assert_eq!(
254 decode_last_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a',]),
255 None
256 );
257 assert_eq!(
258 decode_last_utf8(
259 &[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',]
260 ),
261 None
262 );
263 }
264 }
265