1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2023 Google LLC. All rights reserved.
3 //
4 // Use of this source code is governed by a BSD-style
5 // license that can be found in the LICENSE file or at
6 // https://developers.google.com/open-source/licenses/bsd
7
8 //! Lossy UTF-8 processing utilities.
9 #![deny(unsafe_op_in_unsafe_fn)]
10
11 // TODO: Replace this with the `std` versions once stable.
12 // This is adapted from https://github.com/rust-lang/rust/blob/e8ee0b7/library/core/src/str/lossy.rs
13 // The adaptations:
14 // - remove `#[unstable]` attributes.
15 // - replace `crate`/`super` paths with their `std` equivalents in code and
16 // examples.
17 // - include `UTF8_CHAR_WIDTH`/`utf8_char_width` from `core::str::validations`.
18 // - use a custom `split_at_unchecked` instead of the nightly one
19
20 use std::fmt;
21 use std::fmt::Formatter;
22 use std::fmt::Write;
23 use std::iter::FusedIterator;
24 use std::str::from_utf8_unchecked;
25
/// One step of lossy UTF-8 decoding, as yielded by the [`Utf8Chunks`]
/// iterator.
///
/// A `Utf8Chunk` pairs a run of well-formed UTF-8 with the malformed byte
/// sequence (if any) that immediately follows it in the input.
///
/// # Examples
///
/// ```ignore
/// use googletest::prelude::*;
/// use utf8::Utf8Chunks;
///
/// // Input containing a truncated four-byte sequence.
/// let bytes = b"foo\xF1\x80bar";
///
/// // Pull the first `Utf8Chunk` out of the iterator.
/// let chunk = Utf8Chunks::new(bytes).next().unwrap();
///
/// // Everything before the broken sequence decodes cleanly...
/// assert_that!("foo", eq(chunk.valid()));
///
/// // ...and the broken sequence itself is reported separately.
/// assert_that!(b"\xF1\x80", eq(chunk.invalid()));
/// ```
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Utf8Chunk<'a> {
    /// Well-formed UTF-8 at the start of this chunk; may be empty.
    valid: &'a str,
    /// Malformed bytes that follow `valid`; empty for the final chunk.
    invalid: &'a [u8],
}

impl<'a> Utf8Chunk<'a> {
    /// Returns the validated UTF-8 portion of this chunk.
    ///
    /// The substring is empty when the input starts with a broken character
    /// or when two broken characters are adjacent.
    #[must_use]
    pub fn valid(&self) -> &'a str {
        self.valid
    }

    /// Returns the byte sequence that stopped validation.
    ///
    /// The slice is at most 3 bytes long and begins right after the
    /// substring given by [`valid`]. Decoding resumes after this sequence.
    ///
    /// An empty slice means this is the last chunk of the input. A non-empty
    /// slice means an unexpected byte was found or the input ended in the
    /// middle of a character.
    ///
    /// Lossy decoding would substitute [`U+FFFD REPLACEMENT CHARACTER`] for
    /// this sequence.
    ///
    /// [`valid`]: Self::valid
    /// [`U+FFFD REPLACEMENT CHARACTER`]: std::char::REPLACEMENT_CHARACTER
    #[must_use]
    pub fn invalid(&self) -> &'a [u8] {
        self.invalid
    }
}
84
85 #[must_use]
86 pub struct Debug<'a>(&'a [u8]);
87
88 impl fmt::Debug for Debug<'_> {
fmt(&self, f: &mut Formatter<'_>) -> fmt::Result89 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
90 f.write_char('"')?;
91
92 for chunk in Utf8Chunks::new(self.0) {
93 // Valid part.
94 // Here we partially parse UTF-8 again which is suboptimal.
95 {
96 let valid = chunk.valid();
97 let mut from = 0;
98 for (i, c) in valid.char_indices() {
99 let esc = c.escape_debug();
100 // If char needs escaping, flush backlog so far and write, else skip
101 if esc.len() != 1 {
102 f.write_str(&valid[from..i])?;
103 for c in esc {
104 f.write_char(c)?;
105 }
106 from = i + c.len_utf8();
107 }
108 }
109 f.write_str(&valid[from..])?;
110 }
111
112 // Broken parts of string as hex escape.
113 for &b in chunk.invalid() {
114 write!(f, "\\x{:02X}", b)?;
115 }
116 }
117
118 f.write_char('"')
119 }
120 }
121
122 /// An iterator used to decode a slice of mostly UTF-8 bytes to string slices
123 /// ([`&str`]) and byte slices ([`&[u8]`][byteslice]).
124 ///
125 /// If you want a simple conversion from UTF-8 byte slices to string slices,
126 /// [`from_utf8`] is easier to use.
127 ///
128 /// [byteslice]: slice
129 /// [`from_utf8`]: std::str::from_utf8
130 ///
131 /// # Examples
132 ///
133 /// This can be used to create functionality similar to
134 /// [`String::from_utf8_lossy`] without allocating heap memory:
135 ///
136 /// ```ignore
137 /// use utf8::Utf8Chunks;
138 ///
139 /// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) {
140 /// for chunk in Utf8Chunks::new(input) {
141 /// push(chunk.valid());
142 ///
143 /// if !chunk.invalid().is_empty() {
144 /// push("\u{FFFD}");
145 /// }
146 /// }
147 /// }
148 /// ```
149 #[must_use = "iterators are lazy and do nothing unless consumed"]
150 #[derive(Clone)]
151 pub struct Utf8Chunks<'a> {
152 source: &'a [u8],
153 }
154
155 impl<'a> Utf8Chunks<'a> {
156 /// Creates a new iterator to decode the bytes.
new(bytes: &'a [u8]) -> Self157 pub fn new(bytes: &'a [u8]) -> Self {
158 Self { source: bytes }
159 }
160
161 #[doc(hidden)]
debug(&self) -> Debug<'_>162 pub fn debug(&self) -> Debug<'_> {
163 Debug(self.source)
164 }
165 }
166
167 impl<'a> Iterator for Utf8Chunks<'a> {
168 type Item = Utf8Chunk<'a>;
169
next(&mut self) -> Option<Utf8Chunk<'a>>170 fn next(&mut self) -> Option<Utf8Chunk<'a>> {
171 if self.source.is_empty() {
172 return None;
173 }
174
175 const TAG_CONT_U8: u8 = 128;
176 fn safe_get(xs: &[u8], i: usize) -> u8 {
177 *xs.get(i).unwrap_or(&0)
178 }
179
180 let mut i = 0;
181 let mut valid_up_to = 0;
182 while i < self.source.len() {
183 // SAFETY: `i < self.source.len()` per previous line.
184 // For some reason the following are both significantly slower:
185 // while let Some(&byte) = self.source.get(i) {
186 // while let Some(byte) = self.source.get(i).copied() {
187 let byte = unsafe { *self.source.get_unchecked(i) };
188 i += 1;
189
190 if byte < 128 {
191 // This could be a `1 => ...` case in the match below, but for
192 // the common case of all-ASCII inputs, we bypass loading the
193 // sizeable UTF8_CHAR_WIDTH table into cache.
194 } else {
195 let w = utf8_char_width(byte);
196
197 match w {
198 2 => {
199 if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
200 break;
201 }
202 i += 1;
203 }
204 3 => {
205 match (byte, safe_get(self.source, i)) {
206 (0xE0, 0xA0..=0xBF) => (),
207 (0xE1..=0xEC, 0x80..=0xBF) => (),
208 (0xED, 0x80..=0x9F) => (),
209 (0xEE..=0xEF, 0x80..=0xBF) => (),
210 _ => break,
211 }
212 i += 1;
213 if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
214 break;
215 }
216 i += 1;
217 }
218 4 => {
219 match (byte, safe_get(self.source, i)) {
220 (0xF0, 0x90..=0xBF) => (),
221 (0xF1..=0xF3, 0x80..=0xBF) => (),
222 (0xF4, 0x80..=0x8F) => (),
223 _ => break,
224 }
225 i += 1;
226 if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
227 break;
228 }
229 i += 1;
230 if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
231 break;
232 }
233 i += 1;
234 }
235 _ => break,
236 }
237 }
238
239 valid_up_to = i;
240 }
241
242 /// # Safety
243 /// `index` must be in-bounds for `x`
244 unsafe fn split_at_unchecked(x: &[u8], index: usize) -> (&[u8], &[u8]) {
245 // SAFETY: in-bounds as promised by the caller
246 unsafe { (x.get_unchecked(..index), x.get_unchecked(index..)) }
247 }
248
249 // SAFETY: `i <= self.source.len()` because it is only ever incremented
250 // via `i += 1` and in between every single one of those increments, `i`
251 // is compared against `self.source.len()`. That happens either
252 // literally by `i < self.source.len()` in the while-loop's condition,
253 // or indirectly by `safe_get(self.source, i) & 192 != TAG_CONT_U8`. The
254 // loop is terminated as soon as the latest `i += 1` has made `i` no
255 // longer less than `self.source.len()`, which means it'll be at most
256 // equal to `self.source.len()`.
257 let (inspected, remaining) = unsafe { split_at_unchecked(self.source, i) };
258 self.source = remaining;
259
260 // SAFETY: `valid_up_to <= i` because it is only ever assigned via
261 // `valid_up_to = i` and `i` only increases.
262 let (valid, invalid) = unsafe { split_at_unchecked(inspected, valid_up_to) };
263
264 Some(Utf8Chunk {
265 // SAFETY: All bytes up to `valid_up_to` are valid UTF-8.
266 valid: unsafe { from_utf8_unchecked(valid) },
267 invalid,
268 })
269 }
270 }
271
// `next` returns `None` whenever the remaining input is empty, and the
// remaining input only ever shrinks, so the iterator keeps returning `None`
// after the first `None`.
impl FusedIterator for Utf8Chunks<'_> {}
273
274 impl fmt::Debug for Utf8Chunks<'_> {
fmt(&self, f: &mut Formatter<'_>) -> fmt::Result275 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
276 f.debug_struct("Utf8Chunks").field("source", &self.debug()).finish()
277 }
278 }
279
// https://tools.ietf.org/html/rfc3629
//
// Maps every possible first byte to the length in bytes of the UTF-8
// character it introduces; `0` marks bytes that cannot start a character.
const UTF8_CHAR_WIDTH: &[u8; 256] = {
    // Computed at compile time; the resulting table is identical to writing
    // out all 256 entries by hand.
    const WIDTHS: [u8; 256] = {
        let mut widths = [0u8; 256];
        let mut first = 0usize;
        while first < 256 {
            widths[first] = match first {
                // ASCII.
                0x00..=0x7F => 1,
                // Two-byte leads (0xC0 and 0xC1 are never valid).
                0xC2..=0xDF => 2,
                // Three-byte leads.
                0xE0..=0xEF => 3,
                // Four-byte leads (0xF5 and above are never valid).
                0xF0..=0xF4 => 4,
                // Continuation bytes and invalid leads.
                _ => 0,
            };
            first += 1;
        }
        widths
    };
    &WIDTHS
};
300
301 /// Given a first byte, determines how many bytes are in this UTF-8 character.
302 #[must_use]
303 #[inline]
utf8_char_width(b: u8) -> usize304 const fn utf8_char_width(b: u8) -> usize {
305 UTF8_CHAR_WIDTH[b as usize] as usize
306 }
307