• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// Protocol Buffers - Google's data interchange format
// Copyright 2023 Google LLC.  All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file or at
// https://developers.google.com/open-source/licenses/bsd
7 
//! Lossy UTF-8 processing utilities.
9 #![deny(unsafe_op_in_unsafe_fn)]
10 
// TODO: Replace this with the `std` versions once stable.
// This is adapted from https://github.com/rust-lang/rust/blob/e8ee0b7/library/core/src/str/lossy.rs
// The adaptations:
// - remove `#[unstable]` attributes.
// - replace `crate`/`super` paths with their `std` equivalents in code and
//   examples.
// - include `UTF8_CHAR_WIDTH`/`utf8_char_width` from `core::str::validations`.
// - use a custom `split_at_unchecked` instead of the nightly one.
19 
20 use std::fmt;
21 use std::fmt::Formatter;
22 use std::fmt::Write;
23 use std::iter::FusedIterator;
24 use std::str::from_utf8_unchecked;
25 
/// An item returned by the [`Utf8Chunks`] iterator.
///
/// A `Utf8Chunk` stores a sequence of [`u8`] up to the first broken character
/// when decoding a UTF-8 string.
///
/// # Examples
///
/// ```ignore
/// use googletest::prelude::*;
/// use utf8::Utf8Chunks;
///
/// // An invalid UTF-8 string
/// let bytes = b"foo\xF1\x80bar";
///
/// // Decode the first `Utf8Chunk`
/// let chunk = Utf8Chunks::new(bytes).next().unwrap();
///
/// // The first three characters are valid UTF-8
/// assert_that!("foo", eq(chunk.valid()));
///
/// // The fourth character is broken
/// assert_that!(b"\xF1\x80", eq(chunk.invalid()));
/// ```
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Utf8Chunk<'a> {
    /// The validated UTF-8 text at the start of this chunk (may be empty).
    valid: &'a str,
    /// The broken byte sequence that follows `valid`; empty on the last chunk.
    invalid: &'a [u8],
}

impl<'a> Utf8Chunk<'a> {
    /// Returns the next validated UTF-8 substring.
    ///
    /// This substring can be empty at the start of the string or between
    /// broken UTF-8 characters.
    #[must_use]
    pub fn valid(&self) -> &'a str {
        self.valid
    }

    /// Returns the invalid sequence that caused a failure.
    ///
    /// The returned slice will have a maximum length of 3 and starts after the
    /// substring given by [`valid`]. Decoding will resume after this sequence.
    ///
    /// If empty, this is the last chunk in the string. If non-empty, an
    /// unexpected byte was encountered or the end of the input was reached
    /// unexpectedly.
    ///
    /// Lossy decoding would replace this sequence with [`U+FFFD REPLACEMENT
    /// CHARACTER`].
    ///
    /// [`valid`]: Self::valid
    /// [`U+FFFD REPLACEMENT CHARACTER`]: std::char::REPLACEMENT_CHARACTER
    #[must_use]
    pub fn invalid(&self) -> &'a [u8] {
        self.invalid
    }
}
84 
/// Borrowed byte-slice wrapper whose `std::fmt::Debug` implementation prints
/// the bytes as a double-quoted string: valid UTF-8 is written with debug
/// escaping and broken bytes are written as `\xNN` hex escapes.
#[must_use]
pub struct Debug<'a>(&'a [u8]);
87 
88 impl fmt::Debug for Debug<'_> {
fmt(&self, f: &mut Formatter<'_>) -> fmt::Result89     fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
90         f.write_char('"')?;
91 
92         for chunk in Utf8Chunks::new(self.0) {
93             // Valid part.
94             // Here we partially parse UTF-8 again which is suboptimal.
95             {
96                 let valid = chunk.valid();
97                 let mut from = 0;
98                 for (i, c) in valid.char_indices() {
99                     let esc = c.escape_debug();
100                     // If char needs escaping, flush backlog so far and write, else skip
101                     if esc.len() != 1 {
102                         f.write_str(&valid[from..i])?;
103                         for c in esc {
104                             f.write_char(c)?;
105                         }
106                         from = i + c.len_utf8();
107                     }
108                 }
109                 f.write_str(&valid[from..])?;
110             }
111 
112             // Broken parts of string as hex escape.
113             for &b in chunk.invalid() {
114                 write!(f, "\\x{:02X}", b)?;
115             }
116         }
117 
118         f.write_char('"')
119     }
120 }
121 
/// An iterator used to decode a slice of mostly UTF-8 bytes to string slices
/// ([`&str`]) and byte slices ([`&[u8]`][byteslice]).
///
/// If you want a simple conversion from UTF-8 byte slices to string slices,
/// [`from_utf8`] is easier to use.
///
/// [byteslice]: slice
/// [`from_utf8`]: std::str::from_utf8
///
/// # Examples
///
/// This can be used to create functionality similar to
/// [`String::from_utf8_lossy`] without allocating heap memory:
///
/// ```ignore
/// use utf8::Utf8Chunks;
///
/// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) {
///     for chunk in Utf8Chunks::new(input) {
///         push(chunk.valid());
///
///         if !chunk.invalid().is_empty() {
///             push("\u{FFFD}");
///         }
///     }
/// }
/// ```
#[must_use = "iterators are lazy and do nothing unless consumed"]
#[derive(Clone)]
pub struct Utf8Chunks<'a> {
    /// The bytes that have not been decoded yet.
    source: &'a [u8],
}
154 
155 impl<'a> Utf8Chunks<'a> {
156     /// Creates a new iterator to decode the bytes.
new(bytes: &'a [u8]) -> Self157     pub fn new(bytes: &'a [u8]) -> Self {
158         Self { source: bytes }
159     }
160 
161     #[doc(hidden)]
debug(&self) -> Debug<'_>162     pub fn debug(&self) -> Debug<'_> {
163         Debug(self.source)
164     }
165 }
166 
167 impl<'a> Iterator for Utf8Chunks<'a> {
168     type Item = Utf8Chunk<'a>;
169 
next(&mut self) -> Option<Utf8Chunk<'a>>170     fn next(&mut self) -> Option<Utf8Chunk<'a>> {
171         if self.source.is_empty() {
172             return None;
173         }
174 
175         const TAG_CONT_U8: u8 = 128;
176         fn safe_get(xs: &[u8], i: usize) -> u8 {
177             *xs.get(i).unwrap_or(&0)
178         }
179 
180         let mut i = 0;
181         let mut valid_up_to = 0;
182         while i < self.source.len() {
183             // SAFETY: `i < self.source.len()` per previous line.
184             // For some reason the following are both significantly slower:
185             // while let Some(&byte) = self.source.get(i) {
186             // while let Some(byte) = self.source.get(i).copied() {
187             let byte = unsafe { *self.source.get_unchecked(i) };
188             i += 1;
189 
190             if byte < 128 {
191                 // This could be a `1 => ...` case in the match below, but for
192                 // the common case of all-ASCII inputs, we bypass loading the
193                 // sizeable UTF8_CHAR_WIDTH table into cache.
194             } else {
195                 let w = utf8_char_width(byte);
196 
197                 match w {
198                     2 => {
199                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
200                             break;
201                         }
202                         i += 1;
203                     }
204                     3 => {
205                         match (byte, safe_get(self.source, i)) {
206                             (0xE0, 0xA0..=0xBF) => (),
207                             (0xE1..=0xEC, 0x80..=0xBF) => (),
208                             (0xED, 0x80..=0x9F) => (),
209                             (0xEE..=0xEF, 0x80..=0xBF) => (),
210                             _ => break,
211                         }
212                         i += 1;
213                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
214                             break;
215                         }
216                         i += 1;
217                     }
218                     4 => {
219                         match (byte, safe_get(self.source, i)) {
220                             (0xF0, 0x90..=0xBF) => (),
221                             (0xF1..=0xF3, 0x80..=0xBF) => (),
222                             (0xF4, 0x80..=0x8F) => (),
223                             _ => break,
224                         }
225                         i += 1;
226                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
227                             break;
228                         }
229                         i += 1;
230                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
231                             break;
232                         }
233                         i += 1;
234                     }
235                     _ => break,
236                 }
237             }
238 
239             valid_up_to = i;
240         }
241 
242         /// # Safety
243         /// `index` must be in-bounds for `x`
244         unsafe fn split_at_unchecked(x: &[u8], index: usize) -> (&[u8], &[u8]) {
245             // SAFETY: in-bounds as promised by the caller
246             unsafe { (x.get_unchecked(..index), x.get_unchecked(index..)) }
247         }
248 
249         // SAFETY: `i <= self.source.len()` because it is only ever incremented
250         // via `i += 1` and in between every single one of those increments, `i`
251         // is compared against `self.source.len()`. That happens either
252         // literally by `i < self.source.len()` in the while-loop's condition,
253         // or indirectly by `safe_get(self.source, i) & 192 != TAG_CONT_U8`. The
254         // loop is terminated as soon as the latest `i += 1` has made `i` no
255         // longer less than `self.source.len()`, which means it'll be at most
256         // equal to `self.source.len()`.
257         let (inspected, remaining) = unsafe { split_at_unchecked(self.source, i) };
258         self.source = remaining;
259 
260         // SAFETY: `valid_up_to <= i` because it is only ever assigned via
261         // `valid_up_to = i` and `i` only increases.
262         let (valid, invalid) = unsafe { split_at_unchecked(inspected, valid_up_to) };
263 
264         Some(Utf8Chunk {
265             // SAFETY: All bytes up to `valid_up_to` are valid UTF-8.
266             valid: unsafe { from_utf8_unchecked(valid) },
267             invalid,
268         })
269     }
270 }
271 
272 impl FusedIterator for Utf8Chunks<'_> {}
273 
274 impl fmt::Debug for Utf8Chunks<'_> {
fmt(&self, f: &mut Formatter<'_>) -> fmt::Result275     fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
276         f.debug_struct("Utf8Chunks").field("source", &self.debug()).finish()
277     }
278 }
279 
// https://tools.ietf.org/html/rfc3629
//
// Indexed by the first byte of a character: the entry is the total length in
// bytes of the UTF-8 sequence that byte starts, or 0 when the byte cannot
// start a sequence. Each row covers one high nibble (named in the trailing
// comment) and each column one low nibble.
const UTF8_CHAR_WIDTH: &[u8; 256] = &[
    // 1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // F
];
300 
301 /// Given a first byte, determines how many bytes are in this UTF-8 character.
302 #[must_use]
303 #[inline]
utf8_char_width(b: u8) -> usize304 const fn utf8_char_width(b: u8) -> usize {
305     UTF8_CHAR_WIDTH[b as usize] as usize
306 }
307