• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use core::char;
2 use core::cmp;
3 use core::fmt;
4 use core::str;
5 #[cfg(feature = "std")]
6 use std::error;
7 
8 use crate::ascii;
9 use crate::bstr::BStr;
10 use crate::ext_slice::ByteSlice;
11 
12 // The UTF-8 decoder provided here is based on the one presented here:
13 // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
14 //
15 // We *could* have done UTF-8 decoding by using a DFA generated by `\p{any}`
16 // using regex-automata that is roughly the same size. The real benefit of
17 // Hoehrmann's formulation is that the byte class mapping below is manually
18 // tailored such that each byte's class doubles as a shift to mask out the
19 // bits necessary for constructing the leading bits of each codepoint value
20 // from the initial byte.
21 //
22 // There are some minor differences between this implementation and Hoehrmann's
23 // formulation.
24 //
25 // Firstly, we make REJECT have state ID 0, since it makes the state table
26 // itself a little easier to read and is consistent with the notion that 0
27 // means "false" or "bad."
28 //
29 // Secondly, when doing bulk decoding, we add a SIMD accelerated ASCII fast
30 // path.
31 //
32 // Thirdly, we pre-multiply the state IDs to avoid a multiplication instruction
33 // in the core decoding loop. (Which is what regex-automata would do by
34 // default.)
35 //
36 // Fourthly, we split the byte class mapping and transition table into two
37 // arrays because it's clearer.
38 //
39 // It is unlikely that this is the fastest way to do UTF-8 decoding, however,
40 // it is fairly simple.
41 
// State IDs are pre-multiplied by the number of byte classes (12), so each
// ID is directly the offset of that state's row in `STATES_FORWARD`.

/// The state the automaton starts in and is in again after every complete,
/// valid UTF-8 encoded codepoint (row 1 of the transition table).
const ACCEPT: usize = 12;
/// The dead state that is entered as soon as a byte cannot continue a valid
/// UTF-8 sequence (row 0 of the transition table, which only leads to itself).
const REJECT: usize = 0;
44 
/// Maps each of the 256 possible byte values to one of 12 equivalence
/// classes. Bytes in the same class are treated identically by the
/// transition table.
///
/// * class 0: `0x00-0x7F` (ASCII)
/// * class 1: `0x80-0x8F` (continuation byte)
/// * class 9: `0x90-0x9F` (continuation byte)
/// * class 7: `0xA0-0xBF` (continuation byte)
/// * class 8: `0xC0-0xC1` and `0xF5-0xFF` (never valid in UTF-8)
/// * class 2: `0xC2-0xDF` (leading byte of a 2-byte sequence)
/// * class 10: `0xE0`
/// * class 3: `0xE1-0xEC` and `0xEE-0xEF`
/// * class 4: `0xED`
/// * class 11: `0xF0`
/// * class 6: `0xF1-0xF3`
/// * class 5: `0xF4`
///
/// For a leading byte, the class also serves as a shift amount: the decoder
/// computes `(0xFF >> class) & byte` to extract the codepoint bits carried by
/// that byte.
///
/// SAFETY: The decode function below relies on the correctness of these
/// equivalence classes.
#[cfg_attr(rustfmt, rustfmt::skip)]
const CLASSES: [u8; 256] = [
   // 0x00-0x7F
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   // 0x80-0x9F
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
   // 0xA0-0xBF
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
   // 0xC0-0xDF
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
   // 0xE0-0xFF
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
];
58 
/// The transition table of the UTF-8 automaton, stored row by row with one
/// row per state and one column per byte class from `CLASSES` (12 columns).
///
/// State IDs are pre-multiplied by 12, so the next state is read at index
/// `state + class` and is itself the offset of a row in this table. `0` is
/// `REJECT` and `12` is `ACCEPT`.
///
/// SAFETY: The decode function below relies on the correctness of this state
/// machine.
#[cfg_attr(rustfmt, rustfmt::skip)]
const STATES_FORWARD: &'static [u8] = &[
  // State 0 (REJECT): every byte class leads back to REJECT.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // State 12 (ACCEPT): at a codepoint boundary; dispatches on the first byte.
  12, 0, 24, 36, 60, 96, 84, 0, 0, 0, 48, 72,
  // State 24: one more continuation byte (0x80-0xBF) completes the codepoint.
  0, 12, 0, 0, 0, 0, 0, 12, 0, 12, 0, 0,
  // State 36: two more continuation bytes (0x80-0xBF) are needed.
  0, 24, 0, 0, 0, 0, 0, 24, 0, 24, 0, 0,
  // State 48: entered after 0xE0; only 0xA0-0xBF may follow.
  0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0,
  // State 60: entered after 0xED; only 0x80-0x9F may follow.
  0, 24, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0,
  // State 72: entered after 0xF0; only 0x90-0xBF may follow.
  0, 0, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
  // State 84: entered after 0xF1-0xF3; any continuation byte may follow.
  0, 36, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
  // State 96: entered after 0xF4; only 0x80-0x8F may follow.
  0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
];
73 
/// An iterator that lossily decodes a byte string into Unicode scalar values.
///
/// Whenever an invalid UTF-8 byte sequence is encountered, the Unicode
/// replacement codepoint (`U+FFFD`) is yielded in its place, following the
/// ["maximal subpart" strategy](http://www.unicode.org/review/pr-121.html).
///
/// Values of this type are returned by the
/// [`chars`](trait.ByteSlice.html#method.chars) method of the
/// [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`.
#[derive(Clone, Debug)]
pub struct Chars<'a> {
    /// The bytes that have not been yielded yet.
    bs: &'a [u8],
}

impl<'a> Chars<'a> {
    /// Creates an iterator over the codepoints encoded in `bs`.
    pub(crate) fn new(bs: &'a [u8]) -> Self {
        Self { bs }
    }

    /// Returns the bytes that this iterator has not consumed yet.
    ///
    /// The returned slice borrows from the original data rather than from
    /// the iterator, so the iterator can still be advanced while the slice
    /// is alive.
    ///
    /// # Examples
    ///
    /// ```
    /// use bstr::ByteSlice;
    ///
    /// let mut chars = b"abc".chars();
    ///
    /// assert_eq!(b"abc", chars.as_bytes());
    /// chars.next();
    /// assert_eq!(b"bc", chars.as_bytes());
    /// chars.next();
    /// chars.next();
    /// assert_eq!(b"", chars.as_bytes());
    /// ```
    #[inline]
    pub fn as_bytes(&self) -> &'a [u8] {
        self.bs
    }
}
117 
118 impl<'a> Iterator for Chars<'a> {
119     type Item = char;
120 
121     #[inline]
next(&mut self) -> Option<char>122     fn next(&mut self) -> Option<char> {
123         let (ch, size) = decode_lossy(self.bs);
124         if size == 0 {
125             return None;
126         }
127         self.bs = &self.bs[size..];
128         Some(ch)
129     }
130 }
131 
132 impl<'a> DoubleEndedIterator for Chars<'a> {
133     #[inline]
next_back(&mut self) -> Option<char>134     fn next_back(&mut self) -> Option<char> {
135         let (ch, size) = decode_last_lossy(self.bs);
136         if size == 0 {
137             return None;
138         }
139         self.bs = &self.bs[..self.bs.len() - size];
140         Some(ch)
141     }
142 }
143 
/// An iterator that lossily decodes a byte string into Unicode scalar values
/// and reports the byte range each one was decoded from.
///
/// Whenever an invalid UTF-8 byte sequence is encountered, the Unicode
/// replacement codepoint (`U+FFFD`) is yielded in its place, following the
/// ["maximal subpart" strategy](http://www.unicode.org/review/pr-121.html).
///
/// This differs slightly from the `CharIndices` iterator in the standard
/// library. Besides accepting possibly invalid UTF-8, it yields both the
/// starting and the ending byte index of every codepoint. The ending index
/// is needed to slice the original byte string, because a single replacement
/// codepoint can stand in for anywhere from 1 to 3 invalid bytes (inclusive).
///
/// Values of this type are returned by the
/// [`char_indices`](trait.ByteSlice.html#method.char_indices) method of the
/// [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`.
#[derive(Clone, Debug)]
pub struct CharIndices<'a> {
    /// The bytes that have not been yielded yet.
    bs: &'a [u8],
    /// Offset in the original byte string of the first remaining byte.
    forward_index: usize,
    /// Offset in the original byte string just past the last remaining byte.
    reverse_index: usize,
}

impl<'a> CharIndices<'a> {
    /// Creates an iterator over the codepoints encoded in `bs`, with offsets
    /// reported relative to the start of `bs`.
    pub(crate) fn new(bs: &'a [u8]) -> Self {
        Self { bs, forward_index: 0, reverse_index: bs.len() }
    }

    /// Returns the bytes that this iterator has not consumed yet.
    ///
    /// The returned slice borrows from the original data rather than from
    /// the iterator, so the iterator can still be advanced while the slice
    /// is alive.
    ///
    /// # Examples
    ///
    /// ```
    /// use bstr::ByteSlice;
    ///
    /// let mut it = b"abc".char_indices();
    ///
    /// assert_eq!(b"abc", it.as_bytes());
    /// it.next();
    /// assert_eq!(b"bc", it.as_bytes());
    /// it.next();
    /// it.next();
    /// assert_eq!(b"", it.as_bytes());
    /// ```
    #[inline]
    pub fn as_bytes(&self) -> &'a [u8] {
        self.bs
    }
}
198 
199 impl<'a> Iterator for CharIndices<'a> {
200     type Item = (usize, usize, char);
201 
202     #[inline]
next(&mut self) -> Option<(usize, usize, char)>203     fn next(&mut self) -> Option<(usize, usize, char)> {
204         let index = self.forward_index;
205         let (ch, size) = decode_lossy(self.bs);
206         if size == 0 {
207             return None;
208         }
209         self.bs = &self.bs[size..];
210         self.forward_index += size;
211         Some((index, index + size, ch))
212     }
213 }
214 
215 impl<'a> DoubleEndedIterator for CharIndices<'a> {
216     #[inline]
next_back(&mut self) -> Option<(usize, usize, char)>217     fn next_back(&mut self) -> Option<(usize, usize, char)> {
218         let (ch, size) = decode_last_lossy(self.bs);
219         if size == 0 {
220             return None;
221         }
222         self.bs = &self.bs[..self.bs.len() - size];
223         self.reverse_index -= size;
224         Some((self.reverse_index, self.reverse_index + size, ch))
225     }
226 }
227 
// Marker impl: `next` and `next_back` return `None` without touching any
// state once no bytes remain, so exhaustion is permanent.
impl<'a> ::core::iter::FusedIterator for CharIndices<'a> {}
229 
/// An iterator over chunks of valid UTF-8 in a byte slice.
///
/// Each item is a [`Utf8Chunk`](struct.Utf8Chunk.html): a run of valid UTF-8
/// together with the invalid bytes (if any) that immediately follow it.
///
/// See [`utf8_chunks`](trait.ByteSlice.html#method.utf8_chunks).
#[derive(Clone, Debug)]
pub struct Utf8Chunks<'a> {
    /// The bytes that have not yet been yielded as part of a chunk.
    pub(super) bytes: &'a [u8],
}
237 
/// A chunk of valid UTF-8, possibly followed by invalid UTF-8 bytes.
///
/// This is yielded by the
/// [`Utf8Chunks`](struct.Utf8Chunks.html)
/// iterator, which can be created via the
/// [`ByteSlice::utf8_chunks`](trait.ByteSlice.html#method.utf8_chunks)
/// method.
///
/// The `'a` lifetime parameter corresponds to the lifetime of the bytes that
/// are being iterated over.
#[cfg_attr(test, derive(Debug, PartialEq))]
pub struct Utf8Chunk<'a> {
    /// A valid UTF-8 piece, at the start, end, or between invalid UTF-8 bytes.
    ///
    /// This is empty between adjacent invalid UTF-8 byte sequences.
    valid: &'a str,
    /// A sequence of invalid UTF-8 bytes that immediately follows `valid`.
    ///
    /// Can only be empty in the last chunk.
    ///
    /// Should be replaced by a single Unicode replacement character, if not
    /// empty.
    invalid: &'a BStr,
    /// Indicates whether the invalid sequence could've been valid if there
    /// were more bytes.
    ///
    /// Can only be true in the last chunk.
    incomplete: bool,
}

impl<'a> Utf8Chunk<'a> {
    /// Returns the (possibly empty) valid UTF-8 bytes in this chunk.
    ///
    /// This may be empty if there are consecutive sequences of invalid UTF-8
    /// bytes.
    #[inline]
    pub fn valid(&self) -> &'a str {
        self.valid
    }

    /// Returns the (possibly empty) invalid UTF-8 bytes in this chunk that
    /// immediately follow the valid UTF-8 bytes in this chunk.
    ///
    /// This is only empty when this chunk corresponds to the last chunk in
    /// the original bytes.
    ///
    /// The maximum length of this slice is 3. That is, invalid UTF-8 byte
    /// sequences longer than 1 byte always correspond to a valid _prefix_ of
    /// a valid UTF-8 encoded codepoint. This corresponds to the "substitution
    /// of maximal subparts" strategy that is described in more detail in the
    /// docs for the
    /// [`ByteSlice::to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy)
    /// method.
    #[inline]
    pub fn invalid(&self) -> &'a [u8] {
        self.invalid.as_bytes()
    }

    /// Returns whether the invalid sequence might still become valid if more
    /// bytes are added.
    ///
    /// Returns true if the end of the input was reached unexpectedly,
    /// without encountering an unexpected byte.
    ///
    /// This can only be the case for the last chunk.
    #[inline]
    pub fn incomplete(&self) -> bool {
        self.incomplete
    }
}
308 
309 impl<'a> Iterator for Utf8Chunks<'a> {
310     type Item = Utf8Chunk<'a>;
311 
312     #[inline]
next(&mut self) -> Option<Utf8Chunk<'a>>313     fn next(&mut self) -> Option<Utf8Chunk<'a>> {
314         if self.bytes.is_empty() {
315             return None;
316         }
317         match validate(self.bytes) {
318             Ok(()) => {
319                 let valid = self.bytes;
320                 self.bytes = &[];
321                 Some(Utf8Chunk {
322                     // SAFETY: This is safe because of the guarantees provided
323                     // by utf8::validate.
324                     valid: unsafe { str::from_utf8_unchecked(valid) },
325                     invalid: [].as_bstr(),
326                     incomplete: false,
327                 })
328             }
329             Err(e) => {
330                 let (valid, rest) = self.bytes.split_at(e.valid_up_to());
331                 // SAFETY: This is safe because of the guarantees provided by
332                 // utf8::validate.
333                 let valid = unsafe { str::from_utf8_unchecked(valid) };
334                 let (invalid_len, incomplete) = match e.error_len() {
335                     Some(n) => (n, false),
336                     None => (rest.len(), true),
337                 };
338                 let (invalid, rest) = rest.split_at(invalid_len);
339                 self.bytes = rest;
340                 Some(Utf8Chunk {
341                     valid,
342                     invalid: invalid.as_bstr(),
343                     incomplete,
344                 })
345             }
346         }
347     }
348 
349     #[inline]
size_hint(&self) -> (usize, Option<usize>)350     fn size_hint(&self) -> (usize, Option<usize>) {
351         if self.bytes.is_empty() {
352             (0, Some(0))
353         } else {
354             (1, Some(self.bytes.len()))
355         }
356     }
357 }
358 
// Marker impl: `next` returns `None` whenever no bytes remain and never
// refills them, so exhaustion is permanent.
impl<'a> ::core::iter::FusedIterator for Utf8Chunks<'a> {}
360 
/// An error that occurs when UTF-8 decoding fails.
///
/// This error occurs when attempting to convert a non-UTF-8 byte
/// string to a Rust string that must be valid UTF-8. For example,
/// [`to_str`](trait.ByteSlice.html#method.to_str) is one such method.
///
/// # Example
///
/// This example shows what happens when a given byte sequence is invalid,
/// but ends with a sequence that is a possible prefix of valid UTF-8.
///
/// ```
/// use bstr::{B, ByteSlice};
///
/// let s = B(b"foobar\xF1\x80\x80");
/// let err = s.to_str().unwrap_err();
/// assert_eq!(err.valid_up_to(), 6);
/// assert_eq!(err.error_len(), None);
/// ```
///
/// This example shows what happens when a given byte sequence contains
/// invalid UTF-8.
///
/// ```
/// use bstr::ByteSlice;
///
/// let s = b"foobar\xF1\x80\x80quux";
/// let err = s.to_str().unwrap_err();
/// assert_eq!(err.valid_up_to(), 6);
/// // The error length reports the maximum number of bytes that correspond to
/// // a valid prefix of a UTF-8 encoded codepoint.
/// assert_eq!(err.error_len(), Some(3));
///
/// // In contrast to the above which contains a single invalid prefix,
/// // consider the case of multiple individual bytes that are never valid
/// // prefixes. Note how the value of error_len changes!
/// let s = b"foobar\xFF\xFFquux";
/// let err = s.to_str().unwrap_err();
/// assert_eq!(err.valid_up_to(), 6);
/// assert_eq!(err.error_len(), Some(1));
///
/// // The fact that it's an invalid prefix does not change error_len even
/// // when it immediately precedes the end of the string.
/// let s = b"foobar\xFF";
/// let err = s.to_str().unwrap_err();
/// assert_eq!(err.valid_up_to(), 6);
/// assert_eq!(err.error_len(), Some(1));
/// ```
#[derive(Debug, Eq, PartialEq)]
pub struct Utf8Error {
    /// Byte offset just past the last byte known to be part of valid UTF-8.
    valid_up_to: usize,
    /// Number of invalid bytes found at `valid_up_to`, or `None` when the
    /// input ended before a codepoint could be completed.
    error_len: Option<usize>,
}
414 
impl Utf8Error {
    /// Returns the byte index of the position immediately following the last
    /// valid UTF-8 byte.
    ///
    /// # Example
    ///
    /// This example shows how `valid_up_to` can be used to retrieve a
    /// possibly empty prefix that is guaranteed to be valid UTF-8:
    ///
    /// ```
    /// use bstr::ByteSlice;
    ///
    /// let s = b"foobar\xF1\x80\x80quux";
    /// let err = s.to_str().unwrap_err();
    ///
    /// // This is guaranteed to never panic.
    /// let string = s[..err.valid_up_to()].to_str().unwrap();
    /// assert_eq!(string, "foobar");
    /// ```
    #[inline]
    pub fn valid_up_to(&self) -> usize {
        self.valid_up_to
    }

    /// Returns the total number of invalid UTF-8 bytes immediately following
    /// the position returned by `valid_up_to`. This value is always at least
    /// `1`, but can be up to `3` if the bytes form a valid prefix of some
    /// UTF-8 encoded codepoint.
    ///
    /// If the end of the original input was found before a valid UTF-8 encoded
    /// codepoint could be completed, then this returns `None`. This is useful
    /// when processing streams, where a `None` value signals that more input
    /// might be needed.
    #[inline]
    pub fn error_len(&self) -> Option<usize> {
        self.error_len
    }
}
453 
// Only available with the `std` feature, because the `error` module is
// imported from `std` under that same feature gate.
#[cfg(feature = "std")]
impl error::Error for Utf8Error {
    // NOTE(review): `Error::description` is deprecated in the standard
    // library in favor of the `Display` impl. It is presumably kept so that
    // existing callers still get this fixed message; confirm before removing.
    fn description(&self) -> &str {
        "invalid UTF-8"
    }
}
460 
461 impl fmt::Display for Utf8Error {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result462     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
463         write!(f, "invalid UTF-8 found at byte offset {}", self.valid_up_to)
464     }
465 }
466 
467 /// Returns OK if and only if the given slice is completely valid UTF-8.
468 ///
469 /// If the slice isn't valid UTF-8, then an error is returned that explains
470 /// the first location at which invalid UTF-8 was detected.
validate(slice: &[u8]) -> Result<(), Utf8Error>471 pub fn validate(slice: &[u8]) -> Result<(), Utf8Error> {
472     // The fast path for validating UTF-8. It steps through a UTF-8 automaton
473     // and uses a SIMD accelerated ASCII fast path on x86_64. If an error is
474     // detected, it backs up and runs the slower version of the UTF-8 automaton
475     // to determine correct error information.
476     fn fast(slice: &[u8]) -> Result<(), Utf8Error> {
477         let mut state = ACCEPT;
478         let mut i = 0;
479 
480         while i < slice.len() {
481             let b = slice[i];
482 
483             // ASCII fast path. If we see two consecutive ASCII bytes, then try
484             // to validate as much ASCII as possible very quickly.
485             if state == ACCEPT
486                 && b <= 0x7F
487                 && slice.get(i + 1).map_or(false, |&b| b <= 0x7F)
488             {
489                 i += ascii::first_non_ascii_byte(&slice[i..]);
490                 continue;
491             }
492 
493             state = step(state, b);
494             if state == REJECT {
495                 return Err(find_valid_up_to(slice, i));
496             }
497             i += 1;
498         }
499         if state != ACCEPT {
500             Err(find_valid_up_to(slice, slice.len()))
501         } else {
502             Ok(())
503         }
504     }
505 
506     // Given the first position at which a UTF-8 sequence was determined to be
507     // invalid, return an error that correctly reports the position at which
508     // the last complete UTF-8 sequence ends.
509     #[inline(never)]
510     fn find_valid_up_to(slice: &[u8], rejected_at: usize) -> Utf8Error {
511         // In order to find the last valid byte, we need to back up an amount
512         // that guarantees every preceding byte is part of a valid UTF-8
513         // code unit sequence. To do this, we simply locate the last leading
514         // byte that occurs before rejected_at.
515         let mut backup = rejected_at.saturating_sub(1);
516         while backup > 0 && !is_leading_or_invalid_utf8_byte(slice[backup]) {
517             backup -= 1;
518         }
519         let upto = cmp::min(slice.len(), rejected_at.saturating_add(1));
520         let mut err = slow(&slice[backup..upto]).unwrap_err();
521         err.valid_up_to += backup;
522         err
523     }
524 
525     // Like top-level UTF-8 decoding, except it correctly reports a UTF-8 error
526     // when an invalid sequence is found. This is split out from validate so
527     // that the fast path doesn't need to keep track of the position of the
528     // last valid UTF-8 byte. In particular, tracking this requires checking
529     // for an ACCEPT state on each byte, which degrades throughput pretty
530     // badly.
531     fn slow(slice: &[u8]) -> Result<(), Utf8Error> {
532         let mut state = ACCEPT;
533         let mut valid_up_to = 0;
534         for (i, &b) in slice.iter().enumerate() {
535             state = step(state, b);
536             if state == ACCEPT {
537                 valid_up_to = i + 1;
538             } else if state == REJECT {
539                 // Our error length must always be at least 1.
540                 let error_len = Some(cmp::max(1, i - valid_up_to));
541                 return Err(Utf8Error { valid_up_to, error_len });
542             }
543         }
544         if state != ACCEPT {
545             Err(Utf8Error { valid_up_to, error_len: None })
546         } else {
547             Ok(())
548         }
549     }
550 
551     // Advance to the next state given the current state and current byte.
552     fn step(state: usize, b: u8) -> usize {
553         let class = CLASSES[b as usize];
554         // SAFETY: This is safe because 'class' is always <=11 and 'state' is
555         // always <=96. Therefore, the maximal index is 96+11 = 107, where
556         // STATES_FORWARD.len() = 108 such that every index is guaranteed to be
557         // valid by construction of the state machine and the byte equivalence
558         // classes.
559         unsafe {
560             *STATES_FORWARD.get_unchecked(state + class as usize) as usize
561         }
562     }
563 
564     fast(slice)
565 }
566 
567 /// UTF-8 decode a single Unicode scalar value from the beginning of a slice.
568 ///
569 /// When successful, the corresponding Unicode scalar value is returned along
570 /// with the number of bytes it was encoded with. The number of bytes consumed
571 /// for a successful decode is always between 1 and 4, inclusive.
572 ///
573 /// When unsuccessful, `None` is returned along with the number of bytes that
574 /// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case,
575 /// the number of bytes consumed is always between 0 and 3, inclusive, where
576 /// 0 is only returned when `slice` is empty.
577 ///
578 /// # Examples
579 ///
580 /// Basic usage:
581 ///
582 /// ```
583 /// use bstr::decode_utf8;
584 ///
585 /// // Decoding a valid codepoint.
586 /// let (ch, size) = decode_utf8(b"\xE2\x98\x83");
587 /// assert_eq!(Some('☃'), ch);
588 /// assert_eq!(3, size);
589 ///
590 /// // Decoding an incomplete codepoint.
591 /// let (ch, size) = decode_utf8(b"\xE2\x98");
592 /// assert_eq!(None, ch);
593 /// assert_eq!(2, size);
594 /// ```
595 ///
596 /// This example shows how to iterate over all codepoints in UTF-8 encoded
597 /// bytes, while replacing invalid UTF-8 sequences with the replacement
598 /// codepoint:
599 ///
600 /// ```
601 /// use bstr::{B, decode_utf8};
602 ///
603 /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
604 /// let mut chars = vec![];
605 /// while !bytes.is_empty() {
606 ///     let (ch, size) = decode_utf8(bytes);
607 ///     bytes = &bytes[size..];
608 ///     chars.push(ch.unwrap_or('\u{FFFD}'));
609 /// }
610 /// assert_eq!(vec!['☃', '\u{FFFD}', '��', '\u{FFFD}', 'a'], chars);
611 /// ```
612 #[inline]
decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize)613 pub fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
614     let slice = slice.as_ref();
615     match slice.get(0) {
616         None => return (None, 0),
617         Some(&b) if b <= 0x7F => return (Some(b as char), 1),
618         _ => {}
619     }
620 
621     let (mut state, mut cp, mut i) = (ACCEPT, 0, 0);
622     while i < slice.len() {
623         decode_step(&mut state, &mut cp, slice[i]);
624         i += 1;
625 
626         if state == ACCEPT {
627             // SAFETY: This is safe because `decode_step` guarantees that
628             // `cp` is a valid Unicode scalar value in an ACCEPT state.
629             let ch = unsafe { char::from_u32_unchecked(cp) };
630             return (Some(ch), i);
631         } else if state == REJECT {
632             // At this point, we always want to advance at least one byte.
633             return (None, cmp::max(1, i.saturating_sub(1)));
634         }
635     }
636     (None, i)
637 }
638 
639 /// Lossily UTF-8 decode a single Unicode scalar value from the beginning of a
640 /// slice.
641 ///
642 /// When successful, the corresponding Unicode scalar value is returned along
643 /// with the number of bytes it was encoded with. The number of bytes consumed
644 /// for a successful decode is always between 1 and 4, inclusive.
645 ///
646 /// When unsuccessful, the Unicode replacement codepoint (`U+FFFD`) is returned
647 /// along with the number of bytes that make up a maximal prefix of a valid
648 /// UTF-8 code unit sequence. In this case, the number of bytes consumed is
649 /// always between 0 and 3, inclusive, where 0 is only returned when `slice` is
650 /// empty.
651 ///
652 /// # Examples
653 ///
654 /// Basic usage:
655 ///
656 /// ```ignore
657 /// use bstr::decode_utf8_lossy;
658 ///
659 /// // Decoding a valid codepoint.
660 /// let (ch, size) = decode_utf8_lossy(b"\xE2\x98\x83");
661 /// assert_eq!('☃', ch);
662 /// assert_eq!(3, size);
663 ///
664 /// // Decoding an incomplete codepoint.
665 /// let (ch, size) = decode_utf8_lossy(b"\xE2\x98");
666 /// assert_eq!('\u{FFFD}', ch);
667 /// assert_eq!(2, size);
668 /// ```
669 ///
670 /// This example shows how to iterate over all codepoints in UTF-8 encoded
671 /// bytes, while replacing invalid UTF-8 sequences with the replacement
672 /// codepoint:
673 ///
674 /// ```ignore
675 /// use bstr::{B, decode_utf8_lossy};
676 ///
677 /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
678 /// let mut chars = vec![];
679 /// while !bytes.is_empty() {
680 ///     let (ch, size) = decode_utf8_lossy(bytes);
681 ///     bytes = &bytes[size..];
682 ///     chars.push(ch);
683 /// }
684 /// assert_eq!(vec!['☃', '\u{FFFD}', '��', '\u{FFFD}', 'a'], chars);
685 /// ```
686 #[inline]
decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize)687 pub fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
688     match decode(slice) {
689         (Some(ch), size) => (ch, size),
690         (None, size) => ('\u{FFFD}', size),
691     }
692 }
693 
694 /// UTF-8 decode a single Unicode scalar value from the end of a slice.
695 ///
696 /// When successful, the corresponding Unicode scalar value is returned along
697 /// with the number of bytes it was encoded with. The number of bytes consumed
698 /// for a successful decode is always between 1 and 4, inclusive.
699 ///
700 /// When unsuccessful, `None` is returned along with the number of bytes that
701 /// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case,
702 /// the number of bytes consumed is always between 0 and 3, inclusive, where
703 /// 0 is only returned when `slice` is empty.
704 ///
705 /// # Examples
706 ///
707 /// Basic usage:
708 ///
709 /// ```
710 /// use bstr::decode_last_utf8;
711 ///
712 /// // Decoding a valid codepoint.
713 /// let (ch, size) = decode_last_utf8(b"\xE2\x98\x83");
714 /// assert_eq!(Some('☃'), ch);
715 /// assert_eq!(3, size);
716 ///
717 /// // Decoding an incomplete codepoint.
718 /// let (ch, size) = decode_last_utf8(b"\xE2\x98");
719 /// assert_eq!(None, ch);
720 /// assert_eq!(2, size);
721 /// ```
722 ///
723 /// This example shows how to iterate over all codepoints in UTF-8 encoded
724 /// bytes in reverse, while replacing invalid UTF-8 sequences with the
725 /// replacement codepoint:
726 ///
727 /// ```
728 /// use bstr::{B, decode_last_utf8};
729 ///
730 /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
731 /// let mut chars = vec![];
732 /// while !bytes.is_empty() {
733 ///     let (ch, size) = decode_last_utf8(bytes);
734 ///     bytes = &bytes[..bytes.len()-size];
735 ///     chars.push(ch.unwrap_or('\u{FFFD}'));
736 /// }
737 /// assert_eq!(vec!['a', '\u{FFFD}', '��', '\u{FFFD}', '☃'], chars);
738 /// ```
739 #[inline]
decode_last<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize)740 pub fn decode_last<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
741     // TODO: We could implement this by reversing the UTF-8 automaton, but for
742     // now, we do it the slow way by using the forward automaton.
743 
744     let slice = slice.as_ref();
745     if slice.is_empty() {
746         return (None, 0);
747     }
748     let mut start = slice.len() - 1;
749     let limit = slice.len().saturating_sub(4);
750     while start > limit && !is_leading_or_invalid_utf8_byte(slice[start]) {
751         start -= 1;
752     }
753     let (ch, size) = decode(&slice[start..]);
754     // If we didn't consume all of the bytes, then that means there's at least
755     // one stray byte that never occurs in a valid code unit prefix, so we can
756     // advance by one byte.
757     if start + size != slice.len() {
758         (None, 1)
759     } else {
760         (ch, size)
761     }
762 }
763 
764 /// Lossily UTF-8 decode a single Unicode scalar value from the end of a slice.
765 ///
766 /// When successful, the corresponding Unicode scalar value is returned along
767 /// with the number of bytes it was encoded with. The number of bytes consumed
768 /// for a successful decode is always between 1 and 4, inclusive.
769 ///
770 /// When unsuccessful, the Unicode replacement codepoint (`U+FFFD`) is returned
771 /// along with the number of bytes that make up a maximal prefix of a valid
772 /// UTF-8 code unit sequence. In this case, the number of bytes consumed is
773 /// always between 0 and 3, inclusive, where 0 is only returned when `slice` is
774 /// empty.
775 ///
776 /// # Examples
777 ///
778 /// Basic usage:
779 ///
780 /// ```ignore
781 /// use bstr::decode_last_utf8_lossy;
782 ///
783 /// // Decoding a valid codepoint.
784 /// let (ch, size) = decode_last_utf8_lossy(b"\xE2\x98\x83");
785 /// assert_eq!('☃', ch);
786 /// assert_eq!(3, size);
787 ///
788 /// // Decoding an incomplete codepoint.
789 /// let (ch, size) = decode_last_utf8_lossy(b"\xE2\x98");
790 /// assert_eq!('\u{FFFD}', ch);
791 /// assert_eq!(2, size);
792 /// ```
793 ///
794 /// This example shows how to iterate over all codepoints in UTF-8 encoded
795 /// bytes in reverse, while replacing invalid UTF-8 sequences with the
796 /// replacement codepoint:
797 ///
798 /// ```ignore
799 /// use bstr::decode_last_utf8_lossy;
800 ///
801 /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
802 /// let mut chars = vec![];
803 /// while !bytes.is_empty() {
804 ///     let (ch, size) = decode_last_utf8_lossy(bytes);
805 ///     bytes = &bytes[..bytes.len()-size];
806 ///     chars.push(ch);
807 /// }
808 /// assert_eq!(vec!['a', '\u{FFFD}', '��', '\u{FFFD}', '☃'], chars);
809 /// ```
810 #[inline]
decode_last_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize)811 pub fn decode_last_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
812     match decode_last(slice) {
813         (Some(ch), size) => (ch, size),
814         (None, size) => ('\u{FFFD}', size),
815     }
816 }
817 
818 /// SAFETY: The decode function relies on state being equal to ACCEPT only if
819 /// cp is a valid Unicode scalar value.
820 #[inline]
decode_step(state: &mut usize, cp: &mut u32, b: u8)821 pub fn decode_step(state: &mut usize, cp: &mut u32, b: u8) {
822     let class = CLASSES[b as usize];
823     if *state == ACCEPT {
824         *cp = (0xFF >> class) & (b as u32);
825     } else {
826         *cp = (b as u32 & 0b111111) | (*cp << 6);
827     }
828     *state = STATES_FORWARD[*state + class as usize] as usize;
829 }
830 
/// Reports whether `b` could begin a UTF-8 sequence or is a byte that is
/// never part of valid UTF-8 at all.
///
/// Equivalently, this returns false exactly for bytes of the form
/// `10xxxxxx` (`\x80` through `\xBF`).
fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
    // Only the two most significant bits matter:
    //
    //     0x :: ASCII, i.e. a complete 1-byte sequence
    //     11 :: the leading byte of a 2/3/4-byte sequence, or one of the
    //           bytes that never occur in valid UTF-8 (\xC0, \xC1 and
    //           \xF5 through \xFF), all of which have both top bits set
    //     10 :: everything else, which is the only case rejected here
    (b >> 6) != 0b10
}
856 
857 #[cfg(test)]
858 mod tests {
859     use std::char;
860 
861     use crate::ext_slice::{ByteSlice, B};
862     use crate::tests::LOSSY_TESTS;
863     use crate::utf8::{self, Utf8Error};
864 
utf8e(valid_up_to: usize) -> Utf8Error865     fn utf8e(valid_up_to: usize) -> Utf8Error {
866         Utf8Error { valid_up_to, error_len: None }
867     }
868 
utf8e2(valid_up_to: usize, error_len: usize) -> Utf8Error869     fn utf8e2(valid_up_to: usize, error_len: usize) -> Utf8Error {
870         Utf8Error { valid_up_to, error_len: Some(error_len) }
871     }
872 
873     #[test]
validate_all_codepoints()874     fn validate_all_codepoints() {
875         for i in 0..(0x10FFFF + 1) {
876             let cp = match char::from_u32(i) {
877                 None => continue,
878                 Some(cp) => cp,
879             };
880             let mut buf = [0; 4];
881             let s = cp.encode_utf8(&mut buf);
882             assert_eq!(Ok(()), utf8::validate(s.as_bytes()));
883         }
884     }
885 
886     #[test]
validate_multiple_codepoints()887     fn validate_multiple_codepoints() {
888         assert_eq!(Ok(()), utf8::validate(b"abc"));
889         assert_eq!(Ok(()), utf8::validate(b"a\xE2\x98\x83a"));
890         assert_eq!(Ok(()), utf8::validate(b"a\xF0\x9D\x9C\xB7a"));
891         assert_eq!(Ok(()), utf8::validate(b"\xE2\x98\x83\xF0\x9D\x9C\xB7",));
892         assert_eq!(
893             Ok(()),
894             utf8::validate(b"a\xE2\x98\x83a\xF0\x9D\x9C\xB7a",)
895         );
896         assert_eq!(
897             Ok(()),
898             utf8::validate(b"\xEF\xBF\xBD\xE2\x98\x83\xEF\xBF\xBD",)
899         );
900     }
901 
902     #[test]
validate_errors()903     fn validate_errors() {
904         // single invalid byte
905         assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xFF"));
906         // single invalid byte after ASCII
907         assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xFF"));
908         // single invalid byte after 2 byte sequence
909         assert_eq!(Err(utf8e2(2, 1)), utf8::validate(b"\xCE\xB2\xFF"));
910         // single invalid byte after 3 byte sequence
911         assert_eq!(Err(utf8e2(3, 1)), utf8::validate(b"\xE2\x98\x83\xFF"));
912         // single invalid byte after 4 byte sequence
913         assert_eq!(Err(utf8e2(4, 1)), utf8::validate(b"\xF0\x9D\x9D\xB1\xFF"));
914 
915         // An invalid 2-byte sequence with a valid 1-byte prefix.
916         assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xCE\xF0"));
917         // An invalid 3-byte sequence with a valid 2-byte prefix.
918         assert_eq!(Err(utf8e2(0, 2)), utf8::validate(b"\xE2\x98\xF0"));
919         // An invalid 4-byte sequence with a valid 3-byte prefix.
920         assert_eq!(Err(utf8e2(0, 3)), utf8::validate(b"\xF0\x9D\x9D\xF0"));
921 
922         // An overlong sequence. Should be \xE2\x82\xAC, but we encode the
923         // same codepoint value in 4 bytes. This not only tests that we reject
924         // overlong sequences, but that we get valid_up_to correct.
925         assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xF0\x82\x82\xAC"));
926         assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xF0\x82\x82\xAC"));
927         assert_eq!(
928             Err(utf8e2(3, 1)),
929             utf8::validate(b"\xE2\x98\x83\xF0\x82\x82\xAC",)
930         );
931 
932         // Check that encoding a surrogate codepoint using the UTF-8 scheme
933         // fails validation.
934         assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xED\xA0\x80"));
935         assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xED\xA0\x80"));
936         assert_eq!(
937             Err(utf8e2(3, 1)),
938             utf8::validate(b"\xE2\x98\x83\xED\xA0\x80",)
939         );
940 
941         // Check that an incomplete 2-byte sequence fails.
942         assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xCEa"));
943         assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xCEa"));
944         assert_eq!(
945             Err(utf8e2(3, 1)),
946             utf8::validate(b"\xE2\x98\x83\xCE\xE2\x98\x83",)
947         );
948         // Check that an incomplete 3-byte sequence fails.
949         assert_eq!(Err(utf8e2(0, 2)), utf8::validate(b"\xE2\x98a"));
950         assert_eq!(Err(utf8e2(1, 2)), utf8::validate(b"a\xE2\x98a"));
951         assert_eq!(
952             Err(utf8e2(3, 2)),
953             utf8::validate(b"\xE2\x98\x83\xE2\x98\xE2\x98\x83",)
954         );
955         // Check that an incomplete 4-byte sequence fails.
956         assert_eq!(Err(utf8e2(0, 3)), utf8::validate(b"\xF0\x9D\x9Ca"));
957         assert_eq!(Err(utf8e2(1, 3)), utf8::validate(b"a\xF0\x9D\x9Ca"));
958         assert_eq!(
959             Err(utf8e2(4, 3)),
960             utf8::validate(b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C\xE2\x98\x83",)
961         );
962         assert_eq!(
963             Err(utf8e2(6, 3)),
964             utf8::validate(b"foobar\xF1\x80\x80quux",)
965         );
966 
967         // Check that an incomplete (EOF) 2-byte sequence fails.
968         assert_eq!(Err(utf8e(0)), utf8::validate(b"\xCE"));
969         assert_eq!(Err(utf8e(1)), utf8::validate(b"a\xCE"));
970         assert_eq!(Err(utf8e(3)), utf8::validate(b"\xE2\x98\x83\xCE"));
971         // Check that an incomplete (EOF) 3-byte sequence fails.
972         assert_eq!(Err(utf8e(0)), utf8::validate(b"\xE2\x98"));
973         assert_eq!(Err(utf8e(1)), utf8::validate(b"a\xE2\x98"));
974         assert_eq!(Err(utf8e(3)), utf8::validate(b"\xE2\x98\x83\xE2\x98"));
975         // Check that an incomplete (EOF) 4-byte sequence fails.
976         assert_eq!(Err(utf8e(0)), utf8::validate(b"\xF0\x9D\x9C"));
977         assert_eq!(Err(utf8e(1)), utf8::validate(b"a\xF0\x9D\x9C"));
978         assert_eq!(
979             Err(utf8e(4)),
980             utf8::validate(b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C",)
981         );
982 
983         // Test that we errors correct even after long valid sequences. This
984         // checks that our "backup" logic for detecting errors is correct.
985         assert_eq!(
986             Err(utf8e2(8, 1)),
987             utf8::validate(b"\xe2\x98\x83\xce\xb2\xe3\x83\x84\xFF",)
988         );
989     }
990 
991     #[test]
decode_valid()992     fn decode_valid() {
993         fn d(mut s: &str) -> Vec<char> {
994             let mut chars = vec![];
995             while !s.is_empty() {
996                 let (ch, size) = utf8::decode(s.as_bytes());
997                 s = &s[size..];
998                 chars.push(ch.unwrap());
999             }
1000             chars
1001         }
1002 
1003         assert_eq!(vec!['☃'], d("☃"));
1004         assert_eq!(vec!['☃', '☃'], d("☃☃"));
1005         assert_eq!(vec!['α', 'β', 'γ', 'δ', 'ε'], d("αβγδε"));
1006         assert_eq!(vec!['☃', '⛄', '⛇'], d("☃⛄⛇"));
1007         assert_eq!(vec!['��', '��', '��', '��', '��'], d("����������"));
1008     }
1009 
1010     #[test]
decode_invalid()1011     fn decode_invalid() {
1012         let (ch, size) = utf8::decode(b"");
1013         assert_eq!(None, ch);
1014         assert_eq!(0, size);
1015 
1016         let (ch, size) = utf8::decode(b"\xFF");
1017         assert_eq!(None, ch);
1018         assert_eq!(1, size);
1019 
1020         let (ch, size) = utf8::decode(b"\xCE\xF0");
1021         assert_eq!(None, ch);
1022         assert_eq!(1, size);
1023 
1024         let (ch, size) = utf8::decode(b"\xE2\x98\xF0");
1025         assert_eq!(None, ch);
1026         assert_eq!(2, size);
1027 
1028         let (ch, size) = utf8::decode(b"\xF0\x9D\x9D");
1029         assert_eq!(None, ch);
1030         assert_eq!(3, size);
1031 
1032         let (ch, size) = utf8::decode(b"\xF0\x9D\x9D\xF0");
1033         assert_eq!(None, ch);
1034         assert_eq!(3, size);
1035 
1036         let (ch, size) = utf8::decode(b"\xF0\x82\x82\xAC");
1037         assert_eq!(None, ch);
1038         assert_eq!(1, size);
1039 
1040         let (ch, size) = utf8::decode(b"\xED\xA0\x80");
1041         assert_eq!(None, ch);
1042         assert_eq!(1, size);
1043 
1044         let (ch, size) = utf8::decode(b"\xCEa");
1045         assert_eq!(None, ch);
1046         assert_eq!(1, size);
1047 
1048         let (ch, size) = utf8::decode(b"\xE2\x98a");
1049         assert_eq!(None, ch);
1050         assert_eq!(2, size);
1051 
1052         let (ch, size) = utf8::decode(b"\xF0\x9D\x9Ca");
1053         assert_eq!(None, ch);
1054         assert_eq!(3, size);
1055     }
1056 
1057     #[test]
decode_lossy()1058     fn decode_lossy() {
1059         let (ch, size) = utf8::decode_lossy(b"");
1060         assert_eq!('\u{FFFD}', ch);
1061         assert_eq!(0, size);
1062 
1063         let (ch, size) = utf8::decode_lossy(b"\xFF");
1064         assert_eq!('\u{FFFD}', ch);
1065         assert_eq!(1, size);
1066 
1067         let (ch, size) = utf8::decode_lossy(b"\xCE\xF0");
1068         assert_eq!('\u{FFFD}', ch);
1069         assert_eq!(1, size);
1070 
1071         let (ch, size) = utf8::decode_lossy(b"\xE2\x98\xF0");
1072         assert_eq!('\u{FFFD}', ch);
1073         assert_eq!(2, size);
1074 
1075         let (ch, size) = utf8::decode_lossy(b"\xF0\x9D\x9D\xF0");
1076         assert_eq!('\u{FFFD}', ch);
1077         assert_eq!(3, size);
1078 
1079         let (ch, size) = utf8::decode_lossy(b"\xF0\x82\x82\xAC");
1080         assert_eq!('\u{FFFD}', ch);
1081         assert_eq!(1, size);
1082 
1083         let (ch, size) = utf8::decode_lossy(b"\xED\xA0\x80");
1084         assert_eq!('\u{FFFD}', ch);
1085         assert_eq!(1, size);
1086 
1087         let (ch, size) = utf8::decode_lossy(b"\xCEa");
1088         assert_eq!('\u{FFFD}', ch);
1089         assert_eq!(1, size);
1090 
1091         let (ch, size) = utf8::decode_lossy(b"\xE2\x98a");
1092         assert_eq!('\u{FFFD}', ch);
1093         assert_eq!(2, size);
1094 
1095         let (ch, size) = utf8::decode_lossy(b"\xF0\x9D\x9Ca");
1096         assert_eq!('\u{FFFD}', ch);
1097         assert_eq!(3, size);
1098     }
1099 
1100     #[test]
decode_last_valid()1101     fn decode_last_valid() {
1102         fn d(mut s: &str) -> Vec<char> {
1103             let mut chars = vec![];
1104             while !s.is_empty() {
1105                 let (ch, size) = utf8::decode_last(s.as_bytes());
1106                 s = &s[..s.len() - size];
1107                 chars.push(ch.unwrap());
1108             }
1109             chars
1110         }
1111 
1112         assert_eq!(vec!['☃'], d("☃"));
1113         assert_eq!(vec!['☃', '☃'], d("☃☃"));
1114         assert_eq!(vec!['ε', 'δ', 'γ', 'β', 'α'], d("αβγδε"));
1115         assert_eq!(vec!['⛇', '⛄', '☃'], d("☃⛄⛇"));
1116         assert_eq!(vec!['��', '��', '��', '��', '��'], d("����������"));
1117     }
1118 
1119     #[test]
decode_last_invalid()1120     fn decode_last_invalid() {
1121         let (ch, size) = utf8::decode_last(b"");
1122         assert_eq!(None, ch);
1123         assert_eq!(0, size);
1124 
1125         let (ch, size) = utf8::decode_last(b"\xFF");
1126         assert_eq!(None, ch);
1127         assert_eq!(1, size);
1128 
1129         let (ch, size) = utf8::decode_last(b"\xCE\xF0");
1130         assert_eq!(None, ch);
1131         assert_eq!(1, size);
1132 
1133         let (ch, size) = utf8::decode_last(b"\xCE");
1134         assert_eq!(None, ch);
1135         assert_eq!(1, size);
1136 
1137         let (ch, size) = utf8::decode_last(b"\xE2\x98\xF0");
1138         assert_eq!(None, ch);
1139         assert_eq!(1, size);
1140 
1141         let (ch, size) = utf8::decode_last(b"\xE2\x98");
1142         assert_eq!(None, ch);
1143         assert_eq!(2, size);
1144 
1145         let (ch, size) = utf8::decode_last(b"\xF0\x9D\x9D\xF0");
1146         assert_eq!(None, ch);
1147         assert_eq!(1, size);
1148 
1149         let (ch, size) = utf8::decode_last(b"\xF0\x9D\x9D");
1150         assert_eq!(None, ch);
1151         assert_eq!(3, size);
1152 
1153         let (ch, size) = utf8::decode_last(b"\xF0\x82\x82\xAC");
1154         assert_eq!(None, ch);
1155         assert_eq!(1, size);
1156 
1157         let (ch, size) = utf8::decode_last(b"\xED\xA0\x80");
1158         assert_eq!(None, ch);
1159         assert_eq!(1, size);
1160 
1161         let (ch, size) = utf8::decode_last(b"\xED\xA0");
1162         assert_eq!(None, ch);
1163         assert_eq!(1, size);
1164 
1165         let (ch, size) = utf8::decode_last(b"\xED");
1166         assert_eq!(None, ch);
1167         assert_eq!(1, size);
1168 
1169         let (ch, size) = utf8::decode_last(b"a\xCE");
1170         assert_eq!(None, ch);
1171         assert_eq!(1, size);
1172 
1173         let (ch, size) = utf8::decode_last(b"a\xE2\x98");
1174         assert_eq!(None, ch);
1175         assert_eq!(2, size);
1176 
1177         let (ch, size) = utf8::decode_last(b"a\xF0\x9D\x9C");
1178         assert_eq!(None, ch);
1179         assert_eq!(3, size);
1180     }
1181 
1182     #[test]
decode_last_lossy()1183     fn decode_last_lossy() {
1184         let (ch, size) = utf8::decode_last_lossy(b"");
1185         assert_eq!('\u{FFFD}', ch);
1186         assert_eq!(0, size);
1187 
1188         let (ch, size) = utf8::decode_last_lossy(b"\xFF");
1189         assert_eq!('\u{FFFD}', ch);
1190         assert_eq!(1, size);
1191 
1192         let (ch, size) = utf8::decode_last_lossy(b"\xCE\xF0");
1193         assert_eq!('\u{FFFD}', ch);
1194         assert_eq!(1, size);
1195 
1196         let (ch, size) = utf8::decode_last_lossy(b"\xCE");
1197         assert_eq!('\u{FFFD}', ch);
1198         assert_eq!(1, size);
1199 
1200         let (ch, size) = utf8::decode_last_lossy(b"\xE2\x98\xF0");
1201         assert_eq!('\u{FFFD}', ch);
1202         assert_eq!(1, size);
1203 
1204         let (ch, size) = utf8::decode_last_lossy(b"\xE2\x98");
1205         assert_eq!('\u{FFFD}', ch);
1206         assert_eq!(2, size);
1207 
1208         let (ch, size) = utf8::decode_last_lossy(b"\xF0\x9D\x9D\xF0");
1209         assert_eq!('\u{FFFD}', ch);
1210         assert_eq!(1, size);
1211 
1212         let (ch, size) = utf8::decode_last_lossy(b"\xF0\x9D\x9D");
1213         assert_eq!('\u{FFFD}', ch);
1214         assert_eq!(3, size);
1215 
1216         let (ch, size) = utf8::decode_last_lossy(b"\xF0\x82\x82\xAC");
1217         assert_eq!('\u{FFFD}', ch);
1218         assert_eq!(1, size);
1219 
1220         let (ch, size) = utf8::decode_last_lossy(b"\xED\xA0\x80");
1221         assert_eq!('\u{FFFD}', ch);
1222         assert_eq!(1, size);
1223 
1224         let (ch, size) = utf8::decode_last_lossy(b"\xED\xA0");
1225         assert_eq!('\u{FFFD}', ch);
1226         assert_eq!(1, size);
1227 
1228         let (ch, size) = utf8::decode_last_lossy(b"\xED");
1229         assert_eq!('\u{FFFD}', ch);
1230         assert_eq!(1, size);
1231 
1232         let (ch, size) = utf8::decode_last_lossy(b"a\xCE");
1233         assert_eq!('\u{FFFD}', ch);
1234         assert_eq!(1, size);
1235 
1236         let (ch, size) = utf8::decode_last_lossy(b"a\xE2\x98");
1237         assert_eq!('\u{FFFD}', ch);
1238         assert_eq!(2, size);
1239 
1240         let (ch, size) = utf8::decode_last_lossy(b"a\xF0\x9D\x9C");
1241         assert_eq!('\u{FFFD}', ch);
1242         assert_eq!(3, size);
1243     }
1244 
1245     #[test]
chars()1246     fn chars() {
1247         for (i, &(expected, input)) in LOSSY_TESTS.iter().enumerate() {
1248             let got: String = B(input).chars().collect();
1249             assert_eq!(
1250                 expected, got,
1251                 "chars(ith: {:?}, given: {:?})",
1252                 i, input,
1253             );
1254             let got: String =
1255                 B(input).char_indices().map(|(_, _, ch)| ch).collect();
1256             assert_eq!(
1257                 expected, got,
1258                 "char_indices(ith: {:?}, given: {:?})",
1259                 i, input,
1260             );
1261 
1262             let expected: String = expected.chars().rev().collect();
1263 
1264             let got: String = B(input).chars().rev().collect();
1265             assert_eq!(
1266                 expected, got,
1267                 "chars.rev(ith: {:?}, given: {:?})",
1268                 i, input,
1269             );
1270             let got: String =
1271                 B(input).char_indices().rev().map(|(_, _, ch)| ch).collect();
1272             assert_eq!(
1273                 expected, got,
1274                 "char_indices.rev(ith: {:?}, given: {:?})",
1275                 i, input,
1276             );
1277         }
1278     }
1279 
1280     #[test]
utf8_chunks()1281     fn utf8_chunks() {
1282         let mut c = utf8::Utf8Chunks { bytes: b"123\xC0" };
1283         assert_eq!(
1284             (c.next(), c.next()),
1285             (
1286                 Some(utf8::Utf8Chunk {
1287                     valid: "123",
1288                     invalid: b"\xC0".as_bstr(),
1289                     incomplete: false,
1290                 }),
1291                 None,
1292             )
1293         );
1294 
1295         let mut c = utf8::Utf8Chunks { bytes: b"123\xFF\xFF" };
1296         assert_eq!(
1297             (c.next(), c.next(), c.next()),
1298             (
1299                 Some(utf8::Utf8Chunk {
1300                     valid: "123",
1301                     invalid: b"\xFF".as_bstr(),
1302                     incomplete: false,
1303                 }),
1304                 Some(utf8::Utf8Chunk {
1305                     valid: "",
1306                     invalid: b"\xFF".as_bstr(),
1307                     incomplete: false,
1308                 }),
1309                 None,
1310             )
1311         );
1312 
1313         let mut c = utf8::Utf8Chunks { bytes: b"123\xD0" };
1314         assert_eq!(
1315             (c.next(), c.next()),
1316             (
1317                 Some(utf8::Utf8Chunk {
1318                     valid: "123",
1319                     invalid: b"\xD0".as_bstr(),
1320                     incomplete: true,
1321                 }),
1322                 None,
1323             )
1324         );
1325 
1326         let mut c = utf8::Utf8Chunks { bytes: b"123\xD0456" };
1327         assert_eq!(
1328             (c.next(), c.next(), c.next()),
1329             (
1330                 Some(utf8::Utf8Chunk {
1331                     valid: "123",
1332                     invalid: b"\xD0".as_bstr(),
1333                     incomplete: false,
1334                 }),
1335                 Some(utf8::Utf8Chunk {
1336                     valid: "456",
1337                     invalid: b"".as_bstr(),
1338                     incomplete: false,
1339                 }),
1340                 None,
1341             )
1342         );
1343 
1344         let mut c = utf8::Utf8Chunks { bytes: b"123\xE2\x98" };
1345         assert_eq!(
1346             (c.next(), c.next()),
1347             (
1348                 Some(utf8::Utf8Chunk {
1349                     valid: "123",
1350                     invalid: b"\xE2\x98".as_bstr(),
1351                     incomplete: true,
1352                 }),
1353                 None,
1354             )
1355         );
1356 
1357         let mut c = utf8::Utf8Chunks { bytes: b"123\xF4\x8F\xBF" };
1358         assert_eq!(
1359             (c.next(), c.next()),
1360             (
1361                 Some(utf8::Utf8Chunk {
1362                     valid: "123",
1363                     invalid: b"\xF4\x8F\xBF".as_bstr(),
1364                     incomplete: true,
1365                 }),
1366                 None,
1367             )
1368         );
1369     }
1370 }
1371