• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use std::iter::FusedIterator;
2 use std::iter::Peekable;
3 use std::mem;
4 
5 use crate::util::is_continuation;
6 use crate::util::BYTE_SHIFT;
7 use crate::util::CONT_MASK;
8 
9 use super::EncodingError;
10 use super::Result;
11 
12 pub(in super::super) struct CodePoints<I>
13 where
14     I: Iterator<Item = u8>,
15 {
16     iter: Peekable<I>,
17     surrogate: bool,
18     still_utf8: bool,
19 }
20 
21 impl<I> CodePoints<I>
22 where
23     I: Iterator<Item = u8>,
24 {
new<S>(string: S) -> Self where S: IntoIterator<IntoIter = I>,25     pub(in super::super) fn new<S>(string: S) -> Self
26     where
27         S: IntoIterator<IntoIter = I>,
28     {
29         Self {
30             iter: string.into_iter().peekable(),
31             surrogate: false,
32             still_utf8: true,
33         }
34     }
35 
is_still_utf8(&self) -> bool36     pub(super) fn is_still_utf8(&self) -> bool {
37         self.still_utf8
38     }
39 
consume_next(&mut self, code_point: &mut u32) -> Result<()>40     fn consume_next(&mut self, code_point: &mut u32) -> Result<()> {
41         let &byte = self.iter.peek().ok_or(EncodingError::End())?;
42 
43         if !is_continuation(byte) {
44             self.surrogate = false;
45             // Not consuming this byte will be useful if this crate ever offers
46             // a way to encode lossily.
47             return Err(EncodingError::Byte(byte));
48         }
49         *code_point =
50             (*code_point << BYTE_SHIFT) | u32::from(byte & CONT_MASK);
51 
52         let removed = self.iter.next();
53         debug_assert_eq!(Some(byte), removed);
54 
55         Ok(())
56     }
57 
inner_size_hint(&self) -> (usize, Option<usize>)58     pub(super) fn inner_size_hint(&self) -> (usize, Option<usize>) {
59         self.iter.size_hint()
60     }
61 }
62 
63 impl<I> FusedIterator for CodePoints<I> where
64     I: FusedIterator + Iterator<Item = u8>
65 {
66 }
67 
68 impl<I> Iterator for CodePoints<I>
69 where
70     I: Iterator<Item = u8>,
71 {
72     type Item = Result<u32>;
73 
next(&mut self) -> Option<Self::Item>74     fn next(&mut self) -> Option<Self::Item> {
75         let byte = self.iter.next()?;
76         let mut code_point: u32 = byte.into();
77 
78         macro_rules! consume_next {
79             () => {{
80                 if let Err(error) = self.consume_next(&mut code_point) {
81                     return Some(Err(error));
82                 }
83             }};
84         }
85 
86         let prev_surrogate = mem::replace(&mut self.surrogate, false);
87 
88         let mut invalid = false;
89         if !byte.is_ascii() {
90             if byte < 0xC2 {
91                 return Some(Err(EncodingError::Byte(byte)));
92             }
93 
94             if byte < 0xE0 {
95                 code_point &= 0x1F;
96             } else {
97                 code_point &= 0x0F;
98                 consume_next!();
99 
100                 if byte >= 0xF0 {
101                     if code_point.wrapping_sub(0x10) >= 0x100 {
102                         invalid = true;
103                     }
104                     consume_next!();
105 
106                 // This condition is optimized to detect surrogate code points.
107                 } else if code_point & 0xFE0 == 0x360 {
108                     self.still_utf8 = false;
109                     if code_point & 0x10 == 0 {
110                         self.surrogate = true;
111                     } else if prev_surrogate {
112                         // Decoding a broken surrogate pair would be lossy.
113                         invalid = true;
114                     }
115                 }
116 
117                 if code_point < 0x20 {
118                     invalid = true;
119                 }
120             }
121             consume_next!();
122         }
123         if invalid {
124             return Some(Err(EncodingError::CodePoint(code_point)));
125         }
126 
127         Some(Ok(code_point))
128     }
129 }
130