1 use std::iter::FusedIterator; 2 use std::iter::Peekable; 3 use std::mem; 4 5 use crate::util::is_continuation; 6 use crate::util::BYTE_SHIFT; 7 use crate::util::CONT_MASK; 8 9 use super::EncodingError; 10 use super::Result; 11 12 pub(in super::super) struct CodePoints<I> 13 where 14 I: Iterator<Item = u8>, 15 { 16 iter: Peekable<I>, 17 surrogate: bool, 18 still_utf8: bool, 19 } 20 21 impl<I> CodePoints<I> 22 where 23 I: Iterator<Item = u8>, 24 { new<S>(string: S) -> Self where S: IntoIterator<IntoIter = I>,25 pub(in super::super) fn new<S>(string: S) -> Self 26 where 27 S: IntoIterator<IntoIter = I>, 28 { 29 Self { 30 iter: string.into_iter().peekable(), 31 surrogate: false, 32 still_utf8: true, 33 } 34 } 35 is_still_utf8(&self) -> bool36 pub(super) fn is_still_utf8(&self) -> bool { 37 self.still_utf8 38 } 39 consume_next(&mut self, code_point: &mut u32) -> Result<()>40 fn consume_next(&mut self, code_point: &mut u32) -> Result<()> { 41 let &byte = self.iter.peek().ok_or(EncodingError::End())?; 42 43 if !is_continuation(byte) { 44 self.surrogate = false; 45 // Not consuming this byte will be useful if this crate ever offers 46 // a way to encode lossily. 47 return Err(EncodingError::Byte(byte)); 48 } 49 *code_point = 50 (*code_point << BYTE_SHIFT) | u32::from(byte & CONT_MASK); 51 52 let removed = self.iter.next(); 53 debug_assert_eq!(Some(byte), removed); 54 55 Ok(()) 56 } 57 inner_size_hint(&self) -> (usize, Option<usize>)58 pub(super) fn inner_size_hint(&self) -> (usize, Option<usize>) { 59 self.iter.size_hint() 60 } 61 } 62 63 impl<I> FusedIterator for CodePoints<I> where 64 I: FusedIterator + Iterator<Item = u8> 65 { 66 } 67 68 impl<I> Iterator for CodePoints<I> 69 where 70 I: Iterator<Item = u8>, 71 { 72 type Item = Result<u32>; 73 next(&mut self) -> Option<Self::Item>74 fn next(&mut self) -> Option<Self::Item> { 75 let byte = self.iter.next()?; 76 let mut code_point: u32 = byte.into(); 77 78 macro_rules! consume_next { 79 () => {{ 80 if let Err(error) = self.consume_next(&mut code_point) { 81 return Some(Err(error)); 82 } 83 }}; 84 } 85 86 let prev_surrogate = mem::replace(&mut self.surrogate, false); 87 88 let mut invalid = false; 89 if !byte.is_ascii() { 90 if byte < 0xC2 { 91 return Some(Err(EncodingError::Byte(byte))); 92 } 93 94 if byte < 0xE0 { 95 code_point &= 0x1F; 96 } else { 97 code_point &= 0x0F; 98 consume_next!(); 99 100 if byte >= 0xF0 { 101 if code_point.wrapping_sub(0x10) >= 0x100 { 102 invalid = true; 103 } 104 consume_next!(); 105 106 // This condition is optimized to detect surrogate code points. 107 } else if code_point & 0xFE0 == 0x360 { 108 self.still_utf8 = false; 109 if code_point & 0x10 == 0 { 110 self.surrogate = true; 111 } else if prev_surrogate { 112 // Decoding a broken surrogate pair would be lossy. 113 invalid = true; 114 } 115 } 116 117 if code_point < 0x20 { 118 invalid = true; 119 } 120 } 121 consume_next!(); 122 } 123 if invalid { 124 return Some(Err(EncodingError::CodePoint(code_point))); 125 } 126 127 Some(Ok(code_point)) 128 } 129 } 130