1 // This file is part of ICU4X. For terms of use, please see the file 2 // called LICENSE at the top level of the ICU4X source tree 3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 4 5 #![allow(clippy::upper_case_acronyms)] 6 //! ULE implementation for the `char` type. 7 8 use super::*; 9 use crate::impl_ule_from_array; 10 use core::cmp::Ordering; 11 use core::convert::TryFrom; 12 13 /// A u8 array of little-endian data corresponding to a Unicode scalar value. 14 /// 15 /// The bytes of a `CharULE` are guaranteed to represent a little-endian-encoded u32 that is a 16 /// valid `char` and can be converted without validation. 17 /// 18 /// # Examples 19 /// 20 /// Convert a `char` to a `CharULE` and back again: 21 /// 22 /// ``` 23 /// use zerovec::ule::{AsULE, CharULE, ULE}; 24 /// 25 /// let c1 = ''; 26 /// let ule = c1.to_unaligned(); 27 /// assert_eq!(CharULE::slice_as_bytes(&[ule]), &[0x03, 0x11, 0x01]); 28 /// let c2 = char::from_unaligned(ule); 29 /// assert_eq!(c1, c2); 30 /// ``` 31 /// 32 /// Attempt to parse invalid bytes to a `CharULE`: 33 /// 34 /// ``` 35 /// use zerovec::ule::{CharULE, ULE}; 36 /// 37 /// let bytes: &[u8] = &[0xFF, 0xFF, 0xFF, 0xFF]; 38 /// CharULE::parse_bytes_to_slice(bytes).expect_err("Invalid bytes"); 39 /// ``` 40 #[repr(transparent)] 41 #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)] 42 pub struct CharULE([u8; 3]); 43 44 impl CharULE { 45 /// Converts a [`char`] to a [`CharULE`]. This is equivalent to calling 46 /// [`AsULE::to_unaligned()`] 47 /// 48 /// See the type-level documentation for [`CharULE`] for more information. 49 #[inline] from_aligned(c: char) -> Self50 pub const fn from_aligned(c: char) -> Self { 51 let [u0, u1, u2, _u3] = (c as u32).to_le_bytes(); 52 Self([u0, u1, u2]) 53 } 54 55 /// Converts this [`CharULE`] to a [`char`]. This is equivalent to calling 56 /// [`AsULE::from_unaligned`] 57 /// 58 /// See the type-level documentation for [`CharULE`] for more information. 59 #[inline] to_char(self) -> char60 pub fn to_char(self) -> char { 61 let [b0, b1, b2] = self.0; 62 // Safe because the bytes of CharULE are defined to represent a valid Unicode scalar value. 63 unsafe { char::from_u32_unchecked(u32::from_le_bytes([b0, b1, b2, 0])) } 64 } 65 66 impl_ule_from_array!(char, CharULE, Self([0; 3])); 67 } 68 69 // Safety (based on the safety checklist on the ULE trait): 70 // 1. CharULE does not include any uninitialized or padding bytes. 71 // (achieved by `#[repr(transparent)]` on a type that satisfies this invariant) 72 // 2. CharULE is aligned to 1 byte. 73 // (achieved by `#[repr(transparent)]` on a type that satisfies this invariant) 74 // 3. The impl of validate_bytes() returns an error if any byte is not valid. 75 // 4. The impl of validate_bytes() returns an error if there are extra bytes. 76 // 5. The other ULE methods use the default impl. 77 // 6. CharULE byte equality is semantic equality 78 unsafe impl ULE for CharULE { 79 #[inline] validate_bytes(bytes: &[u8]) -> Result<(), UleError>80 fn validate_bytes(bytes: &[u8]) -> Result<(), UleError> { 81 if bytes.len() % 3 != 0 { 82 return Err(UleError::length::<Self>(bytes.len())); 83 } 84 // Validate the bytes 85 for chunk in bytes.chunks_exact(3) { 86 // TODO: Use slice::as_chunks() when stabilized 87 #[allow(clippy::indexing_slicing)] 88 // Won't panic because the chunks are always 3 bytes long 89 let u = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]); 90 char::try_from(u).map_err(|_| UleError::parse::<Self>())?; 91 } 92 Ok(()) 93 } 94 } 95 96 impl AsULE for char { 97 type ULE = CharULE; 98 99 #[inline] to_unaligned(self) -> Self::ULE100 fn to_unaligned(self) -> Self::ULE { 101 CharULE::from_aligned(self) 102 } 103 104 #[inline] from_unaligned(unaligned: Self::ULE) -> Self105 fn from_unaligned(unaligned: Self::ULE) -> Self { 106 unaligned.to_char() 107 } 108 } 109 110 impl PartialOrd for CharULE { partial_cmp(&self, other: &Self) -> Option<Ordering>111 fn partial_cmp(&self, other: &Self) -> Option<Ordering> { 112 Some(self.cmp(other)) 113 } 114 } 115 116 impl Ord for CharULE { cmp(&self, other: &Self) -> Ordering117 fn cmp(&self, other: &Self) -> Ordering { 118 char::from_unaligned(*self).cmp(&char::from_unaligned(*other)) 119 } 120 } 121 122 #[cfg(test)] 123 mod test { 124 use super::*; 125 126 #[test] test_from_array()127 fn test_from_array() { 128 const CHARS: [char; 2] = ['a', '']; 129 const CHARS_ULE: [CharULE; 2] = CharULE::from_array(CHARS); 130 assert_eq!( 131 CharULE::slice_as_bytes(&CHARS_ULE), 132 &[0x61, 0x00, 0x00, 0x43, 0xF6, 0x01] 133 ); 134 } 135 136 #[test] test_from_array_zst()137 fn test_from_array_zst() { 138 const CHARS: [char; 0] = []; 139 const CHARS_ULE: [CharULE; 0] = CharULE::from_array(CHARS); 140 let bytes = CharULE::slice_as_bytes(&CHARS_ULE); 141 let empty: &[u8] = &[]; 142 assert_eq!(bytes, empty); 143 } 144 145 #[test] test_parse()146 fn test_parse() { 147 // 1-byte, 2-byte, 3-byte, and two 4-byte character in UTF-8 (not as relevant in UTF-32) 148 let chars = ['w', 'ω', '文', '', '']; 149 let char_ules: Vec<CharULE> = chars.iter().copied().map(char::to_unaligned).collect(); 150 let char_bytes: &[u8] = CharULE::slice_as_bytes(&char_ules); 151 152 // Check parsing 153 let parsed_ules: &[CharULE] = CharULE::parse_bytes_to_slice(char_bytes).unwrap(); 154 assert_eq!(char_ules, parsed_ules); 155 let parsed_chars: Vec<char> = parsed_ules 156 .iter() 157 .copied() 158 .map(char::from_unaligned) 159 .collect(); 160 assert_eq!(&chars, parsed_chars.as_slice()); 161 162 // Compare to golden expected data 163 assert_eq!( 164 &[119, 0, 0, 201, 3, 0, 135, 101, 0, 3, 17, 1, 67, 246, 1], 165 char_bytes 166 ); 167 } 168 169 #[test] test_failures()170 fn test_failures() { 171 // 119 and 120 are valid, but not 0xD800 (high surrogate) 172 let u32s = [119, 0xD800, 120]; 173 let u32_ules: Vec<RawBytesULE<4>> = u32s 174 .iter() 175 .copied() 176 .map(<u32 as AsULE>::to_unaligned) 177 .collect(); 178 let u32_bytes: &[u8] = RawBytesULE::<4>::slice_as_bytes(&u32_ules); 179 let parsed_ules_result = CharULE::parse_bytes_to_slice(u32_bytes); 180 assert!(parsed_ules_result.is_err()); 181 182 // 0x20FFFF is out of range for a char 183 let u32s = [0x20FFFF]; 184 let u32_ules: Vec<RawBytesULE<4>> = u32s 185 .iter() 186 .copied() 187 .map(<u32 as AsULE>::to_unaligned) 188 .collect(); 189 let u32_bytes: &[u8] = RawBytesULE::<4>::slice_as_bytes(&u32_ules); 190 let parsed_ules_result = CharULE::parse_bytes_to_slice(u32_bytes); 191 assert!(parsed_ules_result.is_err()); 192 } 193 } 194