// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use core::cmp::Ordering;
use core::fmt;

/// A 24-bit numeric data type that is expected to be a Unicode scalar value, but is not
/// validated as such.
///
/// Use this type instead of `char` when you want to deal with data that is expected to be valid
/// Unicode scalar values, but you want control over when or if you validate that assumption.
///
/// The value is stored as three bytes in little-endian order.
///
/// # Examples
///
/// ```
/// use potential_utf::PotentialCodePoint;
///
/// assert_eq!(PotentialCodePoint::from_u24(0x68).try_to_char(), Ok('h'));
/// assert_eq!(PotentialCodePoint::from_char('i').try_to_char(), Ok('i'));
/// assert_eq!(
///     PotentialCodePoint::from_u24(0x1F44B).try_to_char(),
///     Ok('\u{1F44B}')
/// );
///
/// assert!(PotentialCodePoint::from_u24(0xDE01).try_to_char().is_err());
/// assert_eq!(
///     PotentialCodePoint::from_u24(0xDE01).to_char_lossy(),
///     char::REPLACEMENT_CHARACTER
/// );
/// ```
#[repr(transparent)]
#[allow(clippy::exhaustive_structs)] // transparent newtype
#[derive(PartialEq, Eq, Clone, Copy, Hash)]
pub struct PotentialCodePoint([u8; 3]);

impl PotentialCodePoint {
    /// Create a [`PotentialCodePoint`] from a `char`.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialCodePoint;
    ///
    /// let a = PotentialCodePoint::from_char('a');
    /// assert_eq!(a.try_to_char().unwrap(), 'a');
    /// ```
    #[inline]
    pub const fn from_char(c: char) -> Self {
        // The fourth (most significant) little-endian byte is dropped.
        let [u0, u1, u2, _u3] = (c as u32).to_le_bytes();
        Self([u0, u1, u2])
    }

    /// Create a [`PotentialCodePoint`] from a `u32` value, ignoring the most significant 8 bits.
    #[inline]
    pub const fn from_u24(c: u32) -> Self {
        let [u0, u1, u2, _u3] = c.to_le_bytes();
        Self([u0, u1, u2])
    }

    /// Attempt to convert a [`PotentialCodePoint`] to a `char`.
    ///
    /// # Errors
    ///
    /// Returns an error if the stored value is not a valid Unicode scalar value.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialCodePoint;
    /// use zerovec::ule::AsULE;
    ///
    /// let a = PotentialCodePoint::from_char('a');
    /// assert_eq!(a.try_to_char(), Ok('a'));
    ///
    /// let b = PotentialCodePoint::from_unaligned([0xFF, 0xFF, 0xFF].into());
    /// assert!(matches!(b.try_to_char(), Err(_)));
    /// ```
    #[inline]
    pub fn try_to_char(self) -> Result<char, core::char::CharTryFromError> {
        char::try_from(u32::from(self))
    }

    /// Convert a [`PotentialCodePoint`] to a `char`, returning [`char::REPLACEMENT_CHARACTER`]
    /// if the `PotentialCodePoint` does not represent a valid Unicode scalar value.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialCodePoint;
    /// use zerovec::ule::AsULE;
    ///
    /// let a = PotentialCodePoint::from_unaligned([0xFF, 0xFF, 0xFF].into());
    /// assert_eq!(a.to_char_lossy(), char::REPLACEMENT_CHARACTER);
    /// ```
    #[inline]
    pub fn to_char_lossy(self) -> char {
        self.try_to_char().unwrap_or(char::REPLACEMENT_CHARACTER)
    }

    /// Convert a [`PotentialCodePoint`] to a `char` without checking that it is
    /// a valid Unicode scalar value.
    ///
    /// # Safety
    ///
    /// The `PotentialCodePoint` must be a valid Unicode scalar value in little-endian order.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialCodePoint;
    ///
    /// let a = PotentialCodePoint::from_char('a');
    /// assert_eq!(unsafe { a.to_char_unchecked() }, 'a');
    /// ```
    #[inline]
    pub unsafe fn to_char_unchecked(self) -> char {
        // SAFETY: the caller guarantees that `self` holds a valid Unicode scalar value.
        unsafe { char::from_u32_unchecked(u32::from(self)) }
    }

    /// For converting to the ULE type in a const context
    ///
    /// Can be removed once const traits are a thing
    #[inline]
    #[cfg(feature = "zerovec")]
    pub const fn to_unaligned(self) -> zerovec::ule::RawBytesULE<3> {
        zerovec::ule::RawBytesULE(self.0)
    }
}

/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(feature = "zerovec")]
impl zerovec::ule::AsULE for PotentialCodePoint {
    type ULE = zerovec::ule::RawBytesULE<3>;

    #[inline]
    fn to_unaligned(self) -> Self::ULE {
        zerovec::ule::RawBytesULE(self.0)
    }

    #[inline]
    fn from_unaligned(unaligned: Self::ULE) -> Self {
        Self(unaligned.0)
    }
}

// Safety: PotentialCodePoint always stores its value as three little-endian bytes,
// which is the same byte sequence that its AsULE::ULE type wraps
/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(feature = "zerovec")]
unsafe impl zerovec::ule::EqULE for PotentialCodePoint {}

impl fmt::Debug for PotentialCodePoint {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Debug as a char if possible, otherwise fall back to the raw bytes
        match self.try_to_char() {
            Ok(c) => fmt::Debug::fmt(&c, f),
            Err(_) => fmt::Debug::fmt(&self.0, f),
        }
    }
}

impl PartialOrd for PotentialCodePoint {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl PartialEq<char> for PotentialCodePoint {
    fn eq(&self, other: &char) -> bool {
        self.eq(&Self::from_char(*other))
    }
}

impl PartialOrd<char> for PotentialCodePoint {
    fn partial_cmp(&self, other: &char) -> Option<Ordering> {
        self.partial_cmp(&Self::from_char(*other))
    }
}

impl PartialEq<PotentialCodePoint> for char {
    fn eq(&self, other: &PotentialCodePoint) -> bool {
        PotentialCodePoint::from_char(*self).eq(other)
    }
}

impl PartialOrd<PotentialCodePoint> for char {
    fn partial_cmp(&self, other: &PotentialCodePoint) -> Option<Ordering> {
        PotentialCodePoint::from_char(*self).partial_cmp(other)
    }
}

impl Ord for PotentialCodePoint {
    // custom implementation, as derived Ord would compare the little-endian bytes
    // lexicographically (least significant byte first) instead of numerically
    fn cmp(&self, other: &Self) -> Ordering {
        let a = u32::from(*self);
        let b = u32::from(*other);
        a.cmp(&b)
    }
}

impl From<PotentialCodePoint> for u32 {
    fn from(x: PotentialCodePoint) -> Self {
        let [a0, a1, a2] = x.0;
        u32::from_le_bytes([a0, a1, a2, 0])
    }
}

/// Fails with `Err(())` if the value does not fit in 24 bits.
impl TryFrom<u32> for PotentialCodePoint {
    type Error = ();

    fn try_from(x: u32) -> Result<Self, ()> {
        let [u0, u1, u2, u3] = x.to_le_bytes();
        if u3 != 0 {
            return Err(());
        }
        Ok(Self([u0, u1, u2]))
    }
}

impl From<char> for PotentialCodePoint {
    #[inline]
    fn from(value: char) -> Self {
        Self::from_char(value)
    }
}

impl TryFrom<PotentialCodePoint> for char {
    type Error = core::char::CharTryFromError;

    #[inline]
    fn try_from(value: PotentialCodePoint) -> Result<char, Self::Error> {
        value.try_to_char()
    }
}

/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl serde::Serialize for PotentialCodePoint {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::Error;
        // Validation happens before the format is chosen, so a value that is not a
        // Unicode scalar value is rejected by both human-readable and binary formats.
        let c = self
            .try_to_char()
            .map_err(|_| S::Error::custom("invalid Unicode scalar value in PotentialCodePoint"))?;
        if serializer.is_human_readable() {
            serializer.serialize_char(c)
        } else {
            self.0.serialize(serializer)
        }
    }
}

/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for PotentialCodePoint {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            let c = <char>::deserialize(deserializer)?;
            Ok(PotentialCodePoint::from_char(c))
        } else {
            // The three bytes are taken as-is; they are not validated here.
            let bytes = <[u8; 3]>::deserialize(deserializer)?;
            Ok(PotentialCodePoint(bytes))
        }
    }
}

/// This impl requires enabling the optional `databake` Cargo feature
#[cfg(feature = "databake")]
impl databake::Bake for PotentialCodePoint {
    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
        // Both output forms reference the `potential_utf` crate.
        env.insert("potential_utf");
        match self.try_to_char() {
            Ok(ch) => {
                let ch = ch.bake(env);
                databake::quote! {
                    potential_utf::PotentialCodePoint::from_char(#ch)
                }
            }
            Err(_) => {
                // Not a valid `char`, so emit the raw 24-bit value instead.
                let u24 = u32::from(*self);
                databake::quote! {
                    potential_utf::PotentialCodePoint::from_u24(#u24)
                }
            }
        }
    }
}

// These tests exercise the `serde`, `zerovec` and `databake` impls above, each of which
// only exists when the corresponding Cargo feature is enabled.
#[cfg(all(test, feature = "databake", feature = "serde", feature = "zerovec"))]
mod test {
    use super::*;
    use zerovec::ZeroVec;

    #[test]
    fn test_serde_fail() {
        let uc = PotentialCodePoint([0xFF, 0xFF, 0xFF]);
        serde_json::to_string(&uc).expect_err("serialize invalid char bytes");
        bincode::serialize(&uc).expect_err("serialize invalid char bytes");
    }

    #[test]
    fn test_serde_json() {
        // U+1F643, written as an escape so the literal survives lossy text tooling
        let c = '\u{1F643}';
        let uc = PotentialCodePoint::from_char(c);
        let json_ser = serde_json::to_string(&uc).unwrap();

        assert_eq!(json_ser, "\"\u{1F643}\"");

        let json_de: PotentialCodePoint = serde_json::from_str(&json_ser).unwrap();

        assert_eq!(uc, json_de);
    }

    #[test]
    fn test_serde_bincode() {
        // U+1F643, whose little-endian bytes are asserted below
        let c = '\u{1F643}';
        let uc = PotentialCodePoint::from_char(c);
        let bytes_ser = bincode::serialize(&uc).unwrap();

        assert_eq!(bytes_ser, [0x43, 0xF6, 0x01]);

        let bytes_de: PotentialCodePoint = bincode::deserialize(&bytes_ser).unwrap();

        assert_eq!(uc, bytes_de);
    }

    #[test]
    fn test_representation() {
        // The last two entries are U+11103 and U+1F643, matching the bytes asserted below.
        let chars = ['w', 'ω', '文', '\u{11103}', '\u{1F643}'];

        // backed by [PotentialCodePoint]
        let uvchars: Vec<_> = chars
            .iter()
            .copied()
            .map(PotentialCodePoint::from_char)
            .collect();
        // backed by [RawBytesULE<3>]
        let zvec: ZeroVec<_> = uvchars.clone().into_iter().collect();

        let ule_bytes = zvec.as_bytes();
        let uvbytes;
        // SAFETY: `PotentialCodePoint` is `repr(transparent)` over `[u8; 3]`, so the slice
        // is a contiguous run of `3 * uvchars.len()` initialized bytes; `ule_bytes` is
        // expected to have exactly that length since it holds the same elements.
        unsafe {
            let ptr = &uvchars[..] as *const _ as *const u8;
            uvbytes = core::slice::from_raw_parts(ptr, ule_bytes.len());
        }

        // PotentialCodePoint is defined as little-endian, so this must be true on all platforms
        // also asserts that to_unaligned/from_unaligned are no-ops
        assert_eq!(uvbytes, ule_bytes);

        assert_eq!(
            &[119, 0, 0, 201, 3, 0, 135, 101, 0, 3, 17, 1, 67, 246, 1],
            ule_bytes
        );
    }

    #[test]
    fn test_char_bake() {
        databake::test_bake!(
            PotentialCodePoint,
            const,
            crate::PotentialCodePoint::from_char('b'),
            potential_utf
        );
        // surrogate code point
        databake::test_bake!(
            PotentialCodePoint,
            const,
            crate::PotentialCodePoint::from_u24(55296u32),
            potential_utf
        );
    }
}