// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4 
5 use core::cmp::Ordering;
6 use core::fmt;
7 
/// A 24-bit numeric data type that is expected to be a Unicode scalar value, but is not
/// validated as such.
///
/// Use this type instead of `char` when you want to deal with data that is expected to be valid
/// Unicode scalar values, but you want control over when or if you validate that assumption.
///
/// The value is stored as three bytes in little-endian order.
///
/// # Examples
///
/// ```
/// use potential_utf::PotentialCodePoint;
///
/// assert_eq!(PotentialCodePoint::from_u24(0x68).try_to_char(), Ok('h'));
/// assert_eq!(PotentialCodePoint::from_char('i').try_to_char(), Ok('i'));
/// assert_eq!(
///     PotentialCodePoint::from_u24(0x1F44B).try_to_char(),
///     Ok('👋')
/// );
///
/// assert!(PotentialCodePoint::from_u24(0xDE01).try_to_char().is_err());
/// assert_eq!(
///     PotentialCodePoint::from_u24(0xDE01).to_char_lossy(),
///     char::REPLACEMENT_CHARACTER
/// );
/// ```
#[repr(transparent)]
#[allow(clippy::exhaustive_structs)] // transparent newtype
#[derive(PartialEq, Eq, Clone, Copy, Hash)]
pub struct PotentialCodePoint([u8; 3]);
36 
37 impl PotentialCodePoint {
38     /// Create a [`PotentialCodePoint`] from a `char`.
39     ///
40     /// # Examples
41     ///
42     /// ```
43     /// use potential_utf::PotentialCodePoint;
44     ///
45     /// let a = PotentialCodePoint::from_char('a');
46     /// assert_eq!(a.try_to_char().unwrap(), 'a');
47     /// ```
48     #[inline]
from_char(c: char) -> Self49     pub const fn from_char(c: char) -> Self {
50         let [u0, u1, u2, _u3] = (c as u32).to_le_bytes();
51         Self([u0, u1, u2])
52     }
53 
54     /// Create [`PotentialCodePoint`] from a u32 value, ignoring the most significant 8 bits.
55     #[inline]
from_u24(c: u32) -> Self56     pub const fn from_u24(c: u32) -> Self {
57         let [u0, u1, u2, _u3] = c.to_le_bytes();
58         Self([u0, u1, u2])
59     }
60 
61     /// Attempt to convert a [`PotentialCodePoint`] to a `char`.
62     ///
63     /// # Examples
64     ///
65     /// ```
66     /// use potential_utf::PotentialCodePoint;
67     /// use zerovec::ule::AsULE;
68     ///
69     /// let a = PotentialCodePoint::from_char('a');
70     /// assert_eq!(a.try_to_char(), Ok('a'));
71     ///
72     /// let b = PotentialCodePoint::from_unaligned([0xFF, 0xFF, 0xFF].into());
73     /// assert!(matches!(b.try_to_char(), Err(_)));
74     /// ```
75     #[inline]
try_to_char(self) -> Result<char, core::char::CharTryFromError>76     pub fn try_to_char(self) -> Result<char, core::char::CharTryFromError> {
77         char::try_from(u32::from(self))
78     }
79 
80     /// Convert a [`PotentialCodePoint`] to a `char', returning [`char::REPLACEMENT_CHARACTER`]
81     /// if the `PotentialCodePoint` does not represent a valid Unicode scalar value.
82     ///
83     /// # Examples
84     ///
85     /// ```
86     /// use potential_utf::PotentialCodePoint;
87     /// use zerovec::ule::AsULE;
88     ///
89     /// let a = PotentialCodePoint::from_unaligned([0xFF, 0xFF, 0xFF].into());
90     /// assert_eq!(a.to_char_lossy(), char::REPLACEMENT_CHARACTER);
91     /// ```
92     #[inline]
to_char_lossy(self) -> char93     pub fn to_char_lossy(self) -> char {
94         self.try_to_char().unwrap_or(char::REPLACEMENT_CHARACTER)
95     }
96 
97     /// Convert a [`PotentialCodePoint`] to a `char` without checking that it is
98     /// a valid Unicode scalar value.
99     ///
100     /// # Safety
101     ///
102     /// The `PotentialCodePoint` must be a valid Unicode scalar value in little-endian order.
103     ///
104     /// # Examples
105     ///
106     /// ```
107     /// use potential_utf::PotentialCodePoint;
108     ///
109     /// let a = PotentialCodePoint::from_char('a');
110     /// assert_eq!(unsafe { a.to_char_unchecked() }, 'a');
111     /// ```
112     #[inline]
to_char_unchecked(self) -> char113     pub unsafe fn to_char_unchecked(self) -> char {
114         char::from_u32_unchecked(u32::from(self))
115     }
116 
117     /// For converting to the ULE type in a const context
118     ///
119     /// Can be removed once const traits are a thing
120     #[inline]
121     #[cfg(feature = "zerovec")]
to_unaligned(self) -> zerovec::ule::RawBytesULE<3>122     pub const fn to_unaligned(self) -> zerovec::ule::RawBytesULE<3> {
123         zerovec::ule::RawBytesULE(self.0)
124     }
125 }
126 
/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(feature = "zerovec")]
impl zerovec::ule::AsULE for PotentialCodePoint {
    /// The unaligned form is the same three bytes, wrapped in `RawBytesULE<3>`.
    type ULE = zerovec::ule::RawBytesULE<3>;

    /// Moves the three stored bytes into the ULE wrapper unchanged.
    #[inline]
    fn to_unaligned(self) -> Self::ULE {
        let Self(bytes) = self;
        zerovec::ule::RawBytesULE(bytes)
    }

    /// Takes the three bytes out of the ULE wrapper unchanged; no validation is performed.
    #[inline]
    fn from_unaligned(unaligned: Self::ULE) -> Self {
        let zerovec::ule::RawBytesULE(bytes) = unaligned;
        Self(bytes)
    }
}
142 
// Safety: PotentialCodePoint is a transparent wrapper around the three little-endian
// bytes that its AsULE::ULE type (RawBytesULE<3>) carries; to_unaligned/from_unaligned
// copy those bytes unchanged.
/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(feature = "zerovec")]
unsafe impl zerovec::ule::EqULE for PotentialCodePoint {}
148 
149 impl fmt::Debug for PotentialCodePoint {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result150     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
151         // Debug as a char if possible
152         match self.try_to_char() {
153             Ok(c) => fmt::Debug::fmt(&c, f),
154             Err(_) => fmt::Debug::fmt(&self.0, f),
155         }
156     }
157 }
158 
159 impl PartialOrd for PotentialCodePoint {
partial_cmp(&self, other: &Self) -> Option<Ordering>160     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
161         Some(self.cmp(other))
162     }
163 }
164 
165 impl PartialEq<char> for PotentialCodePoint {
eq(&self, other: &char) -> bool166     fn eq(&self, other: &char) -> bool {
167         self.eq(&Self::from_char(*other))
168     }
169 }
170 
171 impl PartialOrd<char> for PotentialCodePoint {
partial_cmp(&self, other: &char) -> Option<Ordering>172     fn partial_cmp(&self, other: &char) -> Option<Ordering> {
173         self.partial_cmp(&Self::from_char(*other))
174     }
175 }
176 
177 impl PartialEq<PotentialCodePoint> for char {
eq(&self, other: &PotentialCodePoint) -> bool178     fn eq(&self, other: &PotentialCodePoint) -> bool {
179         PotentialCodePoint::from_char(*self).eq(other)
180     }
181 }
182 
183 impl PartialOrd<PotentialCodePoint> for char {
partial_cmp(&self, other: &PotentialCodePoint) -> Option<Ordering>184     fn partial_cmp(&self, other: &PotentialCodePoint) -> Option<Ordering> {
185         PotentialCodePoint::from_char(*self).partial_cmp(other)
186     }
187 }
188 
189 impl Ord for PotentialCodePoint {
190     // custom implementation, as derived Ord would compare lexicographically
cmp(&self, other: &Self) -> Ordering191     fn cmp(&self, other: &Self) -> Ordering {
192         let a = u32::from(*self);
193         let b = u32::from(*other);
194         a.cmp(&b)
195     }
196 }
197 
198 impl From<PotentialCodePoint> for u32 {
from(x: PotentialCodePoint) -> Self199     fn from(x: PotentialCodePoint) -> Self {
200         let [a0, a1, a2] = x.0;
201         u32::from_le_bytes([a0, a1, a2, 0])
202     }
203 }
204 
205 impl TryFrom<u32> for PotentialCodePoint {
206     type Error = ();
try_from(x: u32) -> Result<Self, ()>207     fn try_from(x: u32) -> Result<Self, ()> {
208         let [u0, u1, u2, u3] = x.to_le_bytes();
209         if u3 != 0 {
210             return Err(());
211         }
212         Ok(Self([u0, u1, u2]))
213     }
214 }
215 
216 impl From<char> for PotentialCodePoint {
217     #[inline]
from(value: char) -> Self218     fn from(value: char) -> Self {
219         Self::from_char(value)
220     }
221 }
222 
223 impl TryFrom<PotentialCodePoint> for char {
224     type Error = core::char::CharTryFromError;
225 
226     #[inline]
try_from(value: PotentialCodePoint) -> Result<char, Self::Error>227     fn try_from(value: PotentialCodePoint) -> Result<char, Self::Error> {
228         value.try_to_char()
229     }
230 }
231 
/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl serde::Serialize for PotentialCodePoint {
    /// Serializes as a `char` for human-readable formats and as the raw three bytes
    /// otherwise.
    ///
    /// # Errors
    ///
    /// Fails for either kind of format if the value is not a valid Unicode scalar value.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::Error;

        // Validation runs before the format check, so invalid values are rejected
        // even when only the raw bytes would be written.
        let c = self
            .try_to_char()
            .map_err(|_| S::Error::custom("invalid Unicode scalar value in PotentialCodePoint"))?;

        if serializer.is_human_readable() {
            serializer.serialize_char(c)
        } else {
            serde::Serialize::serialize(&self.0, serializer)
        }
    }
}
250 
/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for PotentialCodePoint {
    /// Deserializes from a `char` for human-readable formats and from three raw bytes
    /// otherwise.
    ///
    /// The raw-byte path stores the bytes as-is; they are not checked for being a valid
    /// Unicode scalar value.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            <char as serde::Deserialize>::deserialize(deserializer).map(Self::from_char)
        } else {
            <[u8; 3] as serde::Deserialize>::deserialize(deserializer).map(Self)
        }
    }
}
267 
/// This impl requires enabling the optional `databake` Cargo feature
#[cfg(feature = "databake")]
impl databake::Bake for PotentialCodePoint {
    /// Emits a `from_char` constructor call when the value is a valid `char`, and a
    /// `from_u24` call with the numeric value otherwise.
    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
        // Both branches emit a path into `potential_utf`, so register the crate once.
        env.insert("potential_utf");
        match self.try_to_char() {
            Ok(ch) => {
                let ch = ch.bake(env);
                databake::quote! {
                    potential_utf::PotentialCodePoint::from_char(#ch)
                }
            }
            Err(_) => {
                let u24 = u32::from(*self);
                databake::quote! {
                    potential_utf::PotentialCodePoint::from_u24(#u24)
                }
            }
        }
    }
}
290 
#[cfg(test)]
mod test {
    use super::*;
    use zerovec::ZeroVec;

    /// Invalid scalar values must be rejected by both human-readable and binary serializers.
    #[test]
    fn test_serde_fail() {
        let uc = PotentialCodePoint([0xFF, 0xFF, 0xFF]);
        serde_json::to_string(&uc).expect_err("serialize invalid char bytes");
        bincode::serialize(&uc).expect_err("serialize invalid char bytes");
    }

    /// Human-readable formats round-trip through a `char`.
    #[test]
    fn test_serde_json() {
        let c = '🙃'; // U+1F643
        let uc = PotentialCodePoint::from_char(c);
        let json_ser = serde_json::to_string(&uc).unwrap();

        assert_eq!(json_ser, r#""🙃""#);

        let json_de: PotentialCodePoint = serde_json::from_str(&json_ser).unwrap();

        assert_eq!(uc, json_de);
    }

    /// Binary formats round-trip through the three little-endian bytes.
    #[test]
    fn test_serde_bincode() {
        let c = '🙃'; // U+1F643
        let uc = PotentialCodePoint::from_char(c);
        let bytes_ser = bincode::serialize(&uc).unwrap();

        assert_eq!(bytes_ser, [0x43, 0xF6, 0x01]);

        let bytes_de: PotentialCodePoint = bincode::deserialize(&bytes_ser).unwrap();

        assert_eq!(uc, bytes_de);
    }

    /// The in-memory bytes of `[PotentialCodePoint]` must equal the ULE bytes.
    #[test]
    fn test_representation() {
        // U+0077, U+03C9, U+6587, U+11103, U+1F643
        let chars = ['w', 'ω', '文', '\u{11103}', '🙃'];

        // backed by [PotentialCodePoint]
        let uvchars: Vec<_> = chars
            .iter()
            .copied()
            .map(PotentialCodePoint::from_char)
            .collect();
        // backed by [RawBytesULE<3>]
        let zvec: ZeroVec<_> = uvchars.clone().into_iter().collect();

        let ule_bytes = zvec.as_bytes();
        // SAFETY: `uvchars` is alive for the rest of the test and holds
        // `#[repr(transparent)]` wrappers around `[u8; 3]`; `zvec` was built from the same
        // elements with a 3-byte ULE, so `ule_bytes.len()` bytes are readable from `ptr`.
        let uvbytes = unsafe {
            let ptr = &uvchars[..] as *const _ as *const u8;
            core::slice::from_raw_parts(ptr, ule_bytes.len())
        };

        // PotentialCodePoint is defined as little-endian, so this must be true on all platforms
        // also asserts that to_unaligned/from_unaligned are no-ops
        assert_eq!(uvbytes, ule_bytes);

        assert_eq!(
            &[119, 0, 0, 201, 3, 0, 135, 101, 0, 3, 17, 1, 67, 246, 1],
            ule_bytes
        );
    }

    /// Baking emits `from_char` for valid chars and `from_u24` otherwise.
    #[test]
    fn test_char_bake() {
        databake::test_bake!(
            PotentialCodePoint,
            const,
            crate::PotentialCodePoint::from_char('b'),
            potential_utf
        );
        // surrogate code point
        databake::test_bake!(
            PotentialCodePoint,
            const,
            crate::PotentialCodePoint::from_u24(55296u32),
            potential_utf
        );
    }
}
376