// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

#![allow(clippy::upper_case_acronyms)]
//! ULE implementation for the `char` type.

use super::*;
use crate::impl_ule_from_array;
use core::cmp::Ordering;
use core::convert::TryFrom;

/// A u8 array of little-endian data corresponding to a Unicode scalar value.
///
/// The bytes of a `CharULE` are guaranteed to represent a little-endian-encoded u32 that is a
/// valid `char` and can be converted without validation. Only the three low-order bytes of
/// that u32 are stored; the fourth byte is always zero for a valid `char`.
///
/// # Examples
///
/// Convert a `char` to a `CharULE` and back again:
///
/// ```
/// use zerovec::ule::{AsULE, CharULE, ULE};
///
/// // U+11103, stored as its three low-order little-endian bytes
/// let c1 = '\u{11103}';
/// let ule = c1.to_unaligned();
/// assert_eq!(CharULE::slice_as_bytes(&[ule]), &[0x03, 0x11, 0x01]);
/// let c2 = char::from_unaligned(ule);
/// assert_eq!(c1, c2);
/// ```
///
/// Attempt to parse invalid bytes to a `CharULE`:
///
/// ```
/// use zerovec::ule::{CharULE, ULE};
///
/// // 0xFFFFFF is larger than the maximum Unicode scalar value (0x10FFFF)
/// let bytes: &[u8] = &[0xFF, 0xFF, 0xFF];
/// CharULE::parse_bytes_to_slice(bytes).expect_err("Invalid bytes");
///
/// // The byte length must be a multiple of 3
/// let bytes: &[u8] = &[0xFF, 0xFF, 0xFF, 0xFF];
/// CharULE::parse_bytes_to_slice(bytes).expect_err("Invalid bytes");
/// ```
#[repr(transparent)]
#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
pub struct CharULE([u8; 3]);

44 impl CharULE {
45     /// Converts a [`char`] to a [`CharULE`]. This is equivalent to calling
46     /// [`AsULE::to_unaligned()`]
47     ///
48     /// See the type-level documentation for [`CharULE`] for more information.
49     #[inline]
from_aligned(c: char) -> Self50     pub const fn from_aligned(c: char) -> Self {
51         let [u0, u1, u2, _u3] = (c as u32).to_le_bytes();
52         Self([u0, u1, u2])
53     }
54 
55     /// Converts this [`CharULE`] to a [`char`]. This is equivalent to calling
56     /// [`AsULE::from_unaligned`]
57     ///
58     /// See the type-level documentation for [`CharULE`] for more information.
59     #[inline]
to_char(self) -> char60     pub fn to_char(self) -> char {
61         let [b0, b1, b2] = self.0;
62         // Safe because the bytes of CharULE are defined to represent a valid Unicode scalar value.
63         unsafe { char::from_u32_unchecked(u32::from_le_bytes([b0, b1, b2, 0])) }
64     }
65 
66     impl_ule_from_array!(char, CharULE, Self([0; 3]));
67 }
68 
69 // Safety (based on the safety checklist on the ULE trait):
70 //  1. CharULE does not include any uninitialized or padding bytes.
71 //     (achieved by `#[repr(transparent)]` on a type that satisfies this invariant)
72 //  2. CharULE is aligned to 1 byte.
73 //     (achieved by `#[repr(transparent)]` on a type that satisfies this invariant)
74 //  3. The impl of validate_bytes() returns an error if any byte is not valid.
75 //  4. The impl of validate_bytes() returns an error if there are extra bytes.
76 //  5. The other ULE methods use the default impl.
77 //  6. CharULE byte equality is semantic equality
78 unsafe impl ULE for CharULE {
79     #[inline]
validate_bytes(bytes: &[u8]) -> Result<(), UleError>80     fn validate_bytes(bytes: &[u8]) -> Result<(), UleError> {
81         if bytes.len() % 3 != 0 {
82             return Err(UleError::length::<Self>(bytes.len()));
83         }
84         // Validate the bytes
85         for chunk in bytes.chunks_exact(3) {
86             // TODO: Use slice::as_chunks() when stabilized
87             #[allow(clippy::indexing_slicing)]
88             // Won't panic because the chunks are always 3 bytes long
89             let u = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]);
90             char::try_from(u).map_err(|_| UleError::parse::<Self>())?;
91         }
92         Ok(())
93     }
94 }
95 
96 impl AsULE for char {
97     type ULE = CharULE;
98 
99     #[inline]
to_unaligned(self) -> Self::ULE100     fn to_unaligned(self) -> Self::ULE {
101         CharULE::from_aligned(self)
102     }
103 
104     #[inline]
from_unaligned(unaligned: Self::ULE) -> Self105     fn from_unaligned(unaligned: Self::ULE) -> Self {
106         unaligned.to_char()
107     }
108 }
109 
110 impl PartialOrd for CharULE {
partial_cmp(&self, other: &Self) -> Option<Ordering>111     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
112         Some(self.cmp(other))
113     }
114 }
115 
116 impl Ord for CharULE {
cmp(&self, other: &Self) -> Ordering117     fn cmp(&self, other: &Self) -> Ordering {
118         char::from_unaligned(*self).cmp(&char::from_unaligned(*other))
119     }
120 }
121 
#[cfg(test)]
mod test {
    use super::*;

    /// `from_array` must be usable in a const context and produce the 3-byte encodings.
    #[test]
    fn test_from_array() {
        // 'a' is U+0061; '\u{1F643}' encodes to the little-endian bytes 43 F6 01.
        const CHARS: [char; 2] = ['a', '\u{1F643}'];
        const CHARS_ULE: [CharULE; 2] = CharULE::from_array(CHARS);
        assert_eq!(
            CharULE::slice_as_bytes(&CHARS_ULE),
            &[0x61, 0x00, 0x00, 0x43, 0xF6, 0x01]
        );
    }

    /// `from_array` on an empty array yields an empty byte slice.
    #[test]
    fn test_from_array_zst() {
        const CHARS: [char; 0] = [];
        const CHARS_ULE: [CharULE; 0] = CharULE::from_array(CHARS);
        let bytes = CharULE::slice_as_bytes(&CHARS_ULE);
        let empty: &[u8] = &[];
        assert_eq!(bytes, empty);
    }

    /// Round-trips chars through `CharULE` bytes and checks them against golden data.
    #[test]
    fn test_parse() {
        // 1-byte, 2-byte, 3-byte, and two 4-byte character in UTF-8 (not as relevant in UTF-32)
        let chars = ['w', 'ω', '文', '\u{11103}', '\u{1F643}'];
        let char_ules: Vec<CharULE> = chars.iter().copied().map(char::to_unaligned).collect();
        let char_bytes: &[u8] = CharULE::slice_as_bytes(&char_ules);

        // Check parsing
        let parsed_ules: &[CharULE] = CharULE::parse_bytes_to_slice(char_bytes).unwrap();
        assert_eq!(char_ules, parsed_ules);
        let parsed_chars: Vec<char> = parsed_ules
            .iter()
            .copied()
            .map(char::from_unaligned)
            .collect();
        assert_eq!(&chars, parsed_chars.as_slice());

        // Compare to golden expected data
        assert_eq!(
            &[119, 0, 0, 201, 3, 0, 135, 101, 0, 3, 17, 1, 67, 246, 1],
            char_bytes
        );
    }

    /// Invalid inputs must be rejected, each for the reason stated in its comment.
    ///
    /// All value checks use 3-byte little-endian encodings so that the failure comes from
    /// the scalar-value validation and not from a length mismatch.
    #[test]
    fn test_failures() {
        // Sanity check: 119 ('w') and 120 ('x') on their own are valid
        let valid_bytes: &[u8] = &[119, 0, 0, 120, 0, 0];
        assert!(CharULE::parse_bytes_to_slice(valid_bytes).is_ok());

        // 119 and 120 are valid, but not 0xD800 (high surrogate)
        let surrogate_bytes: &[u8] = &[119, 0, 0, 0x00, 0xD8, 0x00, 120, 0, 0];
        assert!(CharULE::parse_bytes_to_slice(surrogate_bytes).is_err());

        // 0x20FFFF is out of range for a char
        let out_of_range_bytes: &[u8] = &[0xFF, 0xFF, 0x20];
        assert!(CharULE::parse_bytes_to_slice(out_of_range_bytes).is_err());

        // A byte length that is not a multiple of 3 is rejected even if the prefix is valid
        let wrong_length_bytes: &[u8] = &[119, 0, 0, 0];
        assert!(CharULE::parse_bytes_to_slice(wrong_length_bytes).is_err());
    }
}