// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4 
5 use crate::asciibyte::AsciiByte;
6 
7 /// Internal helper struct that performs operations on aligned integers.
8 /// Supports strings up to 4 bytes long.
9 #[repr(transparent)]
10 pub struct Aligned4(u32);
11 
12 impl Aligned4 {
13     /// # Panics
14     /// Panics if N is greater than 4
15     #[inline]
from_utf8<const N: usize>(src: &[u8; N]) -> Self16     pub const fn from_utf8<const N: usize>(src: &[u8; N]) -> Self {
17         let mut bytes = [0; 4];
18         let mut i = 0;
19         // The function documentation defines when panics may occur
20         #[allow(clippy::indexing_slicing)]
21         while i < N {
22             bytes[i] = src[i];
23             i += 1;
24         }
25         Self(u32::from_ne_bytes(bytes))
26     }
27 
28     #[inline]
from_ascii_bytes<const N: usize>(src: &[AsciiByte; N]) -> Self29     pub const fn from_ascii_bytes<const N: usize>(src: &[AsciiByte; N]) -> Self {
30         Self::from_utf8::<N>(unsafe { core::mem::transmute::<&[AsciiByte; N], &[u8; N]>(src) })
31     }
32 
33     #[inline]
to_bytes(&self) -> [u8; 4]34     pub const fn to_bytes(&self) -> [u8; 4] {
35         self.0.to_ne_bytes()
36     }
37 
38     #[inline]
to_ascii_bytes(&self) -> [AsciiByte; 4]39     pub const fn to_ascii_bytes(&self) -> [AsciiByte; 4] {
40         unsafe { core::mem::transmute(self.to_bytes()) }
41     }
42 
len(&self) -> usize43     pub const fn len(&self) -> usize {
44         let word = self.0;
45         #[cfg(target_endian = "little")]
46         let len = (4 - word.leading_zeros() / 8) as usize;
47         #[cfg(target_endian = "big")]
48         let len = (4 - word.trailing_zeros() / 8) as usize;
49         len
50     }
51 
is_ascii_alphabetic(&self) -> bool52     pub const fn is_ascii_alphabetic(&self) -> bool {
53         let word = self.0;
54         // Each of the following bitmasks set *the high bit* (0x8) to 0 for valid and 1 for invalid.
55         // `mask` sets all NUL bytes to 0.
56         let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
57         // `lower` converts the string to lowercase. It may also change the value of non-alpha
58         // characters, but this does not matter for the alphabetic test that follows.
59         let lower = word | 0x2020_2020;
60         // `alpha` sets all alphabetic bytes to 0. We only need check for lowercase characters.
61         let alpha = !(lower + 0x1f1f_1f1f) | (lower + 0x0505_0505);
62         // The overall string is valid if every character passes at least one test.
63         // We performed two tests here: non-NUL (`mask`) and alphabetic (`alpha`).
64         (alpha & mask) == 0
65     }
66 
is_ascii_alphanumeric(&self) -> bool67     pub const fn is_ascii_alphanumeric(&self) -> bool {
68         let word = self.0;
69         // See explanatory comments in is_ascii_alphabetic
70         let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
71         let numeric = !(word + 0x5050_5050) | (word + 0x4646_4646);
72         let lower = word | 0x2020_2020;
73         let alpha = !(lower + 0x1f1f_1f1f) | (lower + 0x0505_0505);
74         (alpha & numeric & mask) == 0
75     }
76 
is_ascii_numeric(&self) -> bool77     pub const fn is_ascii_numeric(&self) -> bool {
78         let word = self.0;
79         // See explanatory comments in is_ascii_alphabetic
80         let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
81         let numeric = !(word + 0x5050_5050) | (word + 0x4646_4646);
82         (numeric & mask) == 0
83     }
84 
is_ascii_lowercase(&self) -> bool85     pub const fn is_ascii_lowercase(&self) -> bool {
86         let word = self.0;
87         // For efficiency, this function tests for an invalid string rather than a valid string.
88         // A string is ASCII lowercase iff it contains no uppercase ASCII characters.
89         // `invalid_case` sets all uppercase ASCII characters to 0 and all others to 1.
90         let invalid_case = !(word + 0x3f3f_3f3f) | (word + 0x2525_2525);
91         // The string is valid if it contains no invalid characters (if all high bits are 1).
92         (invalid_case & 0x8080_8080) == 0x8080_8080
93     }
94 
is_ascii_titlecase(&self) -> bool95     pub const fn is_ascii_titlecase(&self) -> bool {
96         let word = self.0;
97         // See explanatory comments in is_ascii_lowercase
98         let invalid_case = if cfg!(target_endian = "little") {
99             !(word + 0x3f3f_3f1f) | (word + 0x2525_2505)
100         } else {
101             !(word + 0x1f3f_3f3f) | (word + 0x0525_2525)
102         };
103         (invalid_case & 0x8080_8080) == 0x8080_8080
104     }
105 
is_ascii_uppercase(&self) -> bool106     pub const fn is_ascii_uppercase(&self) -> bool {
107         let word = self.0;
108         // See explanatory comments in is_ascii_lowercase
109         let invalid_case = !(word + 0x1f1f_1f1f) | (word + 0x0505_0505);
110         (invalid_case & 0x8080_8080) == 0x8080_8080
111     }
112 
is_ascii_alphabetic_lowercase(&self) -> bool113     pub const fn is_ascii_alphabetic_lowercase(&self) -> bool {
114         let word = self.0;
115         // `mask` sets all NUL bytes to 0.
116         let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
117         // `lower_alpha` sets all lowercase ASCII characters to 0 and all others to 1.
118         let lower_alpha = !(word + 0x1f1f_1f1f) | (word + 0x0505_0505);
119         // The overall string is valid if every character passes at least one test.
120         // We performed two tests here: non-NUL (`mask`) and lowercase ASCII character (`alpha`).
121         (lower_alpha & mask) == 0
122     }
123 
is_ascii_alphabetic_titlecase(&self) -> bool124     pub const fn is_ascii_alphabetic_titlecase(&self) -> bool {
125         let word = self.0;
126         // See explanatory comments in is_ascii_alphabetic_lowercase
127         let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
128         let title_case = if cfg!(target_endian = "little") {
129             !(word + 0x1f1f_1f3f) | (word + 0x0505_0525)
130         } else {
131             !(word + 0x3f1f_1f1f) | (word + 0x2505_0505)
132         };
133         (title_case & mask) == 0
134     }
135 
is_ascii_alphabetic_uppercase(&self) -> bool136     pub const fn is_ascii_alphabetic_uppercase(&self) -> bool {
137         let word = self.0;
138         // See explanatory comments in is_ascii_alphabetic_lowercase
139         let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
140         let upper_alpha = !(word + 0x3f3f_3f3f) | (word + 0x2525_2525);
141         (upper_alpha & mask) == 0
142     }
143 
to_ascii_lowercase(&self) -> Self144     pub const fn to_ascii_lowercase(&self) -> Self {
145         let word = self.0;
146         let result = word | (((word + 0x3f3f_3f3f) & !(word + 0x2525_2525) & 0x8080_8080) >> 2);
147         Self(result)
148     }
149 
to_ascii_titlecase(&self) -> Self150     pub const fn to_ascii_titlecase(&self) -> Self {
151         let word = self.0.to_le();
152         let mask = ((word + 0x3f3f_3f1f) & !(word + 0x2525_2505) & 0x8080_8080) >> 2;
153         let result = (word | mask) & !(0x20 & mask);
154         Self(u32::from_le(result))
155     }
156 
to_ascii_uppercase(&self) -> Self157     pub const fn to_ascii_uppercase(&self) -> Self {
158         let word = self.0;
159         let result = word & !(((word + 0x1f1f_1f1f) & !(word + 0x0505_0505) & 0x8080_8080) >> 2);
160         Self(result)
161     }
162 }
163 
164 /// Internal helper struct that performs operations on aligned integers.
165 /// Supports strings up to 8 bytes long.
166 #[repr(transparent)]
167 pub struct Aligned8(u64);
168 
169 impl Aligned8 {
170     /// # Panics
171     /// Panics if N is greater than 8
172     #[inline]
from_utf8<const N: usize>(src: &[u8; N]) -> Self173     pub const fn from_utf8<const N: usize>(src: &[u8; N]) -> Self {
174         let mut bytes = [0; 8];
175         let mut i = 0;
176         // The function documentation defines when panics may occur
177         #[allow(clippy::indexing_slicing)]
178         while i < N {
179             bytes[i] = src[i];
180             i += 1;
181         }
182         Self(u64::from_ne_bytes(bytes))
183     }
184 
185     #[inline]
from_ascii_bytes<const N: usize>(src: &[AsciiByte; N]) -> Self186     pub const fn from_ascii_bytes<const N: usize>(src: &[AsciiByte; N]) -> Self {
187         Self::from_utf8::<N>(unsafe { core::mem::transmute::<&[AsciiByte; N], &[u8; N]>(src) })
188     }
189 
190     #[inline]
to_bytes(&self) -> [u8; 8]191     pub const fn to_bytes(&self) -> [u8; 8] {
192         self.0.to_ne_bytes()
193     }
194 
195     #[inline]
to_ascii_bytes(&self) -> [AsciiByte; 8]196     pub const fn to_ascii_bytes(&self) -> [AsciiByte; 8] {
197         unsafe { core::mem::transmute(self.to_bytes()) }
198     }
199 
len(&self) -> usize200     pub const fn len(&self) -> usize {
201         let word = self.0;
202         #[cfg(target_endian = "little")]
203         let len = (8 - word.leading_zeros() / 8) as usize;
204         #[cfg(target_endian = "big")]
205         let len = (8 - word.trailing_zeros() / 8) as usize;
206         len
207     }
208 
is_ascii_alphabetic(&self) -> bool209     pub const fn is_ascii_alphabetic(&self) -> bool {
210         let word = self.0;
211         let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
212         let lower = word | 0x2020_2020_2020_2020;
213         let alpha = !(lower + 0x1f1f_1f1f_1f1f_1f1f) | (lower + 0x0505_0505_0505_0505);
214         (alpha & mask) == 0
215     }
216 
is_ascii_alphanumeric(&self) -> bool217     pub const fn is_ascii_alphanumeric(&self) -> bool {
218         let word = self.0;
219         let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
220         let numeric = !(word + 0x5050_5050_5050_5050) | (word + 0x4646_4646_4646_4646);
221         let lower = word | 0x2020_2020_2020_2020;
222         let alpha = !(lower + 0x1f1f_1f1f_1f1f_1f1f) | (lower + 0x0505_0505_0505_0505);
223         (alpha & numeric & mask) == 0
224     }
225 
is_ascii_numeric(&self) -> bool226     pub const fn is_ascii_numeric(&self) -> bool {
227         let word = self.0;
228         let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
229         let numeric = !(word + 0x5050_5050_5050_5050) | (word + 0x4646_4646_4646_4646);
230         (numeric & mask) == 0
231     }
232 
is_ascii_lowercase(&self) -> bool233     pub const fn is_ascii_lowercase(&self) -> bool {
234         let word = self.0;
235         let invalid_case = !(word + 0x3f3f_3f3f_3f3f_3f3f) | (word + 0x2525_2525_2525_2525);
236         (invalid_case & 0x8080_8080_8080_8080) == 0x8080_8080_8080_8080
237     }
238 
is_ascii_titlecase(&self) -> bool239     pub const fn is_ascii_titlecase(&self) -> bool {
240         let word = self.0;
241         let invalid_case = if cfg!(target_endian = "little") {
242             !(word + 0x3f3f_3f3f_3f3f_3f1f) | (word + 0x2525_2525_2525_2505)
243         } else {
244             !(word + 0x1f3f_3f3f_3f3f_3f3f) | (word + 0x0525_2525_2525_2525)
245         };
246         (invalid_case & 0x8080_8080_8080_8080) == 0x8080_8080_8080_8080
247     }
248 
is_ascii_uppercase(&self) -> bool249     pub const fn is_ascii_uppercase(&self) -> bool {
250         let word = self.0;
251         let invalid_case = !(word + 0x1f1f_1f1f_1f1f_1f1f) | (word + 0x0505_0505_0505_0505);
252         (invalid_case & 0x8080_8080_8080_8080) == 0x8080_8080_8080_8080
253     }
254 
is_ascii_alphabetic_lowercase(&self) -> bool255     pub const fn is_ascii_alphabetic_lowercase(&self) -> bool {
256         let word = self.0;
257         // `mask` sets all NUL bytes to 0.
258         let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
259         // `lower_alpha` sets all lowercase ASCII characters to 0 and all others to 1.
260         let lower_alpha = !(word + 0x1f1f_1f1f_1f1f_1f1f) | (word + 0x0505_0505_0505_0505);
261         // The overall string is valid if every character passes at least one test.
262         // We performed two tests here: non-NUL (`mask`) and lowercase ASCII character (`alpha`).
263         (lower_alpha & mask) == 0
264     }
265 
is_ascii_alphabetic_titlecase(&self) -> bool266     pub const fn is_ascii_alphabetic_titlecase(&self) -> bool {
267         let word = self.0;
268         // See explanatory comments in is_ascii_alphabetic_lowercase
269         let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
270         let title_case = if cfg!(target_endian = "little") {
271             !(word + 0x1f1f_1f1f_1f1f_1f3f) | (word + 0x0505_0505_0505_0525)
272         } else {
273             !(word + 0x3f1f_1f1f_1f1f_1f1f) | (word + 0x2505_0505_0505_0505)
274         };
275         (title_case & mask) == 0
276     }
277 
is_ascii_alphabetic_uppercase(&self) -> bool278     pub const fn is_ascii_alphabetic_uppercase(&self) -> bool {
279         let word = self.0;
280         // See explanatory comments in is_ascii_alphabetic_lowercase
281         let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
282         let upper_alpha = !(word + 0x3f3f_3f3f_3f3f_3f3f) | (word + 0x2525_2525_2525_2525);
283         (upper_alpha & mask) == 0
284     }
285 
to_ascii_lowercase(&self) -> Self286     pub const fn to_ascii_lowercase(&self) -> Self {
287         let word = self.0;
288         let result = word
289             | (((word + 0x3f3f_3f3f_3f3f_3f3f)
290                 & !(word + 0x2525_2525_2525_2525)
291                 & 0x8080_8080_8080_8080)
292                 >> 2);
293         Self(result)
294     }
295 
to_ascii_titlecase(&self) -> Self296     pub const fn to_ascii_titlecase(&self) -> Self {
297         let word = self.0.to_le();
298         let mask = ((word + 0x3f3f_3f3f_3f3f_3f1f)
299             & !(word + 0x2525_2525_2525_2505)
300             & 0x8080_8080_8080_8080)
301             >> 2;
302         let result = (word | mask) & !(0x20 & mask);
303         Self(u64::from_le(result))
304     }
305 
to_ascii_uppercase(&self) -> Self306     pub const fn to_ascii_uppercase(&self) -> Self {
307         let word = self.0;
308         let result = word
309             & !(((word + 0x1f1f_1f1f_1f1f_1f1f)
310                 & !(word + 0x0505_0505_0505_0505)
311                 & 0x8080_8080_8080_8080)
312                 >> 2);
313         Self(result)
314     }
315 }
316