1 // This file is part of ICU4X. For terms of use, please see the file 2 // called LICENSE at the top level of the ICU4X source tree 3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 4 5 use crate::asciibyte::AsciiByte; 6 7 /// Internal helper struct that performs operations on aligned integers. 8 /// Supports strings up to 4 bytes long. 9 #[repr(transparent)] 10 pub struct Aligned4(u32); 11 12 impl Aligned4 { 13 /// # Panics 14 /// Panics if N is greater than 4 15 #[inline] from_utf8<const N: usize>(src: &[u8; N]) -> Self16 pub const fn from_utf8<const N: usize>(src: &[u8; N]) -> Self { 17 let mut bytes = [0; 4]; 18 let mut i = 0; 19 // The function documentation defines when panics may occur 20 #[allow(clippy::indexing_slicing)] 21 while i < N { 22 bytes[i] = src[i]; 23 i += 1; 24 } 25 Self(u32::from_ne_bytes(bytes)) 26 } 27 28 #[inline] from_ascii_bytes<const N: usize>(src: &[AsciiByte; N]) -> Self29 pub const fn from_ascii_bytes<const N: usize>(src: &[AsciiByte; N]) -> Self { 30 Self::from_utf8::<N>(unsafe { core::mem::transmute::<&[AsciiByte; N], &[u8; N]>(src) }) 31 } 32 33 #[inline] to_bytes(&self) -> [u8; 4]34 pub const fn to_bytes(&self) -> [u8; 4] { 35 self.0.to_ne_bytes() 36 } 37 38 #[inline] to_ascii_bytes(&self) -> [AsciiByte; 4]39 pub const fn to_ascii_bytes(&self) -> [AsciiByte; 4] { 40 unsafe { core::mem::transmute(self.to_bytes()) } 41 } 42 len(&self) -> usize43 pub const fn len(&self) -> usize { 44 let word = self.0; 45 #[cfg(target_endian = "little")] 46 let len = (4 - word.leading_zeros() / 8) as usize; 47 #[cfg(target_endian = "big")] 48 let len = (4 - word.trailing_zeros() / 8) as usize; 49 len 50 } 51 is_ascii_alphabetic(&self) -> bool52 pub const fn is_ascii_alphabetic(&self) -> bool { 53 let word = self.0; 54 // Each of the following bitmasks set *the high bit* (0x8) to 0 for valid and 1 for invalid. 55 // `mask` sets all NUL bytes to 0. 56 let mask = (word + 0x7f7f_7f7f) & 0x8080_8080; 57 // `lower` converts the string to lowercase. It may also change the value of non-alpha 58 // characters, but this does not matter for the alphabetic test that follows. 59 let lower = word | 0x2020_2020; 60 // `alpha` sets all alphabetic bytes to 0. We only need check for lowercase characters. 61 let alpha = !(lower + 0x1f1f_1f1f) | (lower + 0x0505_0505); 62 // The overall string is valid if every character passes at least one test. 63 // We performed two tests here: non-NUL (`mask`) and alphabetic (`alpha`). 64 (alpha & mask) == 0 65 } 66 is_ascii_alphanumeric(&self) -> bool67 pub const fn is_ascii_alphanumeric(&self) -> bool { 68 let word = self.0; 69 // See explanatory comments in is_ascii_alphabetic 70 let mask = (word + 0x7f7f_7f7f) & 0x8080_8080; 71 let numeric = !(word + 0x5050_5050) | (word + 0x4646_4646); 72 let lower = word | 0x2020_2020; 73 let alpha = !(lower + 0x1f1f_1f1f) | (lower + 0x0505_0505); 74 (alpha & numeric & mask) == 0 75 } 76 is_ascii_numeric(&self) -> bool77 pub const fn is_ascii_numeric(&self) -> bool { 78 let word = self.0; 79 // See explanatory comments in is_ascii_alphabetic 80 let mask = (word + 0x7f7f_7f7f) & 0x8080_8080; 81 let numeric = !(word + 0x5050_5050) | (word + 0x4646_4646); 82 (numeric & mask) == 0 83 } 84 is_ascii_lowercase(&self) -> bool85 pub const fn is_ascii_lowercase(&self) -> bool { 86 let word = self.0; 87 // For efficiency, this function tests for an invalid string rather than a valid string. 88 // A string is ASCII lowercase iff it contains no uppercase ASCII characters. 89 // `invalid_case` sets all uppercase ASCII characters to 0 and all others to 1. 90 let invalid_case = !(word + 0x3f3f_3f3f) | (word + 0x2525_2525); 91 // The string is valid if it contains no invalid characters (if all high bits are 1). 92 (invalid_case & 0x8080_8080) == 0x8080_8080 93 } 94 is_ascii_titlecase(&self) -> bool95 pub const fn is_ascii_titlecase(&self) -> bool { 96 let word = self.0; 97 // See explanatory comments in is_ascii_lowercase 98 let invalid_case = if cfg!(target_endian = "little") { 99 !(word + 0x3f3f_3f1f) | (word + 0x2525_2505) 100 } else { 101 !(word + 0x1f3f_3f3f) | (word + 0x0525_2525) 102 }; 103 (invalid_case & 0x8080_8080) == 0x8080_8080 104 } 105 is_ascii_uppercase(&self) -> bool106 pub const fn is_ascii_uppercase(&self) -> bool { 107 let word = self.0; 108 // See explanatory comments in is_ascii_lowercase 109 let invalid_case = !(word + 0x1f1f_1f1f) | (word + 0x0505_0505); 110 (invalid_case & 0x8080_8080) == 0x8080_8080 111 } 112 is_ascii_alphabetic_lowercase(&self) -> bool113 pub const fn is_ascii_alphabetic_lowercase(&self) -> bool { 114 let word = self.0; 115 // `mask` sets all NUL bytes to 0. 116 let mask = (word + 0x7f7f_7f7f) & 0x8080_8080; 117 // `lower_alpha` sets all lowercase ASCII characters to 0 and all others to 1. 118 let lower_alpha = !(word + 0x1f1f_1f1f) | (word + 0x0505_0505); 119 // The overall string is valid if every character passes at least one test. 120 // We performed two tests here: non-NUL (`mask`) and lowercase ASCII character (`alpha`). 121 (lower_alpha & mask) == 0 122 } 123 is_ascii_alphabetic_titlecase(&self) -> bool124 pub const fn is_ascii_alphabetic_titlecase(&self) -> bool { 125 let word = self.0; 126 // See explanatory comments in is_ascii_alphabetic_lowercase 127 let mask = (word + 0x7f7f_7f7f) & 0x8080_8080; 128 let title_case = if cfg!(target_endian = "little") { 129 !(word + 0x1f1f_1f3f) | (word + 0x0505_0525) 130 } else { 131 !(word + 0x3f1f_1f1f) | (word + 0x2505_0505) 132 }; 133 (title_case & mask) == 0 134 } 135 is_ascii_alphabetic_uppercase(&self) -> bool136 pub const fn is_ascii_alphabetic_uppercase(&self) -> bool { 137 let word = self.0; 138 // See explanatory comments in is_ascii_alphabetic_lowercase 139 let mask = (word + 0x7f7f_7f7f) & 0x8080_8080; 140 let upper_alpha = !(word + 0x3f3f_3f3f) | (word + 0x2525_2525); 141 (upper_alpha & mask) == 0 142 } 143 to_ascii_lowercase(&self) -> Self144 pub const fn to_ascii_lowercase(&self) -> Self { 145 let word = self.0; 146 let result = word | (((word + 0x3f3f_3f3f) & !(word + 0x2525_2525) & 0x8080_8080) >> 2); 147 Self(result) 148 } 149 to_ascii_titlecase(&self) -> Self150 pub const fn to_ascii_titlecase(&self) -> Self { 151 let word = self.0.to_le(); 152 let mask = ((word + 0x3f3f_3f1f) & !(word + 0x2525_2505) & 0x8080_8080) >> 2; 153 let result = (word | mask) & !(0x20 & mask); 154 Self(u32::from_le(result)) 155 } 156 to_ascii_uppercase(&self) -> Self157 pub const fn to_ascii_uppercase(&self) -> Self { 158 let word = self.0; 159 let result = word & !(((word + 0x1f1f_1f1f) & !(word + 0x0505_0505) & 0x8080_8080) >> 2); 160 Self(result) 161 } 162 } 163 164 /// Internal helper struct that performs operations on aligned integers. 165 /// Supports strings up to 8 bytes long. 166 #[repr(transparent)] 167 pub struct Aligned8(u64); 168 169 impl Aligned8 { 170 /// # Panics 171 /// Panics if N is greater than 8 172 #[inline] from_utf8<const N: usize>(src: &[u8; N]) -> Self173 pub const fn from_utf8<const N: usize>(src: &[u8; N]) -> Self { 174 let mut bytes = [0; 8]; 175 let mut i = 0; 176 // The function documentation defines when panics may occur 177 #[allow(clippy::indexing_slicing)] 178 while i < N { 179 bytes[i] = src[i]; 180 i += 1; 181 } 182 Self(u64::from_ne_bytes(bytes)) 183 } 184 185 #[inline] from_ascii_bytes<const N: usize>(src: &[AsciiByte; N]) -> Self186 pub const fn from_ascii_bytes<const N: usize>(src: &[AsciiByte; N]) -> Self { 187 Self::from_utf8::<N>(unsafe { core::mem::transmute::<&[AsciiByte; N], &[u8; N]>(src) }) 188 } 189 190 #[inline] to_bytes(&self) -> [u8; 8]191 pub const fn to_bytes(&self) -> [u8; 8] { 192 self.0.to_ne_bytes() 193 } 194 195 #[inline] to_ascii_bytes(&self) -> [AsciiByte; 8]196 pub const fn to_ascii_bytes(&self) -> [AsciiByte; 8] { 197 unsafe { core::mem::transmute(self.to_bytes()) } 198 } 199 len(&self) -> usize200 pub const fn len(&self) -> usize { 201 let word = self.0; 202 #[cfg(target_endian = "little")] 203 let len = (8 - word.leading_zeros() / 8) as usize; 204 #[cfg(target_endian = "big")] 205 let len = (8 - word.trailing_zeros() / 8) as usize; 206 len 207 } 208 is_ascii_alphabetic(&self) -> bool209 pub const fn is_ascii_alphabetic(&self) -> bool { 210 let word = self.0; 211 let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080; 212 let lower = word | 0x2020_2020_2020_2020; 213 let alpha = !(lower + 0x1f1f_1f1f_1f1f_1f1f) | (lower + 0x0505_0505_0505_0505); 214 (alpha & mask) == 0 215 } 216 is_ascii_alphanumeric(&self) -> bool217 pub const fn is_ascii_alphanumeric(&self) -> bool { 218 let word = self.0; 219 let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080; 220 let numeric = !(word + 0x5050_5050_5050_5050) | (word + 0x4646_4646_4646_4646); 221 let lower = word | 0x2020_2020_2020_2020; 222 let alpha = !(lower + 0x1f1f_1f1f_1f1f_1f1f) | (lower + 0x0505_0505_0505_0505); 223 (alpha & numeric & mask) == 0 224 } 225 is_ascii_numeric(&self) -> bool226 pub const fn is_ascii_numeric(&self) -> bool { 227 let word = self.0; 228 let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080; 229 let numeric = !(word + 0x5050_5050_5050_5050) | (word + 0x4646_4646_4646_4646); 230 (numeric & mask) == 0 231 } 232 is_ascii_lowercase(&self) -> bool233 pub const fn is_ascii_lowercase(&self) -> bool { 234 let word = self.0; 235 let invalid_case = !(word + 0x3f3f_3f3f_3f3f_3f3f) | (word + 0x2525_2525_2525_2525); 236 (invalid_case & 0x8080_8080_8080_8080) == 0x8080_8080_8080_8080 237 } 238 is_ascii_titlecase(&self) -> bool239 pub const fn is_ascii_titlecase(&self) -> bool { 240 let word = self.0; 241 let invalid_case = if cfg!(target_endian = "little") { 242 !(word + 0x3f3f_3f3f_3f3f_3f1f) | (word + 0x2525_2525_2525_2505) 243 } else { 244 !(word + 0x1f3f_3f3f_3f3f_3f3f) | (word + 0x0525_2525_2525_2525) 245 }; 246 (invalid_case & 0x8080_8080_8080_8080) == 0x8080_8080_8080_8080 247 } 248 is_ascii_uppercase(&self) -> bool249 pub const fn is_ascii_uppercase(&self) -> bool { 250 let word = self.0; 251 let invalid_case = !(word + 0x1f1f_1f1f_1f1f_1f1f) | (word + 0x0505_0505_0505_0505); 252 (invalid_case & 0x8080_8080_8080_8080) == 0x8080_8080_8080_8080 253 } 254 is_ascii_alphabetic_lowercase(&self) -> bool255 pub const fn is_ascii_alphabetic_lowercase(&self) -> bool { 256 let word = self.0; 257 // `mask` sets all NUL bytes to 0. 258 let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080; 259 // `lower_alpha` sets all lowercase ASCII characters to 0 and all others to 1. 260 let lower_alpha = !(word + 0x1f1f_1f1f_1f1f_1f1f) | (word + 0x0505_0505_0505_0505); 261 // The overall string is valid if every character passes at least one test. 262 // We performed two tests here: non-NUL (`mask`) and lowercase ASCII character (`alpha`). 263 (lower_alpha & mask) == 0 264 } 265 is_ascii_alphabetic_titlecase(&self) -> bool266 pub const fn is_ascii_alphabetic_titlecase(&self) -> bool { 267 let word = self.0; 268 // See explanatory comments in is_ascii_alphabetic_lowercase 269 let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080; 270 let title_case = if cfg!(target_endian = "little") { 271 !(word + 0x1f1f_1f1f_1f1f_1f3f) | (word + 0x0505_0505_0505_0525) 272 } else { 273 !(word + 0x3f1f_1f1f_1f1f_1f1f) | (word + 0x2505_0505_0505_0505) 274 }; 275 (title_case & mask) == 0 276 } 277 is_ascii_alphabetic_uppercase(&self) -> bool278 pub const fn is_ascii_alphabetic_uppercase(&self) -> bool { 279 let word = self.0; 280 // See explanatory comments in is_ascii_alphabetic_lowercase 281 let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080; 282 let upper_alpha = !(word + 0x3f3f_3f3f_3f3f_3f3f) | (word + 0x2525_2525_2525_2525); 283 (upper_alpha & mask) == 0 284 } 285 to_ascii_lowercase(&self) -> Self286 pub const fn to_ascii_lowercase(&self) -> Self { 287 let word = self.0; 288 let result = word 289 | (((word + 0x3f3f_3f3f_3f3f_3f3f) 290 & !(word + 0x2525_2525_2525_2525) 291 & 0x8080_8080_8080_8080) 292 >> 2); 293 Self(result) 294 } 295 to_ascii_titlecase(&self) -> Self296 pub const fn to_ascii_titlecase(&self) -> Self { 297 let word = self.0.to_le(); 298 let mask = ((word + 0x3f3f_3f3f_3f3f_3f1f) 299 & !(word + 0x2525_2525_2525_2505) 300 & 0x8080_8080_8080_8080) 301 >> 2; 302 let result = (word | mask) & !(0x20 & mask); 303 Self(u64::from_le(result)) 304 } 305 to_ascii_uppercase(&self) -> Self306 pub const fn to_ascii_uppercase(&self) -> Self { 307 let word = self.0; 308 let result = word 309 & !(((word + 0x1f1f_1f1f_1f1f_1f1f) 310 & !(word + 0x0505_0505_0505_0505) 311 & 0x8080_8080_8080_8080) 312 >> 2); 313 Self(result) 314 } 315 } 316