1 // This file is part of ICU4X. For terms of use, please see the file 2 // called LICENSE at the top level of the ICU4X source tree 3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 4 5 //! The primary per-codepoint casefolding data 6 7 #[cfg(feature = "datagen")] 8 use alloc::collections::BTreeMap; 9 use core::num::TryFromIntError; 10 use icu_collections::codepointtrie::TrieValue; 11 use zerovec::ule::{AsULE, RawBytesULE, UleError, ULE}; 12 13 /// The case of a Unicode character 14 /// 15 /// <div class="stab unstable"> 16 /// This code is considered unstable; it may change at any time, in breaking or non-breaking ways, 17 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed 18 /// to be stable, their Rust representation might not be. Use with caution. 19 /// </div> 20 #[derive(Copy, Clone, Debug, Eq, PartialEq)] 21 #[cfg_attr(feature = "serde", derive(serde::Deserialize))] 22 #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] 23 #[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider::data))] 24 pub enum CaseType { 25 /// Lowercase letter 26 Lower = 1, 27 /// Uppercase letter 28 Upper = 2, 29 /// Titlecase letter 30 Title = 3, 31 } 32 33 impl CaseType { 34 pub(crate) const CASE_MASK: u16 = 0x3; 35 36 // The casetype is stored in the codepoint trie as two bits. 37 // After masking them to get a value between 0 and 3, this 38 // function converts to `CaseType`. 39 // 40 // Returns `None` for uncased 41 #[inline] from_masked_bits(b: u16) -> Option<Self>42 pub(crate) fn from_masked_bits(b: u16) -> Option<Self> { 43 debug_assert!(b & Self::CASE_MASK == b); 44 match b { 45 0 => None, 46 1 => Some(CaseType::Lower), 47 2 => Some(CaseType::Upper), 48 _ => Some(CaseType::Title), 49 } 50 } 51 } 52 53 /// The dot type of a Unicode character. This indicates how dotted 54 /// letters (like `i` and `j`) combine with accents placed above the 55 /// letter. 56 /// 57 /// <div class="stab unstable"> 58 /// This code is considered unstable; it may change at any time, in breaking or non-breaking ways, 59 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed 60 /// to be stable, their Rust representation might not be. Use with caution. 61 /// </div> 62 #[derive(Copy, Clone, Debug, Eq, PartialEq)] 63 #[cfg_attr(feature = "serde", derive(serde::Deserialize))] 64 #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] 65 #[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider::data))] 66 #[derive(Default)] 67 pub enum DotType { 68 /// Normal characters with combining class 0 69 #[default] 70 NoDot = 0, 71 /// Soft-dotted characters with combining class 0 72 SoftDotted = 1, 73 /// "Above" accents with combining class 230 74 Above = 2, 75 /// Other accent characters 76 OtherAccent = 3, 77 } 78 79 impl DotType { 80 pub(crate) const DOT_MASK: u16 = 0x3; 81 82 // The dot type is stored in either the codepoint trie or the 83 // exception table as two bits. After shifting and masking them 84 // to get a value between 0 and 3, this function converts to 85 // DotType. 86 #[inline] from_masked_bits(b: u16) -> Self87 pub(crate) fn from_masked_bits(b: u16) -> Self { 88 debug_assert!(b & Self::DOT_MASK == b); 89 match b { 90 0 => DotType::NoDot, 91 1 => DotType::SoftDotted, 92 2 => DotType::Above, 93 _ => DotType::OtherAccent, 94 } 95 } 96 } 97 98 #[derive(Copy, Clone, Debug, Eq, PartialEq)] 99 pub(crate) enum MappingKind { 100 Lower = 0, 101 Fold = 1, 102 Upper = 2, 103 Title = 3, 104 } 105 106 /// Case mapping data associated with a single code point 107 /// 108 /// <div class="stab unstable"> 109 /// This code is considered unstable; it may change at any time, in breaking or non-breaking ways, 110 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed 111 /// to be stable, their Rust representation might not be. Use with caution. 112 /// </div> 113 #[derive(Copy, Clone, Debug, Eq, PartialEq)] 114 #[cfg_attr(feature = "serde", derive(serde::Deserialize))] 115 #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] 116 #[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider::data))] 117 pub struct CaseMapData { 118 /// Whether this is default-ignoreable 119 pub ignoreable: bool, 120 /// The rest of the case mapping data 121 pub kind: CaseMapDataKind, 122 } 123 124 /// A subset of case mapping data associated with a single code point 125 /// 126 /// <div class="stab unstable"> 127 /// This code is considered unstable; it may change at any time, in breaking or non-breaking ways, 128 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed 129 /// to be stable, their Rust representation might not be. Use with caution. 130 /// </div> 131 #[cfg_attr(feature = "serde", derive(serde::Deserialize))] 132 #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] 133 #[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider::data))] 134 #[derive(Copy, Clone, Debug, Eq, PartialEq)] 135 pub enum CaseMapDataKind { 136 /// This code point is an exception. Provides the case type of its own case 137 /// and the exception index stored in [`CaseMapExceptions`] 138 /// 139 /// [`CaseMapExceptions`]: crate::provider::exceptions::CaseMapExceptions 140 Exception(Option<CaseType>, u16), 141 /// This code point is uncased, and has the following extra data 142 Uncased(NonExceptionData), 143 /// This code point is cased. We store the extra data, its case type, and a *delta* 144 /// that can be used to get its casemapped codepoint. 145 Delta(NonExceptionData, CaseType, i16), 146 } 147 148 /// Data that is stored in CaseMapData when it is *not* an exception 149 /// 150 /// <div class="stab unstable"> 151 /// This code is considered unstable; it may change at any time, in breaking or non-breaking ways, 152 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed 153 /// to be stable, their Rust representation might not be. Use with caution. 154 /// </div> 155 #[cfg_attr(feature = "serde", derive(serde::Deserialize))] 156 #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] 157 #[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider::data))] 158 #[derive(Copy, Clone, Debug, Eq, PartialEq)] 159 pub struct NonExceptionData { 160 /// Whether or not the type is case-sensitive 161 pub sensitive: bool, 162 /// The "dot type" 163 pub dot_type: DotType, 164 } 165 166 impl CaseMapData { 167 #[inline] case_type(self) -> Option<CaseType>168 pub(crate) fn case_type(self) -> Option<CaseType> { 169 match self.kind { 170 CaseMapDataKind::Exception(case_type, ..) => case_type, 171 CaseMapDataKind::Delta(_, case_type, _) => Some(case_type), 172 CaseMapDataKind::Uncased(..) => None, 173 } 174 } 175 176 #[inline] is_upper_or_title(self) -> bool177 pub(crate) fn is_upper_or_title(self) -> bool { 178 match self.case_type() { 179 None | Some(CaseType::Lower) => false, 180 Some(CaseType::Upper) | Some(CaseType::Title) => true, 181 } 182 } 183 184 #[inline] is_relevant_to(self, kind: MappingKind) -> bool185 pub(crate) fn is_relevant_to(self, kind: MappingKind) -> bool { 186 match kind { 187 MappingKind::Lower | MappingKind::Fold => self.is_upper_or_title(), 188 MappingKind::Upper | MappingKind::Title => self.case_type() == Some(CaseType::Lower), 189 } 190 } 191 192 #[inline] is_ignorable(self) -> bool193 pub(crate) fn is_ignorable(self) -> bool { 194 self.ignoreable 195 } 196 197 #[inline] has_exception(self) -> bool198 pub(crate) fn has_exception(self) -> bool { 199 matches!(self.kind, CaseMapDataKind::Exception(..)) 200 } 201 202 // Returns true if this code point is case-sensitive. 203 // only in the non-exception case 204 // This is not currently exposed. 205 #[inline] is_sensitive(self) -> bool206 pub(crate) fn is_sensitive(self) -> bool { 207 match self.kind { 208 CaseMapDataKind::Exception(..) => false, 209 CaseMapDataKind::Delta(ned, ..) => ned.sensitive, 210 CaseMapDataKind::Uncased(ned) => ned.sensitive, 211 } 212 } 213 214 #[inline] dot_type(self) -> DotType215 pub(crate) fn dot_type(self) -> DotType { 216 match self.kind { 217 CaseMapDataKind::Exception(..) => DotType::NoDot, 218 CaseMapDataKind::Delta(ned, ..) => ned.dot_type, 219 CaseMapDataKind::Uncased(ned) => ned.dot_type, 220 } 221 } 222 223 // The delta between this code point and its upper/lowercase equivalent. 224 // This should only be called for codepoints without exception data. 225 // 226 // Returns 0 for uncased types 227 #[inline] delta(self) -> i16228 pub(crate) fn delta(self) -> i16 { 229 debug_assert!(!self.has_exception()); 230 match self.kind { 231 CaseMapDataKind::Exception(..) => 0, 232 CaseMapDataKind::Delta(.., delta) => delta, 233 CaseMapDataKind::Uncased(..) => 0, 234 } 235 } 236 237 // The index of the exception data for this codepoint in the exception 238 // table. This should only be called for codepoints with exception data. 239 #[inline] exception_index(self) -> u16240 pub(crate) fn exception_index(self) -> u16 { 241 debug_assert!(self.has_exception()); 242 if let CaseMapDataKind::Exception(_, i) = self.kind { 243 i 244 } else { 245 0 246 } 247 } 248 249 // CaseMapExceptionsBuilder moves the full mapping and closure 250 // strings out of the exception table itself. This means that the 251 // exception index for a code point in ICU4X will be different 252 // from the exception index for the same codepoint in ICU4C. Given 253 // a mapping from old to new, this function updates the exception 254 // index if necessary. 255 #[cfg(feature = "datagen")] with_updated_exception(self, updates: &BTreeMap<u16, u16>) -> Self256 pub(crate) fn with_updated_exception(self, updates: &BTreeMap<u16, u16>) -> Self { 257 let kind = if let CaseMapDataKind::Exception(ty, index) = self.kind { 258 if let Some(updated_exception) = updates.get(&index) { 259 CaseMapDataKind::Exception(ty, *updated_exception) 260 } else { 261 self.kind 262 } 263 } else { 264 self.kind 265 }; 266 267 Self { kind, ..self } 268 } 269 270 /// Attempt to construct from ICU-format integer 271 #[cfg(any(feature = "datagen", test))] try_from_icu_integer(int: u16) -> Result<Self, UleError>272 pub(crate) fn try_from_icu_integer(int: u16) -> Result<Self, UleError> { 273 let raw = int.to_unaligned(); 274 CaseMapDataULE::validate_bytes(raw.as_bytes())?; 275 276 let this = Self::from_unaligned(CaseMapDataULE(raw)); 277 Ok(this) 278 } 279 } 280 281 impl TrieValue for CaseMapData { 282 type TryFromU32Error = TryFromIntError; 283 try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error>284 fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> { 285 u16::try_from(i).map(|u| AsULE::from_unaligned(CaseMapDataULE(u.to_unaligned()))) 286 } 287 to_u32(self) -> u32288 fn to_u32(self) -> u32 { 289 u32::from(self.to_unaligned().0.as_unsigned_int()) 290 } 291 } 292 293 /// Packed casemappingdata type 294 /// 295 /// Data format, copied from ICU4C casepropsbuilder.cpp: 296 /// 297 /// ```text 298 /// Trie data word: 299 /// Bits 300 /// if(exception) { 301 /// 15..4 unsigned exception index 302 /// } else { 303 /// if(not uncased) { 304 /// 15..7 signed delta to simple case mapping code point 305 /// (add delta to input code point) 306 /// } else { 307 /// 15..7 reserved, 0 308 /// } 309 /// 6..5 0 normal character with cc=0 310 /// 1 soft-dotted character 311 /// 2 cc=230 312 /// 3 other cc 313 /// The runtime code relies on these two bits to be adjacent with this encoding. 314 /// } 315 /// 4 case-sensitive 316 /// 3 exception 317 /// 2 case-ignorable 318 /// 1..0 0 uncased 319 /// 1 lowercase 320 /// 2 uppercase 321 /// 3 titlecase 322 /// The runtime code relies on the case-ignorable and case type bits 2..0 323 /// to be the lowest bits with this encoding. 324 /// ``` 325 /// 326 /// <div class="stab unstable"> 327 /// This code is considered unstable; it may change at any time, in breaking or non-breaking ways, 328 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed 329 /// to be stable, their Rust representation might not be. Use with caution. 330 /// </div> 331 #[derive(Copy, Clone, Debug, Eq, PartialEq)] 332 #[repr(transparent)] 333 pub struct CaseMapDataULE(RawBytesULE<2>); 334 335 impl CaseMapDataULE { 336 // 1..0 case type 337 const CASE_TYPE_BITS: u16 = 0x3; 338 // 2 case-ignorable 339 const CASE_IGNOREABLE_BIT: u16 = 0x4; 340 // 3 exception 341 const EXCEPTION_BIT: u16 = 0x8; 342 // 4 case-sensitive 343 const CASE_SENSITIVE_BIT: u16 = 0x10; 344 // 15..4 unsigned exception index 345 const EXCEPTION_SHIFT: u16 = 4; 346 // 15..7 signed-delta to simple case mapping code point (or reserved) 347 const DELTA_SHIFT: u16 = 7; 348 // 6..5 dot type 349 const DOT_TYPE_BITS: u16 = 0x60; 350 const DOT_SHIFT: u16 = 5; 351 } 352 353 /// # Safety 354 /// 355 /// Safety checklist for `ULE`: 356 /// 357 /// 1. The type *must not* include any uninitialized or padding bytes: repr(transparent) 358 /// wrapper around ULE type 359 /// 2. The type must have an alignment of 1 byte: repr(transparent) wrapper around ULE type 360 /// 3. The impl of [`ULE::validate_bytes()`] *must* return an error if the given byte slice 361 /// would not represent a valid slice of this type: It does 362 /// 4. The impl of [`ULE::validate_bytes()`] *must* return an error if the given byte slice 363 /// cannot be used in its entirety (if its length is not a multiple of `size_of::<Self>()`): 364 /// it does, due to the RawBytesULE parse call 365 /// 5. All other methods *must* be left with their default impl, or else implemented according to 366 /// their respective safety guidelines: They have been 367 /// 6. The equality invariant is satisfied 368 unsafe impl ULE for CaseMapDataULE { validate_bytes(bytes: &[u8]) -> Result<(), UleError>369 fn validate_bytes(bytes: &[u8]) -> Result<(), UleError> { 370 let sixteens = RawBytesULE::<2>::parse_bytes_to_slice(bytes)?; 371 372 for sixteen in sixteens { 373 let sixteen = sixteen.as_unsigned_int(); 374 // The type has reserved bits in the 375 // uncased + not exception case 376 if sixteen & Self::EXCEPTION_BIT == 0 { 377 // not an exception 378 if sixteen & Self::CASE_TYPE_BITS == 0 { 379 // uncased 380 if sixteen >> Self::DELTA_SHIFT != 0 { 381 // We have some used bits in the reserved zone! 382 return Err(UleError::parse::<Self>()); 383 } 384 } 385 } 386 } 387 Ok(()) 388 } 389 } 390 391 impl AsULE for CaseMapData { 392 type ULE = CaseMapDataULE; 393 from_unaligned(ule: Self::ULE) -> Self394 fn from_unaligned(ule: Self::ULE) -> Self { 395 let sixteen = ule.0.as_unsigned_int(); 396 397 let ignoreable = (sixteen & CaseMapDataULE::CASE_IGNOREABLE_BIT) != 0; 398 let exception = (sixteen & CaseMapDataULE::EXCEPTION_BIT) != 0; 399 400 let case_type = sixteen & CaseMapDataULE::CASE_TYPE_BITS; 401 let case_type = CaseType::from_masked_bits(case_type); 402 let kind = if exception { 403 // No need to mask first since the exception bits start at 15 404 let exception = sixteen >> CaseMapDataULE::EXCEPTION_SHIFT; 405 CaseMapDataKind::Exception(case_type, exception) 406 } else { 407 let dot_type = (sixteen & CaseMapDataULE::DOT_TYPE_BITS) >> CaseMapDataULE::DOT_SHIFT; 408 let dot_type = DotType::from_masked_bits(dot_type); 409 let sensitive = (sixteen & CaseMapDataULE::CASE_SENSITIVE_BIT) != 0; 410 let ned = NonExceptionData { 411 dot_type, 412 sensitive, 413 }; 414 if let Some(case_type) = case_type { 415 // no need to mask first since the delta bits start at 15 416 // We can also cast as i16 first so we do not have to 417 // sign-extend later 418 let delta = (sixteen as i16) >> CaseMapDataULE::DELTA_SHIFT; 419 CaseMapDataKind::Delta(ned, case_type, delta) 420 } else { 421 CaseMapDataKind::Uncased(ned) 422 } 423 }; 424 CaseMapData { ignoreable, kind } 425 } 426 to_unaligned(self) -> Self::ULE427 fn to_unaligned(self) -> Self::ULE { 428 let mut sixteen = 0; 429 if self.ignoreable { 430 sixteen |= CaseMapDataULE::CASE_IGNOREABLE_BIT; 431 } 432 match self.kind { 433 CaseMapDataKind::Exception(case_type, e) => { 434 sixteen |= CaseMapDataULE::EXCEPTION_BIT; 435 sixteen |= e << CaseMapDataULE::EXCEPTION_SHIFT; 436 sixteen |= case_type.map(|c| c as u16).unwrap_or(0); 437 } 438 CaseMapDataKind::Uncased(ned) => { 439 sixteen |= (ned.dot_type as u16) << CaseMapDataULE::DOT_SHIFT; 440 if ned.sensitive { 441 sixteen |= CaseMapDataULE::CASE_SENSITIVE_BIT; 442 } 443 // Remaining bytes are left at zero 444 // case_type is Uncased (0) 445 } 446 CaseMapDataKind::Delta(ned, case_type, delta) => { 447 // First shift (which keeps the signedness), then cast to the 448 // right type 449 sixteen |= (delta << CaseMapDataULE::DELTA_SHIFT) as u16; 450 sixteen |= (ned.dot_type as u16) << CaseMapDataULE::DOT_SHIFT; 451 if ned.sensitive { 452 sixteen |= CaseMapDataULE::CASE_SENSITIVE_BIT; 453 } 454 sixteen |= case_type as u16; 455 } 456 } 457 CaseMapDataULE(sixteen.to_unaligned()) 458 } 459 } 460 461 #[cfg(test)] 462 mod tests { 463 use super::*; 464 465 #[test] test_roundtrip()466 fn test_roundtrip() { 467 const TESTCASES: &[CaseMapData] = &[ 468 CaseMapData { 469 ignoreable: true, 470 kind: CaseMapDataKind::Exception(Some(CaseType::Title), 923), 471 }, 472 CaseMapData { 473 ignoreable: false, 474 kind: CaseMapDataKind::Exception(None, 923), 475 }, 476 CaseMapData { 477 ignoreable: true, 478 kind: CaseMapDataKind::Delta( 479 NonExceptionData { 480 sensitive: true, 481 dot_type: DotType::SoftDotted, 482 }, 483 CaseType::Upper, 484 50, 485 ), 486 }, 487 CaseMapData { 488 ignoreable: false, 489 kind: CaseMapDataKind::Delta( 490 NonExceptionData { 491 sensitive: true, 492 dot_type: DotType::SoftDotted, 493 }, 494 CaseType::Upper, 495 -50, 496 ), 497 }, 498 CaseMapData { 499 ignoreable: false, 500 kind: CaseMapDataKind::Uncased(NonExceptionData { 501 sensitive: false, 502 dot_type: DotType::SoftDotted, 503 }), 504 }, 505 ]; 506 507 for case in TESTCASES { 508 let ule = case.to_unaligned(); 509 let roundtrip = CaseMapData::from_unaligned(ule); 510 assert_eq!(*case, roundtrip); 511 let integer = ule.0.as_unsigned_int(); 512 let roundtrip2 = CaseMapData::try_from_icu_integer(integer).unwrap(); 513 assert_eq!(*case, roundtrip2); 514 } 515 } 516 #[test] test_integer_roundtrip()517 fn test_integer_roundtrip() { 518 // Buggy roundtrip cases go here 519 fn test_single_integer(int: u16) { 520 let cmd = CaseMapData::try_from_icu_integer(int).unwrap(); 521 assert_eq!(int, cmd.to_unaligned().0.as_unsigned_int()) 522 } 523 524 test_single_integer(84); 525 test_single_integer(2503); 526 } 527 } 528