1 // This file is part of ICU4X. For terms of use, please see the file 2 // called LICENSE at the top level of the ICU4X source tree 3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 4 5 //! This is the main module pertaining to casemapping exceptions. 6 //! 7 //! A single exception is represented by the [`Exception`] type and its ULE equivalent. 8 //! 9 //! The storage format is complicated (and documented on [`Exception`]), but the data format is 10 //! represented equally by [`DecodedException`], which is more human-readable. 11 use icu_provider::prelude::*; 12 13 use super::data::MappingKind; 14 use super::exception_helpers::{ExceptionBits, ExceptionSlot, SlotPresence}; 15 use crate::set::ClosureSink; 16 use alloc::borrow::Cow; 17 use core::fmt; 18 #[cfg(any(feature = "serde", feature = "datagen"))] 19 use core::ops::Range; 20 use core::ptr; 21 use zerovec::ule::AsULE; 22 use zerovec::VarZeroVec; 23 24 const SURROGATES_START: u32 = 0xD800; 25 const SURROGATES_LEN: u32 = 0xDFFF - SURROGATES_START + 1; 26 27 /// This represents case mapping exceptions that can't be represented as a delta applied to 28 /// the original code point. The codepoint 29 /// trie in CaseMapper stores indices into this VarZeroVec. 30 /// 31 /// <div class="stab unstable"> 32 /// This code is considered unstable; it may change at any time, in breaking or non-breaking ways, 33 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed 34 /// to be stable, their Rust representation might not be. Use with caution. 35 /// </div> 36 #[cfg_attr(feature = "serde", derive(serde::Deserialize))] 37 #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] 38 #[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider::exceptions))] 39 #[derive(Debug, Eq, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] 40 pub struct CaseMapExceptions<'data> { 41 #[cfg_attr(feature = "serde", serde(borrow))] 42 /// The list of exceptions 43 pub exceptions: VarZeroVec<'data, ExceptionULE>, 44 } 45 46 impl CaseMapExceptions<'_> { 47 /// Obtain the exception at index `idx`. Will 48 /// return a default value if not present (GIGO behavior), 49 /// as these indices should come from a paired CaseMapData object 50 /// 51 /// Will also panic in debug mode get(&self, idx: u16) -> &ExceptionULE52 pub fn get(&self, idx: u16) -> &ExceptionULE { 53 let exception = self.exceptions.get(idx.into()); 54 debug_assert!(exception.is_some()); 55 56 exception.unwrap_or(ExceptionULE::empty_exception()) 57 } 58 59 #[cfg(any(feature = "serde", feature = "datagen"))] validate(&self) -> Result<Range<u16>, &'static str>60 pub(crate) fn validate(&self) -> Result<Range<u16>, &'static str> { 61 for exception in self.exceptions.iter() { 62 exception.validate()?; 63 } 64 u16::try_from(self.exceptions.len()) 65 .map_err(|_| "Too many exceptions") 66 .map(|l| 0..l) 67 } 68 } 69 /// A type representing the wire format of `Exception`. The data contained is 70 /// equivalently represented by [`DecodedException`]. 71 /// 72 /// This type is itself not used that much, most of its relevant methods live 73 /// on [`ExceptionULE`]. 74 /// 75 /// The `bits` contain supplementary data, whereas 76 /// `slot_presence` marks te presence of various extra data 77 /// in the `data` field. 78 /// 79 /// The `data` field is not validated to contain all of this data, 80 /// this type will have GIGO behavior when constructed with invalid `data`. 81 /// 82 /// The format of `data` is documented on the field 83 /// 84 /// <div class="stab unstable"> 85 /// This code is considered unstable; it may change at any time, in breaking or non-breaking ways, 86 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed 87 /// to be stable, their Rust representation might not be. Use with caution. 88 /// </div> 89 #[zerovec::make_varule(ExceptionULE)] 90 #[derive(PartialEq, Eq, Clone, Default, Debug)] 91 #[zerovec::skip_derive(Ord)] 92 #[cfg_attr( 93 feature = "serde", 94 derive(serde::Deserialize), 95 zerovec::derive(Deserialize) 96 )] 97 #[cfg_attr( 98 feature = "datagen", 99 derive(serde::Serialize), 100 zerovec::derive(Serialize) 101 )] 102 pub struct Exception<'a> { 103 /// The various bit based exception data associated with this. 104 /// 105 /// Format: Just a u8 of bitflags, some flags unused. See [`ExceptionBits`] and its ULE type for more. 106 pub bits: ExceptionBits, 107 /// Which slots are present in `data`. 108 /// 109 /// Format: a u8 of bitflags 110 pub slot_presence: SlotPresence, 111 /// Format : `[char slots] [optional closure length] [ closure slot ] [ full mappings data ]` 112 /// 113 /// For each set SlotPresence bit, except for the two stringy slots (Closure/FullMapping), 114 /// this will have one entry in the string, packed together. 115 /// 116 /// Note that the simple_case delta is stored as a u32 normalized to a `char`, where u32s 117 /// which are from or beyond the surrogate range 0xD800-0xDFFF are stored as chars 118 /// starting from 0xE000. The sign is stored in bits.negative_delta. 119 /// 120 /// If both Closure/FullMapping are present, the next char will be the length of the closure slot, 121 /// bisecting the rest of the data. 122 /// If only one is present, the rest of the data represents that slot. 123 /// 124 /// The closure slot simply represents one string. The full-mappings slot represents four strings, 125 /// packed in a way similar to VarZeroVec, in the following format: 126 /// `i1 i2 i3 [ str0 ] [ str1 ] [ str2 ] [ str3 ]` 127 /// 128 /// where `i1 i2 i3` are the indices of the relevant mappings string. The strings are stored in 129 /// the order corresponding to the MappingKind enum. 130 pub data: Cow<'a, str>, 131 } 132 133 impl ExceptionULE { 134 #[inline] empty_exception() -> &'static Self135 fn empty_exception() -> &'static Self { 136 static EMPTY_BYTES: &[u8] = &[0, 0]; 137 // Safety: 138 // ExceptionULE is a packed DST with `(u8, u8, unsized)` fields. All bit patterns are valid for the two u8s 139 // 140 // An "empty" one can be constructed from a slice of two u8s 141 unsafe { 142 let slice: *const [u8] = ptr::slice_from_raw_parts(EMPTY_BYTES.as_ptr(), 0); 143 &*(slice as *const Self) 144 } 145 } has_slot(&self, slot: ExceptionSlot) -> bool146 pub(crate) fn has_slot(&self, slot: ExceptionSlot) -> bool { 147 self.slot_presence.has_slot(slot) 148 } 149 /// Obtain a `char` slot, if occupied. If `slot` represents a string slot, 150 /// will return `None` get_char_slot(&self, slot: ExceptionSlot) -> Option<char>151 pub(crate) fn get_char_slot(&self, slot: ExceptionSlot) -> Option<char> { 152 if slot >= ExceptionSlot::STRING_SLOTS_START { 153 return None; 154 } 155 let bit = 1 << (slot as u8); 156 // check if slot is occupied 157 if self.slot_presence.0 & bit == 0 { 158 return None; 159 } 160 161 let previous_slot_mask = bit - 1; 162 let previous_slots = self.slot_presence.0 & previous_slot_mask; 163 let slot_num = previous_slots.count_ones() as usize; 164 self.data.chars().nth(slot_num) 165 } 166 167 /// Get the `simple_case` delta (i.e. the `delta` slot), given the character 168 /// this data belongs to. 169 /// 170 /// Normalizes the delta from char-format to u32 format 171 /// 172 /// Does *not* handle the sign of the delta; see self.bits.negative_delta get_simple_case_delta(&self) -> Option<u32>173 fn get_simple_case_delta(&self) -> Option<u32> { 174 let delta_ch = self.get_char_slot(ExceptionSlot::Delta)?; 175 let mut delta = u32::from(delta_ch); 176 // We "fill in" the surrogates range by offsetting deltas greater than it 177 if delta >= SURROGATES_START { 178 delta -= SURROGATES_LEN; 179 } 180 Some(delta) 181 } 182 183 /// Get the `simple_case` value (i.e. the `delta` slot), given the character 184 /// this data belongs to. 185 /// 186 /// The data is stored as a delta so the character must be provided. 187 /// 188 /// The data cannot be stored directly as a character because the trie is more 189 /// compact with adjacent characters sharing deltas. get_simple_case_slot_for(&self, ch: char) -> Option<char>190 pub(crate) fn get_simple_case_slot_for(&self, ch: char) -> Option<char> { 191 let delta = self.get_simple_case_delta()?; 192 let mut delta = i32::try_from(delta).ok()?; 193 if self.bits.negative_delta() { 194 delta = -delta; 195 } 196 197 let new_ch = i32::try_from(u32::from(ch)).ok()? + delta; 198 199 char::try_from(u32::try_from(new_ch).ok()?).ok() 200 } 201 202 /// Returns *all* the data in the closure/full slots, including length metadata get_stringy_data(&self) -> Option<&str>203 fn get_stringy_data(&self) -> Option<&str> { 204 const CHAR_MASK: u8 = (1 << ExceptionSlot::STRING_SLOTS_START as u8) - 1; 205 let char_slot_count = (self.slot_presence.0 & CHAR_MASK).count_ones() as usize; 206 let mut chars = self.data.chars(); 207 for _ in 0..char_slot_count { 208 let res = chars.next(); 209 res?; 210 } 211 Some(chars.as_str()) 212 } 213 214 /// Returns a single stringy slot, either ExceptionSlot::Closure 215 /// or ExceptionSlot::FullMappings. get_stringy_slot(&self, slot: ExceptionSlot) -> Option<&str>216 fn get_stringy_slot(&self, slot: ExceptionSlot) -> Option<&str> { 217 debug_assert!(slot == ExceptionSlot::Closure || slot == ExceptionSlot::FullMappings); 218 let other_slot = if slot == ExceptionSlot::Closure { 219 ExceptionSlot::FullMappings 220 } else { 221 ExceptionSlot::Closure 222 }; 223 if !self.slot_presence.has_slot(slot) { 224 return None; 225 } 226 let stringy_data = self.get_stringy_data()?; 227 228 if self.slot_presence.has_slot(other_slot) { 229 // both stringy slots are used, we need a length 230 let mut chars = stringy_data.chars(); 231 // GIGO: to have two strings there must be a length, if not present return None 232 let length_char = chars.next()?; 233 234 let length = usize::try_from(u32::from(length_char)).unwrap_or(0); 235 // The length indexes into the string after the first char 236 let remaining_slice = chars.as_str(); 237 // GIGO: will return none if there wasn't enough space in this slot 238 if slot == ExceptionSlot::Closure { 239 remaining_slice.get(0..length) 240 } else { 241 remaining_slice.get(length..) 242 } 243 } else { 244 // only a single stringy slot, there is no length stored 245 Some(stringy_data) 246 } 247 } 248 249 /// Get the data behind the `closure` slot get_closure_slot(&self) -> Option<&str>250 pub(crate) fn get_closure_slot(&self) -> Option<&str> { 251 self.get_stringy_slot(ExceptionSlot::Closure) 252 } 253 254 /// Get all the slot data for the FullMappings slot 255 /// 256 /// This needs to be further segmented into four based on length metadata get_fullmappings_slot_data(&self) -> Option<&str>257 fn get_fullmappings_slot_data(&self) -> Option<&str> { 258 self.get_stringy_slot(ExceptionSlot::FullMappings) 259 } 260 261 /// Get a specific FullMappings slot value get_fullmappings_slot_for_kind(&self, kind: MappingKind) -> Option<&str>262 pub(crate) fn get_fullmappings_slot_for_kind(&self, kind: MappingKind) -> Option<&str> { 263 let data = self.get_fullmappings_slot_data()?; 264 265 let mut chars = data.chars(); 266 // GIGO: must have three index strings, else return None 267 let i1 = usize::try_from(u32::from(chars.next()?)).ok()?; 268 let i2 = usize::try_from(u32::from(chars.next()?)).ok()?; 269 let i3 = usize::try_from(u32::from(chars.next()?)).ok()?; 270 let remaining_slice = chars.as_str(); 271 // GIGO: if the indices are wrong, return None 272 match kind { 273 MappingKind::Lower => remaining_slice.get(..i1), 274 MappingKind::Fold => remaining_slice.get(i1..i2), 275 MappingKind::Upper => remaining_slice.get(i2..i3), 276 MappingKind::Title => remaining_slice.get(i3..), 277 } 278 } 279 280 // convenience function that lets us use the ? operator get_all_fullmapping_slots(&self) -> Option<[Cow<'_, str>; 4]>281 fn get_all_fullmapping_slots(&self) -> Option<[Cow<'_, str>; 4]> { 282 Some([ 283 self.get_fullmappings_slot_for_kind(MappingKind::Lower)? 284 .into(), 285 self.get_fullmappings_slot_for_kind(MappingKind::Fold)? 286 .into(), 287 self.get_fullmappings_slot_for_kind(MappingKind::Upper)? 288 .into(), 289 self.get_fullmappings_slot_for_kind(MappingKind::Title)? 290 .into(), 291 ]) 292 } 293 294 // Given a mapping kind, returns the character for that kind, if it exists. Fold falls 295 // back to Lower; Title falls back to Upper. 296 #[inline] slot_char_for_kind(&self, kind: MappingKind) -> Option<char>297 pub(crate) fn slot_char_for_kind(&self, kind: MappingKind) -> Option<char> { 298 match kind { 299 MappingKind::Lower | MappingKind::Upper => self.get_char_slot(kind.into()), 300 MappingKind::Fold => self 301 .get_char_slot(ExceptionSlot::Fold) 302 .or_else(|| self.get_char_slot(ExceptionSlot::Lower)), 303 MappingKind::Title => self 304 .get_char_slot(ExceptionSlot::Title) 305 .or_else(|| self.get_char_slot(ExceptionSlot::Upper)), 306 } 307 } 308 add_full_and_closure_mappings<S: ClosureSink>(&self, set: &mut S)309 pub(crate) fn add_full_and_closure_mappings<S: ClosureSink>(&self, set: &mut S) { 310 if let Some(full) = self.get_fullmappings_slot_for_kind(MappingKind::Fold) { 311 if !full.is_empty() { 312 set.add_string(full); 313 } 314 }; 315 if let Some(closure) = self.get_closure_slot() { 316 for c in closure.chars() { 317 set.add_char(c); 318 } 319 }; 320 } 321 322 /// Extract all the data out into a structured form 323 /// 324 /// Useful for serialization and debugging decode(&self) -> DecodedException<'_>325 pub fn decode(&self) -> DecodedException<'_> { 326 // Potential future optimization: This can 327 // directly access each bit one after the other and iterate the string 328 // which avoids recomputing slot offsets over and over again. 329 // 330 // If we're doing so we may wish to retain this older impl so that we can still roundtrip test 331 let bits = self.bits; 332 let lowercase = self.get_char_slot(ExceptionSlot::Lower); 333 let casefold = self.get_char_slot(ExceptionSlot::Fold); 334 let uppercase = self.get_char_slot(ExceptionSlot::Upper); 335 let titlecase = self.get_char_slot(ExceptionSlot::Title); 336 let simple_case_delta = self.get_simple_case_delta(); 337 let closure = self.get_closure_slot().map(Into::into); 338 let full = self.get_all_fullmapping_slots(); 339 340 DecodedException { 341 bits: ExceptionBits::from_unaligned(bits), 342 lowercase, 343 casefold, 344 uppercase, 345 titlecase, 346 simple_case_delta, 347 closure, 348 full, 349 } 350 } 351 352 #[cfg(any(feature = "serde", feature = "datagen"))] validate(&self) -> Result<(), &'static str>353 pub(crate) fn validate(&self) -> Result<(), &'static str> { 354 // check that ICU4C specific fields are not set 355 // check that there is enough space for all the offsets 356 if self.bits.double_width_slots() { 357 return Err("double-width-slots should not be used in ICU4C"); 358 } 359 360 // just run all of the slot getters at once and then check 361 let decoded = self.decode(); 362 363 for (slot, decoded_slot) in [ 364 (ExceptionSlot::Lower, &decoded.lowercase), 365 (ExceptionSlot::Fold, &decoded.casefold), 366 (ExceptionSlot::Upper, &decoded.uppercase), 367 (ExceptionSlot::Title, &decoded.titlecase), 368 ] { 369 if self.has_slot(slot) && decoded_slot.is_none() { 370 // decoding hit GIGO behavior, oops! 371 return Err("Slot decoding failed"); 372 } 373 } 374 if self.has_slot(ExceptionSlot::Delta) && decoded.simple_case_delta.is_none() { 375 // decoding hit GIGO behavior, oops! 376 return Err("Slot decoding failed"); 377 } 378 379 if self.has_slot(ExceptionSlot::Closure) && decoded.closure.is_none() { 380 return Err("Slot decoding failed"); 381 } 382 383 if self.has_slot(ExceptionSlot::FullMappings) { 384 if decoded.full.is_some() { 385 let data = self 386 .get_fullmappings_slot_data() 387 .ok_or("fullmappings slot doesn't parse")?; 388 let mut chars = data.chars(); 389 let i1 = u32::from(chars.next().ok_or("fullmappings string too small")?); 390 let i2 = u32::from(chars.next().ok_or("fullmappings string too small")?); 391 let i3 = u32::from(chars.next().ok_or("fullmappings string too small")?); 392 393 if i2 < i1 || i3 < i2 { 394 return Err("fullmappings string contains non-sequential indices"); 395 } 396 let rest = chars.as_str(); 397 let len = u32::try_from(rest.len()).map_err(|_| "len too large for u32")?; 398 399 if i1 > len || i2 > len || i3 > len { 400 return Err("fullmappings string contains out-of-bounds indices"); 401 } 402 } else { 403 return Err("Slot decoding failed"); 404 } 405 } 406 407 Ok(()) 408 } 409 } 410 411 impl fmt::Debug for ExceptionULE { fmt(&self, f: &mut fmt::Formatter) -> fmt::Result412 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 413 self.decode().fmt(f) 414 } 415 } 416 417 /// A decoded [`Exception`] type, with all of the data parsed out into 418 /// separate fields. 419 /// 420 /// <div class="stab unstable"> 421 /// This code is considered unstable; it may change at any time, in breaking or non-breaking ways, 422 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed 423 /// to be stable, their Rust representation might not be. Use with caution. 424 /// </div> 425 #[cfg_attr(feature = "serde", derive(serde::Deserialize))] 426 #[cfg_attr(feature = "datagen", derive(serde::Serialize))] 427 #[derive(Debug, Clone, PartialEq, Eq, Default)] 428 pub struct DecodedException<'a> { 429 /// The various bit-based data associated with this exception 430 pub bits: ExceptionBits, 431 /// Lowercase mapping 432 pub lowercase: Option<char>, 433 /// Case folding 434 pub casefold: Option<char>, 435 /// Uppercase mapping 436 pub uppercase: Option<char>, 437 /// Titlecase mapping 438 pub titlecase: Option<char>, 439 /// The simple casefold delta. Its sign is stored in bits.negative_delta 440 pub simple_case_delta: Option<u32>, 441 /// Closure mappings 442 pub closure: Option<Cow<'a, str>>, 443 /// The four full-mappings strings, indexed by MappingKind u8 value 444 pub full: Option<[Cow<'a, str>; 4]>, 445 } 446 447 impl DecodedException<'_> { 448 /// Convert to a wire-format encodeable (VarULE-encodeable) [`Exception`] encode(&self) -> Exception<'static>449 pub fn encode(&self) -> Exception<'static> { 450 let bits = self.bits; 451 let mut slot_presence = SlotPresence(0); 452 let mut data = alloc::string::String::new(); 453 if let Some(lowercase) = self.lowercase { 454 slot_presence.add_slot(ExceptionSlot::Lower); 455 data.push(lowercase) 456 } 457 if let Some(casefold) = self.casefold { 458 slot_presence.add_slot(ExceptionSlot::Fold); 459 data.push(casefold) 460 } 461 if let Some(uppercase) = self.uppercase { 462 slot_presence.add_slot(ExceptionSlot::Upper); 463 data.push(uppercase) 464 } 465 if let Some(titlecase) = self.titlecase { 466 slot_presence.add_slot(ExceptionSlot::Title); 467 data.push(titlecase) 468 } 469 if let Some(mut simple_case_delta) = self.simple_case_delta { 470 slot_presence.add_slot(ExceptionSlot::Delta); 471 472 if simple_case_delta >= SURROGATES_START { 473 simple_case_delta += SURROGATES_LEN; 474 } 475 let simple_case_delta = char::try_from(simple_case_delta).unwrap_or('\0'); 476 data.push(simple_case_delta) 477 } 478 479 if let Some(ref closure) = self.closure { 480 slot_presence.add_slot(ExceptionSlot::Closure); 481 if self.full.is_some() { 482 // GIGO: if the closure length is more than 0xD800 this will error. Plenty of space. 483 debug_assert!( 484 closure.len() < 0xD800, 485 "Found overlarge closure value when encoding exception" 486 ); 487 let len_char = u32::try_from(closure.len()) 488 .ok() 489 .and_then(|c| char::try_from(c).ok()) 490 .unwrap_or('\0'); 491 data.push(len_char); 492 } 493 data.push_str(closure); 494 } 495 if let Some(ref full) = self.full { 496 slot_presence.add_slot(ExceptionSlot::FullMappings); 497 let mut idx = 0; 498 // iterate all elements except the last, whose length we can calculate from context 499 for mapping in full.iter().take(3) { 500 idx += mapping.len(); 501 data.push(char::try_from(u32::try_from(idx).unwrap_or(0)).unwrap_or('\0')); 502 } 503 for mapping in full { 504 data.push_str(mapping); 505 } 506 } 507 Exception { 508 bits, 509 slot_presence, 510 data: data.into(), 511 } 512 } 513 514 // Potential optimization: Write an `EncodeAsVarULE` that 515 // directly produces an ExceptionULE 516 } 517 518 #[cfg(test)] 519 mod tests { 520 use super::*; 521 test_roundtrip_once(exception: DecodedException)522 fn test_roundtrip_once(exception: DecodedException) { 523 let encoded = exception.encode(); 524 let encoded = zerovec::ule::encode_varule_to_box(&encoded); 525 let decoded = encoded.decode(); 526 assert_eq!(decoded, exception); 527 } 528 529 #[test] test_roundtrip()530 fn test_roundtrip() { 531 test_roundtrip_once(DecodedException { 532 lowercase: Some('ø'), 533 ..Default::default() 534 }); 535 test_roundtrip_once(DecodedException { 536 titlecase: Some('X'), 537 lowercase: Some('ø'), 538 ..Default::default() 539 }); 540 test_roundtrip_once(DecodedException { 541 titlecase: Some('X'), 542 ..Default::default() 543 }); 544 test_roundtrip_once(DecodedException { 545 titlecase: Some('X'), 546 simple_case_delta: Some(0xE999), 547 closure: Some("hello world".into()), 548 ..Default::default() 549 }); 550 test_roundtrip_once(DecodedException { 551 simple_case_delta: Some(10), 552 closure: Some("hello world".into()), 553 full: Some(["你好世界".into(), "".into(), "hi".into(), "å".into()]), 554 ..Default::default() 555 }); 556 test_roundtrip_once(DecodedException { 557 closure: Some("hello world".into()), 558 full: Some(["aa".into(), "ț".into(), "".into(), "å".into()]), 559 ..Default::default() 560 }); 561 test_roundtrip_once(DecodedException { 562 full: Some(["你好世界".into(), "".into(), "hi".into(), "å".into()]), 563 ..Default::default() 564 }); 565 } 566 } 567