• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4 
5 //! This is the main module pertaining to casemapping exceptions.
6 //!
7 //! A single exception is represented by the [`Exception`] type and its ULE equivalent.
8 //!
9 //! The storage format is complicated (and documented on [`Exception`]), but the data format is
10 //! represented equally by [`DecodedException`], which is more human-readable.
11 use icu_provider::prelude::*;
12 
13 use super::data::MappingKind;
14 use super::exception_helpers::{ExceptionBits, ExceptionSlot, SlotPresence};
15 use crate::set::ClosureSink;
16 use alloc::borrow::Cow;
17 use core::fmt;
18 #[cfg(any(feature = "serde", feature = "datagen"))]
19 use core::ops::Range;
20 use core::ptr;
21 use zerovec::ule::AsULE;
22 use zerovec::VarZeroVec;
23 
24 const SURROGATES_START: u32 = 0xD800;
25 const SURROGATES_LEN: u32 = 0xDFFF - SURROGATES_START + 1;
26 
27 /// This represents case mapping exceptions that can't be represented as a delta applied to
28 /// the original code point. The codepoint
29 /// trie in CaseMapper stores indices into this VarZeroVec.
30 ///
31 /// <div class="stab unstable">
32 /// �� This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
33 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed
34 /// to be stable, their Rust representation might not be. Use with caution.
35 /// </div>
36 #[cfg_attr(feature = "serde", derive(serde::Deserialize))]
37 #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
38 #[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider::exceptions))]
39 #[derive(Debug, Eq, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
40 pub struct CaseMapExceptions<'data> {
41     #[cfg_attr(feature = "serde", serde(borrow))]
42     /// The list of exceptions
43     pub exceptions: VarZeroVec<'data, ExceptionULE>,
44 }
45 
46 impl CaseMapExceptions<'_> {
47     /// Obtain the exception at index `idx`. Will
48     /// return a default value if not present (GIGO behavior),
49     /// as these indices should come from a paired CaseMapData object
50     ///
51     /// Will also panic in debug mode
get(&self, idx: u16) -> &ExceptionULE52     pub fn get(&self, idx: u16) -> &ExceptionULE {
53         let exception = self.exceptions.get(idx.into());
54         debug_assert!(exception.is_some());
55 
56         exception.unwrap_or(ExceptionULE::empty_exception())
57     }
58 
59     #[cfg(any(feature = "serde", feature = "datagen"))]
validate(&self) -> Result<Range<u16>, &'static str>60     pub(crate) fn validate(&self) -> Result<Range<u16>, &'static str> {
61         for exception in self.exceptions.iter() {
62             exception.validate()?;
63         }
64         u16::try_from(self.exceptions.len())
65             .map_err(|_| "Too many exceptions")
66             .map(|l| 0..l)
67     }
68 }
69 /// A type representing the wire format of `Exception`. The data contained is
70 /// equivalently represented by [`DecodedException`].
71 ///
72 /// This type is itself not used that much, most of its relevant methods live
73 /// on [`ExceptionULE`].
74 ///
75 /// The `bits` contain supplementary data, whereas
76 /// `slot_presence` marks te presence of various extra data
77 /// in the `data` field.
78 ///
79 /// The `data` field is not validated to contain all of this data,
80 /// this type will have GIGO behavior when constructed with invalid `data`.
81 ///
82 /// The format of `data` is documented on the field
83 ///
84 /// <div class="stab unstable">
85 /// �� This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
86 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed
87 /// to be stable, their Rust representation might not be. Use with caution.
88 /// </div>
89 #[zerovec::make_varule(ExceptionULE)]
90 #[derive(PartialEq, Eq, Clone, Default, Debug)]
91 #[zerovec::skip_derive(Ord)]
92 #[cfg_attr(
93     feature = "serde",
94     derive(serde::Deserialize),
95     zerovec::derive(Deserialize)
96 )]
97 #[cfg_attr(
98     feature = "datagen",
99     derive(serde::Serialize),
100     zerovec::derive(Serialize)
101 )]
102 pub struct Exception<'a> {
103     /// The various bit based exception data associated with this.
104     ///
105     /// Format: Just a u8 of bitflags, some flags unused. See [`ExceptionBits`] and its ULE type for more.
106     pub bits: ExceptionBits,
107     /// Which slots are present in `data`.
108     ///
109     /// Format: a u8 of bitflags
110     pub slot_presence: SlotPresence,
111     /// Format : `[char slots] [optional closure length] [ closure slot ] [ full mappings data ]`
112     ///
113     /// For each set SlotPresence bit, except for the two stringy slots (Closure/FullMapping),
114     /// this will have one entry in the string, packed together.
115     ///
116     /// Note that the simple_case delta is stored as a u32 normalized to a `char`, where u32s
117     /// which are from or beyond the surrogate range 0xD800-0xDFFF are stored as chars
118     /// starting from 0xE000. The sign is stored in bits.negative_delta.
119     ///
120     /// If both Closure/FullMapping are present, the next char will be the length of the closure slot,
121     /// bisecting the rest of the data.
122     /// If only one is present, the rest of the data represents that slot.
123     ///
124     /// The closure slot simply represents one string. The full-mappings slot represents four strings,
125     /// packed in a way similar to VarZeroVec, in the following format:
126     /// `i1 i2 i3 [ str0 ] [ str1 ] [ str2 ] [ str3 ]`
127     ///
128     /// where `i1 i2 i3` are the indices of the relevant mappings string. The strings are stored in
129     /// the order corresponding to the MappingKind enum.
130     pub data: Cow<'a, str>,
131 }
132 
133 impl ExceptionULE {
134     #[inline]
empty_exception() -> &'static Self135     fn empty_exception() -> &'static Self {
136         static EMPTY_BYTES: &[u8] = &[0, 0];
137         // Safety:
138         // ExceptionULE is a packed DST with `(u8, u8, unsized)` fields. All bit patterns are valid for the two u8s
139         //
140         // An "empty" one can be constructed from a slice of two u8s
141         unsafe {
142             let slice: *const [u8] = ptr::slice_from_raw_parts(EMPTY_BYTES.as_ptr(), 0);
143             &*(slice as *const Self)
144         }
145     }
has_slot(&self, slot: ExceptionSlot) -> bool146     pub(crate) fn has_slot(&self, slot: ExceptionSlot) -> bool {
147         self.slot_presence.has_slot(slot)
148     }
149     /// Obtain a `char` slot, if occupied. If `slot` represents a string slot,
150     /// will return `None`
get_char_slot(&self, slot: ExceptionSlot) -> Option<char>151     pub(crate) fn get_char_slot(&self, slot: ExceptionSlot) -> Option<char> {
152         if slot >= ExceptionSlot::STRING_SLOTS_START {
153             return None;
154         }
155         let bit = 1 << (slot as u8);
156         // check if slot is occupied
157         if self.slot_presence.0 & bit == 0 {
158             return None;
159         }
160 
161         let previous_slot_mask = bit - 1;
162         let previous_slots = self.slot_presence.0 & previous_slot_mask;
163         let slot_num = previous_slots.count_ones() as usize;
164         self.data.chars().nth(slot_num)
165     }
166 
167     /// Get the `simple_case` delta (i.e. the `delta` slot), given the character
168     /// this data belongs to.
169     ///
170     /// Normalizes the delta from char-format to u32 format
171     ///
172     /// Does *not* handle the sign of the delta; see self.bits.negative_delta
get_simple_case_delta(&self) -> Option<u32>173     fn get_simple_case_delta(&self) -> Option<u32> {
174         let delta_ch = self.get_char_slot(ExceptionSlot::Delta)?;
175         let mut delta = u32::from(delta_ch);
176         // We "fill in" the surrogates range by offsetting deltas greater than it
177         if delta >= SURROGATES_START {
178             delta -= SURROGATES_LEN;
179         }
180         Some(delta)
181     }
182 
183     /// Get the `simple_case` value (i.e. the `delta` slot), given the character
184     /// this data belongs to.
185     ///
186     /// The data is stored as a delta so the character must be provided.
187     ///
188     /// The data cannot be stored directly as a character because the trie is more
189     /// compact with adjacent characters sharing deltas.
get_simple_case_slot_for(&self, ch: char) -> Option<char>190     pub(crate) fn get_simple_case_slot_for(&self, ch: char) -> Option<char> {
191         let delta = self.get_simple_case_delta()?;
192         let mut delta = i32::try_from(delta).ok()?;
193         if self.bits.negative_delta() {
194             delta = -delta;
195         }
196 
197         let new_ch = i32::try_from(u32::from(ch)).ok()? + delta;
198 
199         char::try_from(u32::try_from(new_ch).ok()?).ok()
200     }
201 
202     /// Returns *all* the data in the closure/full slots, including length metadata
get_stringy_data(&self) -> Option<&str>203     fn get_stringy_data(&self) -> Option<&str> {
204         const CHAR_MASK: u8 = (1 << ExceptionSlot::STRING_SLOTS_START as u8) - 1;
205         let char_slot_count = (self.slot_presence.0 & CHAR_MASK).count_ones() as usize;
206         let mut chars = self.data.chars();
207         for _ in 0..char_slot_count {
208             let res = chars.next();
209             res?;
210         }
211         Some(chars.as_str())
212     }
213 
214     /// Returns a single stringy slot, either ExceptionSlot::Closure
215     /// or ExceptionSlot::FullMappings.
get_stringy_slot(&self, slot: ExceptionSlot) -> Option<&str>216     fn get_stringy_slot(&self, slot: ExceptionSlot) -> Option<&str> {
217         debug_assert!(slot == ExceptionSlot::Closure || slot == ExceptionSlot::FullMappings);
218         let other_slot = if slot == ExceptionSlot::Closure {
219             ExceptionSlot::FullMappings
220         } else {
221             ExceptionSlot::Closure
222         };
223         if !self.slot_presence.has_slot(slot) {
224             return None;
225         }
226         let stringy_data = self.get_stringy_data()?;
227 
228         if self.slot_presence.has_slot(other_slot) {
229             // both stringy slots are used, we need a length
230             let mut chars = stringy_data.chars();
231             // GIGO: to have two strings there must be a length, if not present return None
232             let length_char = chars.next()?;
233 
234             let length = usize::try_from(u32::from(length_char)).unwrap_or(0);
235             // The length indexes into the string after the first char
236             let remaining_slice = chars.as_str();
237             // GIGO: will return none if there wasn't enough space in this slot
238             if slot == ExceptionSlot::Closure {
239                 remaining_slice.get(0..length)
240             } else {
241                 remaining_slice.get(length..)
242             }
243         } else {
244             // only a single stringy slot, there is no length stored
245             Some(stringy_data)
246         }
247     }
248 
249     /// Get the data behind the `closure` slot
get_closure_slot(&self) -> Option<&str>250     pub(crate) fn get_closure_slot(&self) -> Option<&str> {
251         self.get_stringy_slot(ExceptionSlot::Closure)
252     }
253 
254     /// Get all the slot data for the FullMappings slot
255     ///
256     /// This needs to be further segmented into four based on length metadata
get_fullmappings_slot_data(&self) -> Option<&str>257     fn get_fullmappings_slot_data(&self) -> Option<&str> {
258         self.get_stringy_slot(ExceptionSlot::FullMappings)
259     }
260 
261     /// Get a specific FullMappings slot value
get_fullmappings_slot_for_kind(&self, kind: MappingKind) -> Option<&str>262     pub(crate) fn get_fullmappings_slot_for_kind(&self, kind: MappingKind) -> Option<&str> {
263         let data = self.get_fullmappings_slot_data()?;
264 
265         let mut chars = data.chars();
266         // GIGO: must have three index strings, else return None
267         let i1 = usize::try_from(u32::from(chars.next()?)).ok()?;
268         let i2 = usize::try_from(u32::from(chars.next()?)).ok()?;
269         let i3 = usize::try_from(u32::from(chars.next()?)).ok()?;
270         let remaining_slice = chars.as_str();
271         // GIGO: if the indices are wrong, return None
272         match kind {
273             MappingKind::Lower => remaining_slice.get(..i1),
274             MappingKind::Fold => remaining_slice.get(i1..i2),
275             MappingKind::Upper => remaining_slice.get(i2..i3),
276             MappingKind::Title => remaining_slice.get(i3..),
277         }
278     }
279 
280     // convenience function that lets us use the ? operator
get_all_fullmapping_slots(&self) -> Option<[Cow<'_, str>; 4]>281     fn get_all_fullmapping_slots(&self) -> Option<[Cow<'_, str>; 4]> {
282         Some([
283             self.get_fullmappings_slot_for_kind(MappingKind::Lower)?
284                 .into(),
285             self.get_fullmappings_slot_for_kind(MappingKind::Fold)?
286                 .into(),
287             self.get_fullmappings_slot_for_kind(MappingKind::Upper)?
288                 .into(),
289             self.get_fullmappings_slot_for_kind(MappingKind::Title)?
290                 .into(),
291         ])
292     }
293 
294     // Given a mapping kind, returns the character for that kind, if it exists. Fold falls
295     // back to Lower; Title falls back to Upper.
296     #[inline]
slot_char_for_kind(&self, kind: MappingKind) -> Option<char>297     pub(crate) fn slot_char_for_kind(&self, kind: MappingKind) -> Option<char> {
298         match kind {
299             MappingKind::Lower | MappingKind::Upper => self.get_char_slot(kind.into()),
300             MappingKind::Fold => self
301                 .get_char_slot(ExceptionSlot::Fold)
302                 .or_else(|| self.get_char_slot(ExceptionSlot::Lower)),
303             MappingKind::Title => self
304                 .get_char_slot(ExceptionSlot::Title)
305                 .or_else(|| self.get_char_slot(ExceptionSlot::Upper)),
306         }
307     }
308 
add_full_and_closure_mappings<S: ClosureSink>(&self, set: &mut S)309     pub(crate) fn add_full_and_closure_mappings<S: ClosureSink>(&self, set: &mut S) {
310         if let Some(full) = self.get_fullmappings_slot_for_kind(MappingKind::Fold) {
311             if !full.is_empty() {
312                 set.add_string(full);
313             }
314         };
315         if let Some(closure) = self.get_closure_slot() {
316             for c in closure.chars() {
317                 set.add_char(c);
318             }
319         };
320     }
321 
322     /// Extract all the data out into a structured form
323     ///
324     /// Useful for serialization and debugging
decode(&self) -> DecodedException<'_>325     pub fn decode(&self) -> DecodedException<'_> {
326         // Potential future optimization: This can
327         // directly access each bit one after the other and iterate the string
328         // which avoids recomputing slot offsets over and over again.
329         //
330         // If we're doing so we may wish to retain this older impl so that we can still roundtrip test
331         let bits = self.bits;
332         let lowercase = self.get_char_slot(ExceptionSlot::Lower);
333         let casefold = self.get_char_slot(ExceptionSlot::Fold);
334         let uppercase = self.get_char_slot(ExceptionSlot::Upper);
335         let titlecase = self.get_char_slot(ExceptionSlot::Title);
336         let simple_case_delta = self.get_simple_case_delta();
337         let closure = self.get_closure_slot().map(Into::into);
338         let full = self.get_all_fullmapping_slots();
339 
340         DecodedException {
341             bits: ExceptionBits::from_unaligned(bits),
342             lowercase,
343             casefold,
344             uppercase,
345             titlecase,
346             simple_case_delta,
347             closure,
348             full,
349         }
350     }
351 
352     #[cfg(any(feature = "serde", feature = "datagen"))]
validate(&self) -> Result<(), &'static str>353     pub(crate) fn validate(&self) -> Result<(), &'static str> {
354         // check that ICU4C specific fields are not set
355         // check that there is enough space for all the offsets
356         if self.bits.double_width_slots() {
357             return Err("double-width-slots should not be used in ICU4C");
358         }
359 
360         // just run all of the slot getters at once and then check
361         let decoded = self.decode();
362 
363         for (slot, decoded_slot) in [
364             (ExceptionSlot::Lower, &decoded.lowercase),
365             (ExceptionSlot::Fold, &decoded.casefold),
366             (ExceptionSlot::Upper, &decoded.uppercase),
367             (ExceptionSlot::Title, &decoded.titlecase),
368         ] {
369             if self.has_slot(slot) && decoded_slot.is_none() {
370                 // decoding hit GIGO behavior, oops!
371                 return Err("Slot decoding failed");
372             }
373         }
374         if self.has_slot(ExceptionSlot::Delta) && decoded.simple_case_delta.is_none() {
375             // decoding hit GIGO behavior, oops!
376             return Err("Slot decoding failed");
377         }
378 
379         if self.has_slot(ExceptionSlot::Closure) && decoded.closure.is_none() {
380             return Err("Slot decoding failed");
381         }
382 
383         if self.has_slot(ExceptionSlot::FullMappings) {
384             if decoded.full.is_some() {
385                 let data = self
386                     .get_fullmappings_slot_data()
387                     .ok_or("fullmappings slot doesn't parse")?;
388                 let mut chars = data.chars();
389                 let i1 = u32::from(chars.next().ok_or("fullmappings string too small")?);
390                 let i2 = u32::from(chars.next().ok_or("fullmappings string too small")?);
391                 let i3 = u32::from(chars.next().ok_or("fullmappings string too small")?);
392 
393                 if i2 < i1 || i3 < i2 {
394                     return Err("fullmappings string contains non-sequential indices");
395                 }
396                 let rest = chars.as_str();
397                 let len = u32::try_from(rest.len()).map_err(|_| "len too large for u32")?;
398 
399                 if i1 > len || i2 > len || i3 > len {
400                     return Err("fullmappings string contains out-of-bounds indices");
401                 }
402             } else {
403                 return Err("Slot decoding failed");
404             }
405         }
406 
407         Ok(())
408     }
409 }
410 
411 impl fmt::Debug for ExceptionULE {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result412     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
413         self.decode().fmt(f)
414     }
415 }
416 
417 /// A decoded [`Exception`] type, with all of the data parsed out into
418 /// separate fields.
419 ///
420 /// <div class="stab unstable">
421 /// �� This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
422 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed
423 /// to be stable, their Rust representation might not be. Use with caution.
424 /// </div>
425 #[cfg_attr(feature = "serde", derive(serde::Deserialize))]
426 #[cfg_attr(feature = "datagen", derive(serde::Serialize))]
427 #[derive(Debug, Clone, PartialEq, Eq, Default)]
428 pub struct DecodedException<'a> {
429     /// The various bit-based data associated with this exception
430     pub bits: ExceptionBits,
431     /// Lowercase mapping
432     pub lowercase: Option<char>,
433     /// Case folding
434     pub casefold: Option<char>,
435     /// Uppercase mapping
436     pub uppercase: Option<char>,
437     /// Titlecase mapping
438     pub titlecase: Option<char>,
439     /// The simple casefold delta. Its sign is stored in bits.negative_delta
440     pub simple_case_delta: Option<u32>,
441     /// Closure mappings
442     pub closure: Option<Cow<'a, str>>,
443     /// The four full-mappings strings, indexed by MappingKind u8 value
444     pub full: Option<[Cow<'a, str>; 4]>,
445 }
446 
447 impl DecodedException<'_> {
448     /// Convert to a wire-format encodeable (VarULE-encodeable) [`Exception`]
encode(&self) -> Exception<'static>449     pub fn encode(&self) -> Exception<'static> {
450         let bits = self.bits;
451         let mut slot_presence = SlotPresence(0);
452         let mut data = alloc::string::String::new();
453         if let Some(lowercase) = self.lowercase {
454             slot_presence.add_slot(ExceptionSlot::Lower);
455             data.push(lowercase)
456         }
457         if let Some(casefold) = self.casefold {
458             slot_presence.add_slot(ExceptionSlot::Fold);
459             data.push(casefold)
460         }
461         if let Some(uppercase) = self.uppercase {
462             slot_presence.add_slot(ExceptionSlot::Upper);
463             data.push(uppercase)
464         }
465         if let Some(titlecase) = self.titlecase {
466             slot_presence.add_slot(ExceptionSlot::Title);
467             data.push(titlecase)
468         }
469         if let Some(mut simple_case_delta) = self.simple_case_delta {
470             slot_presence.add_slot(ExceptionSlot::Delta);
471 
472             if simple_case_delta >= SURROGATES_START {
473                 simple_case_delta += SURROGATES_LEN;
474             }
475             let simple_case_delta = char::try_from(simple_case_delta).unwrap_or('\0');
476             data.push(simple_case_delta)
477         }
478 
479         if let Some(ref closure) = self.closure {
480             slot_presence.add_slot(ExceptionSlot::Closure);
481             if self.full.is_some() {
482                 // GIGO: if the closure length is more than 0xD800 this will error. Plenty of space.
483                 debug_assert!(
484                     closure.len() < 0xD800,
485                     "Found overlarge closure value when encoding exception"
486                 );
487                 let len_char = u32::try_from(closure.len())
488                     .ok()
489                     .and_then(|c| char::try_from(c).ok())
490                     .unwrap_or('\0');
491                 data.push(len_char);
492             }
493             data.push_str(closure);
494         }
495         if let Some(ref full) = self.full {
496             slot_presence.add_slot(ExceptionSlot::FullMappings);
497             let mut idx = 0;
498             // iterate all elements except the last, whose length we can calculate from context
499             for mapping in full.iter().take(3) {
500                 idx += mapping.len();
501                 data.push(char::try_from(u32::try_from(idx).unwrap_or(0)).unwrap_or('\0'));
502             }
503             for mapping in full {
504                 data.push_str(mapping);
505             }
506         }
507         Exception {
508             bits,
509             slot_presence,
510             data: data.into(),
511         }
512     }
513 
514     // Potential optimization: Write an `EncodeAsVarULE` that
515     // directly produces an ExceptionULE
516 }
517 
518 #[cfg(test)]
519 mod tests {
520     use super::*;
521 
test_roundtrip_once(exception: DecodedException)522     fn test_roundtrip_once(exception: DecodedException) {
523         let encoded = exception.encode();
524         let encoded = zerovec::ule::encode_varule_to_box(&encoded);
525         let decoded = encoded.decode();
526         assert_eq!(decoded, exception);
527     }
528 
529     #[test]
test_roundtrip()530     fn test_roundtrip() {
531         test_roundtrip_once(DecodedException {
532             lowercase: Some('ø'),
533             ..Default::default()
534         });
535         test_roundtrip_once(DecodedException {
536             titlecase: Some('X'),
537             lowercase: Some('ø'),
538             ..Default::default()
539         });
540         test_roundtrip_once(DecodedException {
541             titlecase: Some('X'),
542             ..Default::default()
543         });
544         test_roundtrip_once(DecodedException {
545             titlecase: Some('X'),
546             simple_case_delta: Some(0xE999),
547             closure: Some("hello world".into()),
548             ..Default::default()
549         });
550         test_roundtrip_once(DecodedException {
551             simple_case_delta: Some(10),
552             closure: Some("hello world".into()),
553             full: Some(["你好世界".into(), "".into(), "hi".into(), "å".into()]),
554             ..Default::default()
555         });
556         test_roundtrip_once(DecodedException {
557             closure: Some("hello world".into()),
558             full: Some(["aa".into(), "ț".into(), "".into(), "å".into()]),
559             ..Default::default()
560         });
561         test_roundtrip_once(DecodedException {
562             full: Some(["你好世界".into(), "".into(), "hi".into(), "å".into()]),
563             ..Default::default()
564         });
565     }
566 }
567