• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

//! The primary per-codepoint casefolding data
6 
7 #[cfg(feature = "datagen")]
8 use alloc::collections::BTreeMap;
9 use core::num::TryFromIntError;
10 use icu_collections::codepointtrie::TrieValue;
11 use zerovec::ule::{AsULE, RawBytesULE, UleError, ULE};
12 
/// The case of a Unicode character
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider::data))]
pub enum CaseType {
    /// Lowercase letter
    Lower = 1,
    /// Uppercase letter
    Upper = 2,
    /// Titlecase letter
    Title = 3,
}

impl CaseType {
    /// Mask selecting the two bits that hold the case type.
    pub(crate) const CASE_MASK: u16 = 0x3;

    /// Converts an already-masked two-bit value into a `CaseType`.
    ///
    /// The case type is stored in the code point trie as two bits; callers
    /// mask them first so that `b` is between 0 and 3 (checked with a
    /// `debug_assert!`).
    ///
    /// Returns `None` for 0 (uncased), otherwise the variant whose
    /// discriminant equals `b`.
    #[inline]
    pub(crate) fn from_masked_bits(b: u16) -> Option<Self> {
        debug_assert!(b & Self::CASE_MASK == b);
        match b {
            0 => None,
            1 => Some(Self::Lower),
            2 => Some(Self::Upper),
            // With the two-bit precondition the only remaining value is 3.
            _ => Some(Self::Title),
        }
    }
}
52 
/// The dot type of a Unicode character. This indicates how dotted
/// letters (like `i` and `j`) combine with accents placed above the
/// letter.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Copy, Clone, Debug, Default, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider::data))]
pub enum DotType {
    /// Normal characters with combining class 0
    #[default]
    NoDot = 0,
    /// Soft-dotted characters with combining class 0
    SoftDotted = 1,
    /// "Above" accents with combining class 230
    Above = 2,
    /// Other accent characters
    OtherAccent = 3,
}

impl DotType {
    /// Mask selecting the two bits that hold the dot type (after shifting).
    pub(crate) const DOT_MASK: u16 = 0x3;

    /// Converts an already shifted-and-masked two-bit value into a `DotType`.
    ///
    /// The dot type is stored in either the code point trie or the exception
    /// table as two bits. Callers shift and mask them first so that `b` is
    /// between 0 and 3 (checked with a `debug_assert!`); the result is the
    /// variant whose discriminant equals `b`.
    #[inline]
    pub(crate) fn from_masked_bits(b: u16) -> Self {
        debug_assert!(b & Self::DOT_MASK == b);
        match b {
            0 => Self::NoDot,
            1 => Self::SoftDotted,
            2 => Self::Above,
            // With the two-bit precondition the only remaining value is 3.
            _ => Self::OtherAccent,
        }
    }
}
97 
/// Selects which case mapping a query is about.
///
/// `CaseMapData::is_relevant_to` groups `Lower` with `Fold` and `Upper` with
/// `Title` when deciding whether a code point's data applies.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub(crate) enum MappingKind {
    Lower = 0,
    Fold = 1,
    Upper = 2,
    Title = 3,
}
105 
106 /// Case mapping data associated with a single code point
107 ///
108 /// <div class="stab unstable">
109 /// �� This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
110 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed
111 /// to be stable, their Rust representation might not be. Use with caution.
112 /// </div>
113 #[derive(Copy, Clone, Debug, Eq, PartialEq)]
114 #[cfg_attr(feature = "serde", derive(serde::Deserialize))]
115 #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
116 #[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider::data))]
117 pub struct CaseMapData {
118     /// Whether this is default-ignoreable
119     pub ignoreable: bool,
120     /// The rest of the case mapping data
121     pub kind: CaseMapDataKind,
122 }
123 
124 /// A subset of case mapping data associated with a single code point
125 ///
126 /// <div class="stab unstable">
127 /// �� This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
128 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed
129 /// to be stable, their Rust representation might not be. Use with caution.
130 /// </div>
131 #[cfg_attr(feature = "serde", derive(serde::Deserialize))]
132 #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
133 #[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider::data))]
134 #[derive(Copy, Clone, Debug, Eq, PartialEq)]
135 pub enum CaseMapDataKind {
136     /// This code point is an exception. Provides the case type of its own case
137     /// and the exception index stored in [`CaseMapExceptions`]
138     ///
139     /// [`CaseMapExceptions`]: crate::provider::exceptions::CaseMapExceptions
140     Exception(Option<CaseType>, u16),
141     /// This code point is uncased, and has the following extra data
142     Uncased(NonExceptionData),
143     /// This code point is cased. We store the extra data, its case type, and a *delta*
144     /// that can be used to get its casemapped codepoint.
145     Delta(NonExceptionData, CaseType, i16),
146 }
147 
148 /// Data that is stored in CaseMapData when it is *not* an exception
149 ///
150 /// <div class="stab unstable">
151 /// �� This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
152 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed
153 /// to be stable, their Rust representation might not be. Use with caution.
154 /// </div>
155 #[cfg_attr(feature = "serde", derive(serde::Deserialize))]
156 #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
157 #[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider::data))]
158 #[derive(Copy, Clone, Debug, Eq, PartialEq)]
159 pub struct NonExceptionData {
160     /// Whether or not the type is case-sensitive
161     pub sensitive: bool,
162     /// The "dot type"
163     pub dot_type: DotType,
164 }
165 
166 impl CaseMapData {
167     #[inline]
case_type(self) -> Option<CaseType>168     pub(crate) fn case_type(self) -> Option<CaseType> {
169         match self.kind {
170             CaseMapDataKind::Exception(case_type, ..) => case_type,
171             CaseMapDataKind::Delta(_, case_type, _) => Some(case_type),
172             CaseMapDataKind::Uncased(..) => None,
173         }
174     }
175 
176     #[inline]
is_upper_or_title(self) -> bool177     pub(crate) fn is_upper_or_title(self) -> bool {
178         match self.case_type() {
179             None | Some(CaseType::Lower) => false,
180             Some(CaseType::Upper) | Some(CaseType::Title) => true,
181         }
182     }
183 
184     #[inline]
is_relevant_to(self, kind: MappingKind) -> bool185     pub(crate) fn is_relevant_to(self, kind: MappingKind) -> bool {
186         match kind {
187             MappingKind::Lower | MappingKind::Fold => self.is_upper_or_title(),
188             MappingKind::Upper | MappingKind::Title => self.case_type() == Some(CaseType::Lower),
189         }
190     }
191 
192     #[inline]
is_ignorable(self) -> bool193     pub(crate) fn is_ignorable(self) -> bool {
194         self.ignoreable
195     }
196 
197     #[inline]
has_exception(self) -> bool198     pub(crate) fn has_exception(self) -> bool {
199         matches!(self.kind, CaseMapDataKind::Exception(..))
200     }
201 
202     // Returns true if this code point is case-sensitive.
203     // only in the non-exception case
204     // This is not currently exposed.
205     #[inline]
is_sensitive(self) -> bool206     pub(crate) fn is_sensitive(self) -> bool {
207         match self.kind {
208             CaseMapDataKind::Exception(..) => false,
209             CaseMapDataKind::Delta(ned, ..) => ned.sensitive,
210             CaseMapDataKind::Uncased(ned) => ned.sensitive,
211         }
212     }
213 
214     #[inline]
dot_type(self) -> DotType215     pub(crate) fn dot_type(self) -> DotType {
216         match self.kind {
217             CaseMapDataKind::Exception(..) => DotType::NoDot,
218             CaseMapDataKind::Delta(ned, ..) => ned.dot_type,
219             CaseMapDataKind::Uncased(ned) => ned.dot_type,
220         }
221     }
222 
223     // The delta between this code point and its upper/lowercase equivalent.
224     // This should only be called for codepoints without exception data.
225     //
226     // Returns 0 for uncased types
227     #[inline]
delta(self) -> i16228     pub(crate) fn delta(self) -> i16 {
229         debug_assert!(!self.has_exception());
230         match self.kind {
231             CaseMapDataKind::Exception(..) => 0,
232             CaseMapDataKind::Delta(.., delta) => delta,
233             CaseMapDataKind::Uncased(..) => 0,
234         }
235     }
236 
237     // The index of the exception data for this codepoint in the exception
238     // table. This should only be called for codepoints with exception data.
239     #[inline]
exception_index(self) -> u16240     pub(crate) fn exception_index(self) -> u16 {
241         debug_assert!(self.has_exception());
242         if let CaseMapDataKind::Exception(_, i) = self.kind {
243             i
244         } else {
245             0
246         }
247     }
248 
249     // CaseMapExceptionsBuilder moves the full mapping and closure
250     // strings out of the exception table itself. This means that the
251     // exception index for a code point in ICU4X will be different
252     // from the exception index for the same codepoint in ICU4C. Given
253     // a mapping from old to new, this function updates the exception
254     // index if necessary.
255     #[cfg(feature = "datagen")]
with_updated_exception(self, updates: &BTreeMap<u16, u16>) -> Self256     pub(crate) fn with_updated_exception(self, updates: &BTreeMap<u16, u16>) -> Self {
257         let kind = if let CaseMapDataKind::Exception(ty, index) = self.kind {
258             if let Some(updated_exception) = updates.get(&index) {
259                 CaseMapDataKind::Exception(ty, *updated_exception)
260             } else {
261                 self.kind
262             }
263         } else {
264             self.kind
265         };
266 
267         Self { kind, ..self }
268     }
269 
270     /// Attempt to construct from ICU-format integer
271     #[cfg(any(feature = "datagen", test))]
try_from_icu_integer(int: u16) -> Result<Self, UleError>272     pub(crate) fn try_from_icu_integer(int: u16) -> Result<Self, UleError> {
273         let raw = int.to_unaligned();
274         CaseMapDataULE::validate_bytes(raw.as_bytes())?;
275 
276         let this = Self::from_unaligned(CaseMapDataULE(raw));
277         Ok(this)
278     }
279 }
280 
281 impl TrieValue for CaseMapData {
282     type TryFromU32Error = TryFromIntError;
283 
try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error>284     fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
285         u16::try_from(i).map(|u| AsULE::from_unaligned(CaseMapDataULE(u.to_unaligned())))
286     }
287 
to_u32(self) -> u32288     fn to_u32(self) -> u32 {
289         u32::from(self.to_unaligned().0.as_unsigned_int())
290     }
291 }
292 
293 /// Packed casemappingdata type
294 ///
295 /// Data format, copied from ICU4C casepropsbuilder.cpp:
296 ///
297 /// ```text
298 /// Trie data word:
299 /// Bits
300 /// if(exception) {
301 ///     15..4   unsigned exception index
302 /// } else {
303 ///     if(not uncased) {
304 ///         15..7   signed delta to simple case mapping code point
305 ///                 (add delta to input code point)
306 ///     } else {
307 ///         15..7   reserved, 0
308 ///     }
309 ///      6..5   0 normal character with cc=0
310 ///             1 soft-dotted character
311 ///             2 cc=230
312 ///             3 other cc
313 ///             The runtime code relies on these two bits to be adjacent with this encoding.
314 /// }
315 ///     4   case-sensitive
316 ///     3   exception
317 ///     2   case-ignorable
318 ///  1..0   0 uncased
319 ///         1 lowercase
320 ///         2 uppercase
321 ///         3 titlecase
322 ///         The runtime code relies on the case-ignorable and case type bits 2..0
323 ///         to be the lowest bits with this encoding.
324 /// ```
325 ///
326 /// <div class="stab unstable">
327 /// �� This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
328 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed
329 /// to be stable, their Rust representation might not be. Use with caution.
330 /// </div>
331 #[derive(Copy, Clone, Debug, Eq, PartialEq)]
332 #[repr(transparent)]
333 pub struct CaseMapDataULE(RawBytesULE<2>);
334 
335 impl CaseMapDataULE {
336     // 1..0 case type
337     const CASE_TYPE_BITS: u16 = 0x3;
338     // 2 case-ignorable
339     const CASE_IGNOREABLE_BIT: u16 = 0x4;
340     // 3 exception
341     const EXCEPTION_BIT: u16 = 0x8;
342     // 4 case-sensitive
343     const CASE_SENSITIVE_BIT: u16 = 0x10;
344     // 15..4 unsigned exception index
345     const EXCEPTION_SHIFT: u16 = 4;
346     // 15..7 signed-delta to simple case mapping code point (or reserved)
347     const DELTA_SHIFT: u16 = 7;
348     // 6..5 dot type
349     const DOT_TYPE_BITS: u16 = 0x60;
350     const DOT_SHIFT: u16 = 5;
351 }
352 
353 /// # Safety
354 ///
355 /// Safety checklist for `ULE`:
356 ///
357 /// 1. The type *must not* include any uninitialized or padding bytes: repr(transparent)
358 ///    wrapper around ULE type
359 /// 2. The type must have an alignment of 1 byte: repr(transparent) wrapper around ULE type
360 /// 3. The impl of [`ULE::validate_bytes()`] *must* return an error if the given byte slice
361 ///    would not represent a valid slice of this type: It does
362 /// 4. The impl of [`ULE::validate_bytes()`] *must* return an error if the given byte slice
363 ///    cannot be used in its entirety (if its length is not a multiple of `size_of::<Self>()`):
364 ///    it does, due to the RawBytesULE parse call
365 /// 5. All other methods *must* be left with their default impl, or else implemented according to
366 ///    their respective safety guidelines: They have been
367 /// 6. The equality invariant is satisfied
368 unsafe impl ULE for CaseMapDataULE {
validate_bytes(bytes: &[u8]) -> Result<(), UleError>369     fn validate_bytes(bytes: &[u8]) -> Result<(), UleError> {
370         let sixteens = RawBytesULE::<2>::parse_bytes_to_slice(bytes)?;
371 
372         for sixteen in sixteens {
373             let sixteen = sixteen.as_unsigned_int();
374             // The type has reserved bits in the
375             // uncased + not exception case
376             if sixteen & Self::EXCEPTION_BIT == 0 {
377                 // not an exception
378                 if sixteen & Self::CASE_TYPE_BITS == 0 {
379                     // uncased
380                     if sixteen >> Self::DELTA_SHIFT != 0 {
381                         // We have some used bits in the reserved zone!
382                         return Err(UleError::parse::<Self>());
383                     }
384                 }
385             }
386         }
387         Ok(())
388     }
389 }
390 
391 impl AsULE for CaseMapData {
392     type ULE = CaseMapDataULE;
393 
from_unaligned(ule: Self::ULE) -> Self394     fn from_unaligned(ule: Self::ULE) -> Self {
395         let sixteen = ule.0.as_unsigned_int();
396 
397         let ignoreable = (sixteen & CaseMapDataULE::CASE_IGNOREABLE_BIT) != 0;
398         let exception = (sixteen & CaseMapDataULE::EXCEPTION_BIT) != 0;
399 
400         let case_type = sixteen & CaseMapDataULE::CASE_TYPE_BITS;
401         let case_type = CaseType::from_masked_bits(case_type);
402         let kind = if exception {
403             // No need to mask first since the exception bits start at 15
404             let exception = sixteen >> CaseMapDataULE::EXCEPTION_SHIFT;
405             CaseMapDataKind::Exception(case_type, exception)
406         } else {
407             let dot_type = (sixteen & CaseMapDataULE::DOT_TYPE_BITS) >> CaseMapDataULE::DOT_SHIFT;
408             let dot_type = DotType::from_masked_bits(dot_type);
409             let sensitive = (sixteen & CaseMapDataULE::CASE_SENSITIVE_BIT) != 0;
410             let ned = NonExceptionData {
411                 dot_type,
412                 sensitive,
413             };
414             if let Some(case_type) = case_type {
415                 // no need to mask first since the delta bits start at 15
416                 // We can also cast as i16 first so we do not have to
417                 // sign-extend later
418                 let delta = (sixteen as i16) >> CaseMapDataULE::DELTA_SHIFT;
419                 CaseMapDataKind::Delta(ned, case_type, delta)
420             } else {
421                 CaseMapDataKind::Uncased(ned)
422             }
423         };
424         CaseMapData { ignoreable, kind }
425     }
426 
to_unaligned(self) -> Self::ULE427     fn to_unaligned(self) -> Self::ULE {
428         let mut sixteen = 0;
429         if self.ignoreable {
430             sixteen |= CaseMapDataULE::CASE_IGNOREABLE_BIT;
431         }
432         match self.kind {
433             CaseMapDataKind::Exception(case_type, e) => {
434                 sixteen |= CaseMapDataULE::EXCEPTION_BIT;
435                 sixteen |= e << CaseMapDataULE::EXCEPTION_SHIFT;
436                 sixteen |= case_type.map(|c| c as u16).unwrap_or(0);
437             }
438             CaseMapDataKind::Uncased(ned) => {
439                 sixteen |= (ned.dot_type as u16) << CaseMapDataULE::DOT_SHIFT;
440                 if ned.sensitive {
441                     sixteen |= CaseMapDataULE::CASE_SENSITIVE_BIT;
442                 }
443                 // Remaining bytes are left at zero
444                 // case_type is Uncased (0)
445             }
446             CaseMapDataKind::Delta(ned, case_type, delta) => {
447                 // First shift (which keeps the signedness), then cast to the
448                 // right type
449                 sixteen |= (delta << CaseMapDataULE::DELTA_SHIFT) as u16;
450                 sixteen |= (ned.dot_type as u16) << CaseMapDataULE::DOT_SHIFT;
451                 if ned.sensitive {
452                     sixteen |= CaseMapDataULE::CASE_SENSITIVE_BIT;
453                 }
454                 sixteen |= case_type as u16;
455             }
456         }
457         CaseMapDataULE(sixteen.to_unaligned())
458     }
459 }
460 
#[cfg(test)]
mod tests {
    use super::*;

    // Each structured value must survive struct -> ULE -> struct, and its
    // packed integer must decode back to the same value through the
    // validating `try_from_icu_integer` path.
    #[test]
    fn test_roundtrip() {
        const TESTCASES: &[CaseMapData] = &[
            CaseMapData {
                ignoreable: true,
                kind: CaseMapDataKind::Exception(Some(CaseType::Title), 923),
            },
            CaseMapData {
                ignoreable: false,
                kind: CaseMapDataKind::Exception(None, 923),
            },
            CaseMapData {
                ignoreable: true,
                kind: CaseMapDataKind::Delta(
                    NonExceptionData {
                        sensitive: true,
                        dot_type: DotType::SoftDotted,
                    },
                    CaseType::Upper,
                    50,
                ),
            },
            CaseMapData {
                ignoreable: false,
                kind: CaseMapDataKind::Delta(
                    NonExceptionData {
                        sensitive: true,
                        dot_type: DotType::SoftDotted,
                    },
                    CaseType::Upper,
                    -50,
                ),
            },
            CaseMapData {
                ignoreable: false,
                kind: CaseMapDataKind::Uncased(NonExceptionData {
                    sensitive: false,
                    dot_type: DotType::SoftDotted,
                }),
            },
        ];

        for case in TESTCASES {
            let ule = case.to_unaligned();
            let roundtrip = CaseMapData::from_unaligned(ule);
            assert_eq!(*case, roundtrip);
            let integer = ule.0.as_unsigned_int();
            let roundtrip2 = CaseMapData::try_from_icu_integer(integer).unwrap();
            assert_eq!(*case, roundtrip2);
        }
    }

    // Each packed integer must survive integer -> struct -> integer.
    #[test]
    fn test_integer_roundtrip() {
        // Buggy roundtrip cases go here
        fn test_single_integer(int: u16) {
            let cmd = CaseMapData::try_from_icu_integer(int).unwrap();
            assert_eq!(int, cmd.to_unaligned().0.as_unsigned_int())
        }

        test_single_integer(84);
        test_single_integer(2503);
    }
}
528