// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

//! This module contains various types for the header part of casemapping exception data
//!
//! This is both used in datagen to decode ICU4C's data, and natively in ICU4X's
//! own data model.
//!
//! [`ExceptionBits`] is the bag of bits associated with exceptions, and [`SlotPresence`]
//! marks the presence or absence of various "slots" in a given exception.
//!
//! The `exceptions_builder` module of this crate handles decoding ICU4C data using the exception
//! header, and [`crate::provider::exceptions`] handles ICU4X's own data model.

16 use crate::provider::data::{DotType, MappingKind};
17 use zerovec::ule::{AsULE, ULE};
18 
19 /// A bunch of bits associated with each exception.
20 ///
21 /// <div class="stab unstable">
22 /// �� This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
23 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed
24 /// to be stable, their Rust representation might not be. Use with caution.
25 /// </div>
26 #[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
27 #[cfg_attr(feature = "serde", derive(serde::Deserialize))]
28 #[cfg_attr(feature = "datagen", derive(serde::Serialize))]
29 pub struct ExceptionBits {
30     /// Whether or not the slots are double-width.
31     ///
32     /// Unused in ICU4X
33     pub double_width_slots: bool,
34     /// There is no simple casefolding, even if there is a simple lowercase mapping
35     pub no_simple_case_folding: bool,
36     /// The delta stored in the `Delta` slot is negative
37     pub negative_delta: bool,
38     /// If the character is case sensitive
39     pub is_sensitive: bool,
40     /// The dot type of the character
41     pub dot_type: DotType,
42     /// If the character has conditional special casing
43     pub has_conditional_special: bool,
44     /// If the character has conditional case folding
45     pub has_conditional_fold: bool,
46 }
47 
48 impl ExceptionBits {
49     /// Extract from the upper half of an ICU4C-format u16
from_integer(int: u8) -> Self50     pub(crate) fn from_integer(int: u8) -> Self {
51         let ule = ExceptionBitsULE(int);
52         let double_width_slots = ule.double_width_slots();
53         let no_simple_case_folding = ule.no_simple_case_folding();
54         let negative_delta = ule.negative_delta();
55         let is_sensitive = ule.is_sensitive();
56         let has_conditional_special = ule.has_conditional_special();
57         let has_conditional_fold = ule.has_conditional_fold();
58         let dot_type = ule.dot_type();
59 
60         Self {
61             double_width_slots,
62             no_simple_case_folding,
63             negative_delta,
64             is_sensitive,
65             dot_type,
66             has_conditional_special,
67             has_conditional_fold,
68         }
69     }
70 
71     /// Convert to an ICU4C-format upper half of u16
to_integer(self) -> u872     pub(crate) fn to_integer(self) -> u8 {
73         let mut int = 0;
74         let dot_data = (self.dot_type as u8) << ExceptionBitsULE::DOT_SHIFT;
75         int |= dot_data;
76 
77         if self.double_width_slots {
78             int |= ExceptionBitsULE::DOUBLE_SLOTS_FLAG
79         }
80         if self.no_simple_case_folding {
81             int |= ExceptionBitsULE::NO_SIMPLE_CASE_FOLDING_FLAG
82         }
83         if self.negative_delta {
84             int |= ExceptionBitsULE::NEGATIVE_DELTA_FLAG
85         }
86         if self.is_sensitive {
87             int |= ExceptionBitsULE::SENSITIVE_FLAG
88         }
89         if self.has_conditional_special {
90             int |= ExceptionBitsULE::CONDITIONAL_SPECIAL_FLAG
91         }
92         if self.has_conditional_fold {
93             int |= ExceptionBitsULE::CONDITIONAL_FOLD_FLAG
94         }
95         int
96     }
97 }
98 
99 /// Packed slot presence marker
100 ///
101 /// All bits are valid, though bit 4 is unused and reserved
102 ///
103 /// Bits:
104 ///
105 /// ```text
106 ///               0: Lowercase mapping (code point)
107 ///               1: Case folding (code point)
108 ///               2: Uppercase mapping (code point)
109 ///               3: Titlecase mapping (code point)
110 ///               4: Delta to simple case mapping (code point) (sign stored separately)
111 ///               5: RESERVED
112 ///               6: Closure mappings (string; see below)
113 ///               7: Full mappings (strings; see below)
114 /// ```
115 ///
116 /// <div class="stab unstable">
117 /// �� This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
118 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed
119 /// to be stable, their Rust representation might not be. Use with caution.
120 /// </div>
121 #[derive(Copy, Clone, PartialEq, Eq, ULE, Debug, Default)]
122 #[repr(transparent)]
123 #[cfg_attr(feature = "serde", derive(serde::Deserialize))]
124 #[cfg_attr(feature = "datagen", derive(serde::Serialize))]
125 pub struct SlotPresence(pub u8);
126 
127 impl SlotPresence {
add_slot(&mut self, slot: ExceptionSlot)128     pub(crate) fn add_slot(&mut self, slot: ExceptionSlot) {
129         self.0 |= 1 << slot as u8;
130     }
has_slot(self, slot: ExceptionSlot) -> bool131     pub(crate) fn has_slot(self, slot: ExceptionSlot) -> bool {
132         let bit = 1 << (slot as u8);
133         self.0 & bit != 0
134     }
135 }
136 
137 /// The bitflags on an exception header.
138 ///
139 /// Format from icu4c, documented in casepropsbuilder.cpp, shifted 8 bits since ICU4C has this packed
140 /// alongside a SlotPresence
141 ///
142 /// ```text
143 ///            0  Double-width slots. If set, then each optional slot is stored as two
144 ///               elements of the array (high and low halves of 32-bit values) instead of
145 ///               a single element.
146 ///            1  Has no simple case folding, even if there is a simple lowercase mapping
147 ///           2  The value in the delta slot is negative
148 ///           3  Is case-sensitive (not exposed)
149 ///       4..5  Dot type
150 ///           6  Has conditional special casing
151 ///           7  Has conditional case folding
152 /// ```
153 ///
154 /// All bits are valid, though in ICU4X data bits 0 and 2 are not used
155 ///
156 /// <div class="stab unstable">
157 /// �� This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
158 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed
159 /// to be stable, their Rust representation might not be. Use with caution.
160 /// </div>
161 #[derive(Copy, Clone, PartialEq, Eq, ULE, Debug)]
162 #[repr(transparent)]
163 pub struct ExceptionBitsULE(pub u8);
164 
165 impl ExceptionBitsULE {
166     const DOUBLE_SLOTS_FLAG: u8 = 0x1;
167 
168     const NO_SIMPLE_CASE_FOLDING_FLAG: u8 = 0x2;
169     const NEGATIVE_DELTA_FLAG: u8 = 0x4;
170     const SENSITIVE_FLAG: u8 = 0x8;
171 
172     const DOT_SHIFT: u8 = 4;
173 
174     const CONDITIONAL_SPECIAL_FLAG: u8 = 0x40;
175     const CONDITIONAL_FOLD_FLAG: u8 = 0x80;
176 }
177 
178 impl ExceptionBitsULE {
179     /// Whether or not the slots are double-width.
180     ///
181     /// Unused in ICU4X
double_width_slots(self) -> bool182     pub fn double_width_slots(self) -> bool {
183         self.0 & Self::DOUBLE_SLOTS_FLAG != 0
184     }
185 
186     /// There is no simple casefolding, even if there is a simple lowercase mapping
no_simple_case_folding(self) -> bool187     pub fn no_simple_case_folding(self) -> bool {
188         self.0 & Self::NO_SIMPLE_CASE_FOLDING_FLAG != 0
189     }
190 
191     /// The delta stored in the `Delta` slot is negative
negative_delta(self) -> bool192     pub fn negative_delta(self) -> bool {
193         self.0 & Self::NEGATIVE_DELTA_FLAG != 0
194     }
195 
196     /// If the character is case sensitive
is_sensitive(self) -> bool197     pub fn is_sensitive(self) -> bool {
198         self.0 & Self::SENSITIVE_FLAG != 0
199     }
200 
201     /// If the character has conditional special casing
has_conditional_special(self) -> bool202     pub fn has_conditional_special(self) -> bool {
203         self.0 & Self::CONDITIONAL_SPECIAL_FLAG != 0
204     }
205 
206     /// If the character has conditional case folding
has_conditional_fold(self) -> bool207     pub fn has_conditional_fold(self) -> bool {
208         self.0 & Self::CONDITIONAL_FOLD_FLAG != 0
209     }
210 
211     /// The dot type of the character
dot_type(self) -> DotType212     pub fn dot_type(self) -> DotType {
213         DotType::from_masked_bits((u16::from(self.0 >> Self::DOT_SHIFT)) & DotType::DOT_MASK)
214     }
215 }
216 
217 impl AsULE for ExceptionBits {
218     type ULE = ExceptionBitsULE;
from_unaligned(u: ExceptionBitsULE) -> Self219     fn from_unaligned(u: ExceptionBitsULE) -> Self {
220         ExceptionBits::from_integer(u.0)
221     }
222 
to_unaligned(self) -> ExceptionBitsULE223     fn to_unaligned(self) -> ExceptionBitsULE {
224         ExceptionBitsULE(self.to_integer())
225     }
226 }
227 
228 impl AsULE for SlotPresence {
229     type ULE = SlotPresence;
from_unaligned(u: Self) -> Self230     fn from_unaligned(u: Self) -> Self {
231         u
232     }
233 
to_unaligned(self) -> Self234     fn to_unaligned(self) -> Self {
235         self
236     }
237 }
238 
/// The different slots that may be present in slot-based exception data
///
/// Each discriminant is the index of the slot's bit in [`SlotPresence`].
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Copy, Clone, Debug, PartialOrd, Ord, PartialEq, Eq)]
pub(crate) enum ExceptionSlot {
    /// Lowercase mapping
    Lower = 0,
    /// Case folding
    Fold = 1,
    /// Uppercase mapping
    Upper = 2,
    /// Titlecase mapping
    Title = 3,
    /// The delta to the simple case mapping (the sign is stored separately)
    Delta = 4,
    // Slot 5 is reserved
    /// The closure set
    Closure = 6,
    /// The four full-mappings
    FullMappings = 7,
}

265 impl ExceptionSlot {
266     /// Where the string slots begin
267     pub(crate) const STRING_SLOTS_START: Self = Self::Closure;
268 }
269 
270 impl From<MappingKind> for ExceptionSlot {
from(full: MappingKind) -> Self271     fn from(full: MappingKind) -> Self {
272         match full {
273             MappingKind::Lower => Self::Lower,
274             MappingKind::Fold => Self::Fold,
275             MappingKind::Upper => Self::Upper,
276             MappingKind::Title => Self::Title,
277         }
278     }
279 }
280