1 // This file is part of ICU4X. For terms of use, please see the file 2 // called LICENSE at the top level of the ICU4X source tree 3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 4 5 //! This module contains various types for the header part of casemapping exception data 6 //! 7 //! This is both used in datagen to decode ICU4C's data, and natively in ICU4X's 8 //! own data model. 9 //! 10 //! [`ExceptionBits`] is the bag of bits associated with exceptions, and [`SlotPresence`] 11 //! marks the presence or absence of various "slots" in a given exception. 12 //! 13 //! The `exceptions_builder` module of this crate handles decoding ICU4C data using the exception 14 //! header, and [`crate::provider::exceptions`] handles. 15 16 use crate::provider::data::{DotType, MappingKind}; 17 use zerovec::ule::{AsULE, ULE}; 18 19 /// A bunch of bits associated with each exception. 20 /// 21 /// <div class="stab unstable"> 22 /// This code is considered unstable; it may change at any time, in breaking or non-breaking ways, 23 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed 24 /// to be stable, their Rust representation might not be. Use with caution. 25 /// </div> 26 #[derive(Copy, Clone, PartialEq, Eq, Debug, Default)] 27 #[cfg_attr(feature = "serde", derive(serde::Deserialize))] 28 #[cfg_attr(feature = "datagen", derive(serde::Serialize))] 29 pub struct ExceptionBits { 30 /// Whether or not the slots are double-width. 31 /// 32 /// Unused in ICU4X 33 pub double_width_slots: bool, 34 /// There is no simple casefolding, even if there is a simple lowercase mapping 35 pub no_simple_case_folding: bool, 36 /// The delta stored in the `Delta` slot is negative 37 pub negative_delta: bool, 38 /// If the character is case sensitive 39 pub is_sensitive: bool, 40 /// The dot type of the character 41 pub dot_type: DotType, 42 /// If the character has conditional special casing 43 pub has_conditional_special: bool, 44 /// If the character has conditional case folding 45 pub has_conditional_fold: bool, 46 } 47 48 impl ExceptionBits { 49 /// Extract from the upper half of an ICU4C-format u16 from_integer(int: u8) -> Self50 pub(crate) fn from_integer(int: u8) -> Self { 51 let ule = ExceptionBitsULE(int); 52 let double_width_slots = ule.double_width_slots(); 53 let no_simple_case_folding = ule.no_simple_case_folding(); 54 let negative_delta = ule.negative_delta(); 55 let is_sensitive = ule.is_sensitive(); 56 let has_conditional_special = ule.has_conditional_special(); 57 let has_conditional_fold = ule.has_conditional_fold(); 58 let dot_type = ule.dot_type(); 59 60 Self { 61 double_width_slots, 62 no_simple_case_folding, 63 negative_delta, 64 is_sensitive, 65 dot_type, 66 has_conditional_special, 67 has_conditional_fold, 68 } 69 } 70 71 /// Convert to an ICU4C-format upper half of u16 to_integer(self) -> u872 pub(crate) fn to_integer(self) -> u8 { 73 let mut int = 0; 74 let dot_data = (self.dot_type as u8) << ExceptionBitsULE::DOT_SHIFT; 75 int |= dot_data; 76 77 if self.double_width_slots { 78 int |= ExceptionBitsULE::DOUBLE_SLOTS_FLAG 79 } 80 if self.no_simple_case_folding { 81 int |= ExceptionBitsULE::NO_SIMPLE_CASE_FOLDING_FLAG 82 } 83 if self.negative_delta { 84 int |= ExceptionBitsULE::NEGATIVE_DELTA_FLAG 85 } 86 if self.is_sensitive { 87 int |= ExceptionBitsULE::SENSITIVE_FLAG 88 } 89 if self.has_conditional_special { 90 int |= ExceptionBitsULE::CONDITIONAL_SPECIAL_FLAG 91 } 92 if self.has_conditional_fold { 93 int |= ExceptionBitsULE::CONDITIONAL_FOLD_FLAG 94 } 95 int 96 } 97 } 98 99 /// Packed slot presence marker 100 /// 101 /// All bits are valid, though bit 4 is unused and reserved 102 /// 103 /// Bits: 104 /// 105 /// ```text 106 /// 0: Lowercase mapping (code point) 107 /// 1: Case folding (code point) 108 /// 2: Uppercase mapping (code point) 109 /// 3: Titlecase mapping (code point) 110 /// 4: Delta to simple case mapping (code point) (sign stored separately) 111 /// 5: RESERVED 112 /// 6: Closure mappings (string; see below) 113 /// 7: Full mappings (strings; see below) 114 /// ``` 115 /// 116 /// <div class="stab unstable"> 117 /// This code is considered unstable; it may change at any time, in breaking or non-breaking ways, 118 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed 119 /// to be stable, their Rust representation might not be. Use with caution. 120 /// </div> 121 #[derive(Copy, Clone, PartialEq, Eq, ULE, Debug, Default)] 122 #[repr(transparent)] 123 #[cfg_attr(feature = "serde", derive(serde::Deserialize))] 124 #[cfg_attr(feature = "datagen", derive(serde::Serialize))] 125 pub struct SlotPresence(pub u8); 126 127 impl SlotPresence { add_slot(&mut self, slot: ExceptionSlot)128 pub(crate) fn add_slot(&mut self, slot: ExceptionSlot) { 129 self.0 |= 1 << slot as u8; 130 } has_slot(self, slot: ExceptionSlot) -> bool131 pub(crate) fn has_slot(self, slot: ExceptionSlot) -> bool { 132 let bit = 1 << (slot as u8); 133 self.0 & bit != 0 134 } 135 } 136 137 /// The bitflags on an exception header. 138 /// 139 /// Format from icu4c, documented in casepropsbuilder.cpp, shifted 8 bits since ICU4C has this packed 140 /// alongside a SlotPresence 141 /// 142 /// ```text 143 /// 0 Double-width slots. If set, then each optional slot is stored as two 144 /// elements of the array (high and low halves of 32-bit values) instead of 145 /// a single element. 146 /// 1 Has no simple case folding, even if there is a simple lowercase mapping 147 /// 2 The value in the delta slot is negative 148 /// 3 Is case-sensitive (not exposed) 149 /// 4..5 Dot type 150 /// 6 Has conditional special casing 151 /// 7 Has conditional case folding 152 /// ``` 153 /// 154 /// All bits are valid, though in ICU4X data bits 0 and 2 are not used 155 /// 156 /// <div class="stab unstable"> 157 /// This code is considered unstable; it may change at any time, in breaking or non-breaking ways, 158 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed 159 /// to be stable, their Rust representation might not be. Use with caution. 160 /// </div> 161 #[derive(Copy, Clone, PartialEq, Eq, ULE, Debug)] 162 #[repr(transparent)] 163 pub struct ExceptionBitsULE(pub u8); 164 165 impl ExceptionBitsULE { 166 const DOUBLE_SLOTS_FLAG: u8 = 0x1; 167 168 const NO_SIMPLE_CASE_FOLDING_FLAG: u8 = 0x2; 169 const NEGATIVE_DELTA_FLAG: u8 = 0x4; 170 const SENSITIVE_FLAG: u8 = 0x8; 171 172 const DOT_SHIFT: u8 = 4; 173 174 const CONDITIONAL_SPECIAL_FLAG: u8 = 0x40; 175 const CONDITIONAL_FOLD_FLAG: u8 = 0x80; 176 } 177 178 impl ExceptionBitsULE { 179 /// Whether or not the slots are double-width. 180 /// 181 /// Unused in ICU4X double_width_slots(self) -> bool182 pub fn double_width_slots(self) -> bool { 183 self.0 & Self::DOUBLE_SLOTS_FLAG != 0 184 } 185 186 /// There is no simple casefolding, even if there is a simple lowercase mapping no_simple_case_folding(self) -> bool187 pub fn no_simple_case_folding(self) -> bool { 188 self.0 & Self::NO_SIMPLE_CASE_FOLDING_FLAG != 0 189 } 190 191 /// The delta stored in the `Delta` slot is negative negative_delta(self) -> bool192 pub fn negative_delta(self) -> bool { 193 self.0 & Self::NEGATIVE_DELTA_FLAG != 0 194 } 195 196 /// If the character is case sensitive is_sensitive(self) -> bool197 pub fn is_sensitive(self) -> bool { 198 self.0 & Self::SENSITIVE_FLAG != 0 199 } 200 201 /// If the character has conditional special casing has_conditional_special(self) -> bool202 pub fn has_conditional_special(self) -> bool { 203 self.0 & Self::CONDITIONAL_SPECIAL_FLAG != 0 204 } 205 206 /// If the character has conditional case folding has_conditional_fold(self) -> bool207 pub fn has_conditional_fold(self) -> bool { 208 self.0 & Self::CONDITIONAL_FOLD_FLAG != 0 209 } 210 211 /// The dot type of the character dot_type(self) -> DotType212 pub fn dot_type(self) -> DotType { 213 DotType::from_masked_bits((u16::from(self.0 >> Self::DOT_SHIFT)) & DotType::DOT_MASK) 214 } 215 } 216 217 impl AsULE for ExceptionBits { 218 type ULE = ExceptionBitsULE; from_unaligned(u: ExceptionBitsULE) -> Self219 fn from_unaligned(u: ExceptionBitsULE) -> Self { 220 ExceptionBits::from_integer(u.0) 221 } 222 to_unaligned(self) -> ExceptionBitsULE223 fn to_unaligned(self) -> ExceptionBitsULE { 224 ExceptionBitsULE(self.to_integer()) 225 } 226 } 227 228 impl AsULE for SlotPresence { 229 type ULE = SlotPresence; from_unaligned(u: Self) -> Self230 fn from_unaligned(u: Self) -> Self { 231 u 232 } 233 to_unaligned(self) -> Self234 fn to_unaligned(self) -> Self { 235 self 236 } 237 } 238 239 /// The different slots that may be present in slot-based exception data 240 /// 241 /// <div class="stab unstable"> 242 /// This code is considered unstable; it may change at any time, in breaking or non-breaking ways, 243 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed 244 /// to be stable, their Rust representation might not be. Use with caution. 245 /// </div> 246 #[derive(Copy, Clone, Debug, PartialOrd, Ord, PartialEq, Eq)] 247 pub(crate) enum ExceptionSlot { 248 /// Lowercase mapping 249 Lower = 0, 250 /// Case folding 251 Fold = 1, 252 /// Uppercase mapping 253 Upper = 2, 254 /// Titlecase mapping 255 Title = 3, 256 /// The delta to the simple case folding 257 Delta = 4, 258 // Slot 5 is reserved 259 /// The closure set 260 Closure = 6, 261 /// The four full-mappings 262 FullMappings = 7, 263 } 264 265 impl ExceptionSlot { 266 /// Where the string slots begin 267 pub(crate) const STRING_SLOTS_START: Self = Self::Closure; 268 } 269 270 impl From<MappingKind> for ExceptionSlot { from(full: MappingKind) -> Self271 fn from(full: MappingKind) -> Self { 272 match full { 273 MappingKind::Lower => Self::Lower, 274 MappingKind::Fold => Self::Fold, 275 MappingKind::Upper => Self::Upper, 276 MappingKind::Title => Self::Title, 277 } 278 } 279 } 280