// This file is part of ICU4X. For terms of use, please see the file // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). use crate::provider::exception_helpers::{ ExceptionBits, ExceptionBitsULE, ExceptionSlot, SlotPresence, }; use crate::provider::exceptions::{CaseMapExceptions, DecodedException}; use alloc::borrow::Cow; use alloc::collections::BTreeMap; use alloc::string::String; use alloc::vec::Vec; use icu_provider::DataError; use zerovec::ule::{AsULE, ULE}; /// The header for exception types as found in ICU4C data. See [`ExceptionHeaderULE`] /// for the wire format #[derive(Copy, Clone, PartialEq, Eq)] pub struct ExceptionHeader { /// The various slots that are present, masked by ExceptionSlot /// /// We still store this as a bitmask since it's more convenient to access as one pub slot_presence: SlotPresence, pub bits: ExceptionBits, } impl ExceptionHeader { /// Construct from an ICU4C-format u16. pub(crate) fn from_integer(int: u16) -> Self { let slot_presence = SlotPresence(u8::try_from(int & ExceptionHeaderULE::SLOTS_MASK).unwrap_or(0)); let bits = ExceptionBits::from_integer( u8::try_from(int >> ExceptionHeaderULE::BITS_SHIFT).unwrap_or(0), ); Self { slot_presence, bits, } } // Returns true if the given slot exists for this exception pub(crate) fn has_slot(self, slot: ExceptionSlot) -> bool { self.slot_presence.has_slot(slot) } } /// Packed exception header (format from icu4c, documented in casepropsbuilder.cpp) /// /// ```text /// Bits: /// 0..7 Flag bits indicating which optional slots are present (if any): /// 0: Lowercase mapping (code point) /// 1: Case folding (code point) /// 2: Uppercase mapping (code point) /// 3: Titlecase mapping (code point) /// 4: Delta to simple case mapping (code point) (sign stored separately) /// 5: RESERVED /// 6: Closure mappings (string; see below) /// 7: Full mappings (strings; see below) /// 8 Double-width slots. If set, then each optional slot is stored as two /// elements of the array (high and low halves of 32-bit values) instead of /// a single element. /// 9 Has no simple case folding, even if there is a simple lowercase mapping /// 10 The value in the delta slot is negative /// 11 Is case-sensitive (not exposed) /// 12..13 Dot type /// 14 Has conditional special casing /// 15 Has conditional case folding /// ``` /// /// In this struct the RESERVED bit is still allowed to be set, and it will produce a different /// exception header, but it will not have any other effects. #[derive(Copy, Clone, PartialEq, Eq, ULE)] #[repr(C, packed)] pub struct ExceptionHeaderULE { slot_presence: SlotPresence, bits: ExceptionBitsULE, } impl ExceptionHeaderULE { const SLOTS_MASK: u16 = 0xff; const BITS_SHIFT: u16 = 8; } impl AsULE for ExceptionHeader { type ULE = ExceptionHeaderULE; fn from_unaligned(u: ExceptionHeaderULE) -> Self { Self { slot_presence: u.slot_presence, bits: ExceptionBits::from_integer(u.bits.0), } } fn to_unaligned(self) -> ExceptionHeaderULE { ExceptionHeaderULE { slot_presence: self.slot_presence, bits: ExceptionBitsULE(self.bits.to_integer()), } } } // CaseMapExceptionsBuilder consumes the exceptions data produced by // casepropsbuilder.cpp in ICU4C. It generates an instance of CaseMapExceptions. The // primary difference is that the ICU4C representation stores full mapping and closure // strings inline in the data, while CaseMapExceptions uses a side table. As a result, // the starting index of each exception in the resulting CaseMapExceptions may have // changed, so we also produce a map from old indices to new indices that will be used to // update the data stored in the code point trie. pub struct CaseMapExceptionsBuilder<'a> { raw_data: &'a [u16], raw_data_idx: usize, double_slots: bool, } impl<'a> CaseMapExceptionsBuilder<'a> { const MAPPINGS_ALL_LENGTHS_MASK: u32 = 0xffff; const FULL_MAPPINGS_LENGTH_MASK: u32 = 0xf; const FULL_MAPPINGS_LENGTH_SHIFT: u32 = 4; const CLOSURE_MAX_LENGTH: u32 = 0xf; pub fn new(raw_data: &'a [u16]) -> Self { Self { raw_data, raw_data_idx: 0, double_slots: false, } } fn done(&self) -> bool { self.raw_data_idx >= self.raw_data.len() } fn read_raw(&mut self) -> Result { let result = self .raw_data .get(self.raw_data_idx) .ok_or(DataError::custom("Incomplete exception data"))?; self.raw_data_idx += 1; Ok(*result) } fn read_slot(&mut self) -> Result { if self.double_slots { let hi = self.read_raw()? as u32; let lo = self.read_raw()? as u32; Ok((hi << 16) | lo) } else { Ok(self.read_raw()? as u32) } } // After reading a string out of the raw data, advance raw_data_idx. fn skip_string(&mut self, s: &str) { for c in s.chars() { self.raw_data_idx += c.len_utf16(); } } pub(crate) fn build( mut self, ) -> Result<(CaseMapExceptions<'static>, BTreeMap), DataError> { let mut exceptions = Vec::new(); let mut idx_map = BTreeMap::new(); // The format of the raw data from ICU4C is the same as the format described in // exceptions.rs, with the exception of full mapping and closure strings. The // header and non-string slots can be copied over without modification. For string // slots, we read the length information from the ICU4C slot (described below), // read the strings, add the strings to the CaseMapExceptions string table, // and write an updated slot value containing the index of the string in the // table. In the case of full mappings, we store the index of the lowercase // mapping; the remaining mappings are stored at sequential indices. // // Full mappings: If there is at least one full (string) case mapping, then the // lengths of the mappings are encoded as nibbles in the full mappings slot: // Bits: // 0..4 Length of lowercase string // 5..7 Length of case folding string // 8..11 Length of uppercase string // 12..15 Length of titlecase string // Mappings that do not exist have length 0. The strings themselves are stored in // the above order immediately following the last optional slot, encoded as UTF16. // // Case closure: If the case closure for a code point includes code points that // are not included in the simple or full mappings, then bits 0..3 of the closure // mappings slot will contain the number of codepoints in the closure string. // (Other bits are reserved.) The closure string itself is encoded as UTF16 and // stored following the full mappings data (if it exists) or the final optional // slot. while !self.done() { let old_idx = self.raw_data_idx as u16; let mut exception = DecodedException::default(); // Copy header. let header = ExceptionHeader::from_integer(self.read_raw()?); self.double_slots = header.bits.double_width_slots; // Copy unmodified slots. for (slot, output) in [ (ExceptionSlot::Lower, &mut exception.lowercase), (ExceptionSlot::Fold, &mut exception.casefold), (ExceptionSlot::Upper, &mut exception.uppercase), (ExceptionSlot::Title, &mut exception.titlecase), ] { if header.has_slot(slot) { let value = self.read_slot()?; if let Ok(ch) = char::try_from(value) { *output = Some(ch) } else { return Err(DataError::custom( "Found non-char value in casemapping exceptions data", )); } } } if header.has_slot(ExceptionSlot::Delta) { let delta = self.read_slot()?; exception.simple_case_delta = Some(delta) } // Read the closure and full mappings slots, if they exist. let closure_length = if header.has_slot(ExceptionSlot::Closure) { Some((self.read_slot()? & Self::CLOSURE_MAX_LENGTH) as usize) } else { None }; let mappings_lengths = if header.has_slot(ExceptionSlot::FullMappings) { Some(self.read_slot()? & Self::MAPPINGS_ALL_LENGTHS_MASK) } else { None }; // Copy the full mappings strings into the strings table, if they exist. if let Some(mut lengths) = mappings_lengths { let mut arr: [Cow<_>; 4] = Default::default(); for mapping in &mut arr { let len = lengths & Self::FULL_MAPPINGS_LENGTH_MASK; lengths >>= Self::FULL_MAPPINGS_LENGTH_SHIFT; let start = self.raw_data_idx; let end = start + len as usize; let slice = &self .raw_data .get(start..end) .ok_or(DataError::custom("Incomplete string data"))?; let string = char::decode_utf16(slice.iter().copied()) .collect::>() .map_err(|_| DataError::custom("Found non-utf16 exceptions data"))?; self.skip_string(&string); *mapping = string.into() } exception.full = Some(arr) } // Copy the closure string into the strings table, if it exists. if let Some(len) = closure_length { let start = self.raw_data_idx; let slice = &self .raw_data .get(start..) .ok_or(DataError::custom("Incomplete string data"))?; let string = char::decode_utf16(slice.iter().copied()) .take(len) .collect::>() .map_err(|_| DataError::custom("Found non-utf16 exceptions data"))?; self.skip_string(&string); exception.closure = Some(string.into()) } exception.bits = header.bits; // unused bits in ICU4X exception.bits.double_width_slots = false; let new_exception_index = if let Ok(idx) = u16::try_from(exceptions.len()) { idx } else { return Err(DataError::custom("More than u16 exceptions")); }; idx_map.insert(old_idx, new_exception_index); exceptions.push(exception.encode()); } Ok(( CaseMapExceptions { exceptions: (&exceptions).into(), }, idx_map, )) } }