1 // This file is part of ICU4X. For terms of use, please see the file 2 // called LICENSE at the top level of the ICU4X source tree 3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 4 5 //! Data for reverse folding 6 7 #[cfg(feature = "datagen")] 8 use alloc::string::String; 9 use icu_provider::prelude::*; 10 use potential_utf::PotentialUtf8; 11 use zerovec::ZeroMap; 12 13 /// Reverse case folding data. Maps from multi-character strings back 14 /// to code-points that fold to those strings. 15 /// 16 /// <div class="stab unstable"> 17 /// This code is considered unstable; it may change at any time, in breaking or non-breaking ways, 18 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed 19 /// to be stable, their Rust representation might not be. Use with caution. 20 /// </div> 21 #[cfg_attr(feature = "serde", derive(serde::Deserialize))] 22 #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] 23 #[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider))] 24 #[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] 25 #[yoke(prove_covariance_manually)] 26 pub struct CaseMapUnfold<'data> { 27 #[cfg_attr(feature = "serde", serde(borrow))] 28 /// The actual map. Maps from strings to a list of codepoints, stored as a contiguous UTF-8 string 29 pub map: ZeroMap<'data, PotentialUtf8, str>, 30 } 31 32 icu_provider::data_struct!( 33 CaseMapUnfold<'_>, 34 #[cfg(feature = "datagen")] 35 ); 36 37 impl CaseMapUnfold<'_> { 38 /// Creates a new CaseMapUnfold using data exported by the `icuexportdata` tool in ICU4C. 39 /// 40 /// Unfold data is exported by ICU as an array of 16-bit values, representing a short 41 /// header followed by a two-column key/value table. The header indicates: 42 /// - The number of rows. 43 /// - The number of UTF16 code units per row. 44 /// - The number of UTF16 code units in the first (key) column. 45 /// (The number of code units in the value column can be derived from the above.) 46 /// 47 /// The key in the first column is the case folding of each of the code points in 48 /// the second column. Keys/values that are shorter than the column width are 49 /// null-terminated. The table is sorted by key. Binary search is used to find the value. 50 /// 51 /// Rust strings are UTF8 by default. To avoid the cost of converting from UTF16 on access, 52 /// we convert the ICU data into a more convenient format during construction. 53 #[cfg(feature = "datagen")] 54 #[allow(clippy::indexing_slicing)] // panics are ok in datagen try_from_icu(raw: &[u16]) -> Result<Self, DataError>55 pub fn try_from_icu(raw: &[u16]) -> Result<Self, DataError> { 56 const ROWS_INDEX: usize = 0; 57 const ROW_WIDTH_INDEX: usize = 1; 58 const STRING_WIDTH_INDEX: usize = 2; 59 60 if raw.len() <= STRING_WIDTH_INDEX { 61 return Err(DataError::custom("Unfold: header missing")); 62 } 63 64 let num_rows = raw[ROWS_INDEX] as usize; 65 let row_width = raw[ROW_WIDTH_INDEX] as usize; 66 let string_width = raw[STRING_WIDTH_INDEX] as usize; 67 68 if row_width == 0 { 69 return Err(DataError::custom("Unfold: invalid row width")); 70 } 71 72 // Header takes up one row. 73 let row_data = &raw[row_width..]; 74 75 let mut map = ZeroMap::new(); 76 77 debug_assert!(num_rows == row_data.chunks_exact(row_width).count()); 78 for row in row_data.chunks_exact(row_width) { 79 let key = Self::decode_string(&row[..string_width]) 80 .ok_or(DataError::custom("Unfold: unpaired surrogate in key"))?; 81 let val = Self::decode_string(&row[string_width..]) 82 .ok_or(DataError::custom("Unfold: unpaired surrogate in value"))?; 83 if map 84 .try_append(PotentialUtf8::from_str(&key), val.as_ref()) 85 .is_some() 86 { 87 return Err(DataError::custom("Unfold: keys not sorted/unique")); 88 } 89 } 90 Ok(Self { map }) 91 } 92 93 // Decode a zero-terminated UTF16 string from a slice of u16. 94 #[cfg(feature = "datagen")] decode_string(slice: &[u16]) -> Option<String>95 pub(crate) fn decode_string(slice: &[u16]) -> Option<String> { 96 let iter = slice.iter().copied().take_while(|&c| c != 0); 97 char::decode_utf16(iter).collect::<Result<String, _>>().ok() 98 } 99 100 // Given a string, returns another string representing the set of characters 101 // that case fold to that string. get(&self, key: &str) -> Option<&str>102 pub(crate) fn get(&self, key: &str) -> Option<&str> { 103 self.map.get(PotentialUtf8::from_str(key)) 104 } 105 } 106