• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4 
5 //! Data for reverse folding
6 
7 #[cfg(feature = "datagen")]
8 use alloc::string::String;
9 use icu_provider::prelude::*;
10 use potential_utf::PotentialUtf8;
11 use zerovec::ZeroMap;
12 
/// Reverse case folding data. Maps from multi-character strings back
/// to code-points that fold to those strings.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider))]
#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
#[yoke(prove_covariance_manually)]
pub struct CaseMapUnfold<'data> {
    #[cfg_attr(feature = "serde", serde(borrow))]
    /// The actual map. Each key is a string; each value is the list of code points
    /// associated with that key, stored back-to-back as one contiguous UTF-8 string.
    pub map: ZeroMap<'data, PotentialUtf8, str>,
}
31 
// Invokes `icu_provider`'s `data_struct!` macro for `CaseMapUnfold<'_>`.
// NOTE(review): the trailing `#[cfg(feature = "datagen")]` argument is presumably
// attached to datagen-only items the macro generates — confirm against the macro docs.
icu_provider::data_struct!(
    CaseMapUnfold<'_>,
    #[cfg(feature = "datagen")]
);
36 
37 impl CaseMapUnfold<'_> {
38     /// Creates a new CaseMapUnfold using data exported by the `icuexportdata` tool in ICU4C.
39     ///
40     /// Unfold data is exported by ICU as an array of 16-bit values, representing a short
41     /// header followed by a two-column key/value table. The header indicates:
42     /// - The number of rows.
43     /// - The number of UTF16 code units per row.
44     /// - The number of UTF16 code units in the first (key) column.
45     ///   (The number of code units in the value column can be derived from the above.)
46     ///
47     /// The key in the first column is the case folding of each of the code points in
48     /// the second column. Keys/values that are shorter than the column width are
49     /// null-terminated. The table is sorted by key. Binary search is used to find the value.
50     ///
51     /// Rust strings are UTF8 by default. To avoid the cost of converting from UTF16 on access,
52     /// we convert the ICU data into a more convenient format during construction.
53     #[cfg(feature = "datagen")]
54     #[allow(clippy::indexing_slicing)] // panics are ok in datagen
try_from_icu(raw: &[u16]) -> Result<Self, DataError>55     pub fn try_from_icu(raw: &[u16]) -> Result<Self, DataError> {
56         const ROWS_INDEX: usize = 0;
57         const ROW_WIDTH_INDEX: usize = 1;
58         const STRING_WIDTH_INDEX: usize = 2;
59 
60         if raw.len() <= STRING_WIDTH_INDEX {
61             return Err(DataError::custom("Unfold: header missing"));
62         }
63 
64         let num_rows = raw[ROWS_INDEX] as usize;
65         let row_width = raw[ROW_WIDTH_INDEX] as usize;
66         let string_width = raw[STRING_WIDTH_INDEX] as usize;
67 
68         if row_width == 0 {
69             return Err(DataError::custom("Unfold: invalid row width"));
70         }
71 
72         // Header takes up one row.
73         let row_data = &raw[row_width..];
74 
75         let mut map = ZeroMap::new();
76 
77         debug_assert!(num_rows == row_data.chunks_exact(row_width).count());
78         for row in row_data.chunks_exact(row_width) {
79             let key = Self::decode_string(&row[..string_width])
80                 .ok_or(DataError::custom("Unfold: unpaired surrogate in key"))?;
81             let val = Self::decode_string(&row[string_width..])
82                 .ok_or(DataError::custom("Unfold: unpaired surrogate in value"))?;
83             if map
84                 .try_append(PotentialUtf8::from_str(&key), val.as_ref())
85                 .is_some()
86             {
87                 return Err(DataError::custom("Unfold: keys not sorted/unique"));
88             }
89         }
90         Ok(Self { map })
91     }
92 
93     // Decode a zero-terminated UTF16 string from a slice of u16.
94     #[cfg(feature = "datagen")]
decode_string(slice: &[u16]) -> Option<String>95     pub(crate) fn decode_string(slice: &[u16]) -> Option<String> {
96         let iter = slice.iter().copied().take_while(|&c| c != 0);
97         char::decode_utf16(iter).collect::<Result<String, _>>().ok()
98     }
99 
100     // Given a string, returns another string representing the set of characters
101     // that case fold to that string.
get(&self, key: &str) -> Option<&str>102     pub(crate) fn get(&self, key: &str) -> Option<&str> {
103         self.map.get(PotentialUtf8::from_str(key))
104     }
105 }
106