• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4 
5 //! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component.
6 //!
7 //! <div class="stab unstable">
8 //! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
9 //! including in SemVer minor releases. While the serde representation of data structs is guaranteed
10 //! to be stable, their Rust representation might not be. Use with caution.
11 //! </div>
12 //!
13 //! Read more about data providers: [`icu_provider`]
14 
15 // Provider structs must be stable
16 #![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)]
17 
18 use icu_provider::prelude::*;
19 
20 use crate::provider::data::CaseMapData;
21 use crate::provider::exceptions::CaseMapExceptions;
22 use icu_collections::codepointtrie::CodePointTrie;
23 #[cfg(feature = "datagen")]
24 use icu_collections::codepointtrie::CodePointTrieHeader;
25 
26 pub mod data;
27 pub mod exception_helpers;
28 pub mod exceptions;
29 #[cfg(feature = "datagen")]
30 mod exceptions_builder;
31 mod unfold;
32 
/// Provider type for the data compiled ("baked") into this crate.
///
/// The type only exists when the `compiled_data` feature is enabled.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. In particular, the `DataProvider` implementations are only
/// guaranteed to match with this version's `*_unstable` providers. Use with caution.
/// </div>
#[cfg(feature = "compiled_data")]
#[derive(Debug)]
pub struct Baked;
43 
// Applies the macros exported by `icu_casemap_data` to `Baked`. Everything is
// wrapped in an anonymous constant, so the glob import and the helper `icu`
// module stay local to this block instead of leaking into the module namespace.
#[cfg(feature = "compiled_data")]
#[allow(unused_imports)]
const _: () = {
    use icu_casemap_data::*;
    // NOTE(review): presumably the generated macros refer to `icu::casemap` and
    // `icu::collections`, which is why these aliases exist — confirm against the
    // generated data crate.
    pub mod icu {
        pub use icu_collections as collections;
        pub use crate as casemap;
    }
    make_provider!(Baked);
    impl_case_map_v1!(Baked);
    impl_case_map_unfold_v1!(Baked);
};
56 
57 icu_provider::data_marker!(
58     /// Marker for casemapping data.
59     CaseMapV1,
60     "case/map/v1",
61     CaseMap<'static>,
62     is_singleton = true
63 );
64 
65 icu_provider::data_marker!(
66     /// Reverse case mapping data.
67     CaseMapUnfoldV1,
68     "case/map/unfold/v1",
69     CaseMapUnfold<'static>,
70     is_singleton = true
71 );
72 
/// The latest minimum set of data markers this component requires.
///
/// Only available with the `datagen` feature.
#[cfg(feature = "datagen")]
pub const MARKERS: &[DataMarkerInfo] = &[CaseMapUnfoldV1::INFO, CaseMapV1::INFO];
76 
77 pub use self::unfold::CaseMapUnfold;
78 
79 /// This type contains all of the casemapping data
80 ///
81 /// The methods in the provider module are primarily about accessing its data,
82 /// however the full algorithms are also implemented as methods on this type in
83 /// the `internals` module of this crate.
84 ///
85 /// <div class="stab unstable">
86 /// �� This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
87 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed
88 /// to be stable, their Rust representation might not be. Use with caution.
89 /// </div>
90 #[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
91 #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
92 #[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider))]
93 #[yoke(prove_covariance_manually)]
94 /// CaseMapper provides low-level access to the data necessary to
95 /// convert characters and strings to upper, lower, or title case.
96 pub struct CaseMap<'data> {
97     /// Case mapping data
98     pub trie: CodePointTrie<'data, CaseMapData>,
99     /// Exceptions to the case mapping data
100     pub exceptions: CaseMapExceptions<'data>,
101 }
102 
103 icu_provider::data_struct!(
104     CaseMap<'_>,
105     #[cfg(feature = "datagen")]
106 );
107 
/// Deserializes a [`CaseMap`], borrowing both fields from the input where the
/// format allows it. The result is checked with `validate` in builds that have
/// debug assertions enabled.
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for CaseMap<'de> {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        // Field-for-field mirror of `CaseMap`; it exists only so that the
        // deserialization logic itself can be derived.
        #[derive(serde::Deserialize)]
        pub struct Raw<'data> {
            #[serde(borrow)]
            pub trie: CodePointTrie<'data, CaseMapData>,
            #[serde(borrow)]
            pub exceptions: CaseMapExceptions<'data>,
        }

        let raw = Raw::deserialize(deserializer)?;
        let case_map = CaseMap {
            trie: raw.trie,
            exceptions: raw.exceptions,
        };
        // `debug_assert!` is compiled out without debug assertions, so inconsistent
        // data is only caught (by a panic) in debug builds.
        debug_assert!(case_map.validate().is_ok());
        Ok(case_map)
    }
}
125 
126 impl CaseMap<'_> {
127     /// Creates a new CaseMap using data exported by the
128     // `icuexportdata` tool in ICU4C. Validates that the data is
129     // consistent.
130     #[cfg(feature = "datagen")]
try_from_icu( trie_header: CodePointTrieHeader, trie_index: &[u16], trie_data: &[u16], exceptions: &[u16], ) -> Result<Self, DataError>131     pub fn try_from_icu(
132         trie_header: CodePointTrieHeader,
133         trie_index: &[u16],
134         trie_data: &[u16],
135         exceptions: &[u16],
136     ) -> Result<Self, DataError> {
137         use self::exceptions_builder::CaseMapExceptionsBuilder;
138         use zerovec::ZeroVec;
139         let exceptions_builder = CaseMapExceptionsBuilder::new(exceptions);
140         let (exceptions, idx_map) = exceptions_builder.build()?;
141 
142         let trie_index = ZeroVec::alloc_from_slice(trie_index);
143 
144         #[allow(clippy::unwrap_used)] // datagen only
145         let trie_data = trie_data
146             .iter()
147             .map(|&i| {
148                 CaseMapData::try_from_icu_integer(i)
149                     .unwrap()
150                     .with_updated_exception(&idx_map)
151             })
152             .collect::<ZeroVec<_>>();
153 
154         let trie = CodePointTrie::try_new(trie_header, trie_index, trie_data)
155             .map_err(|_| DataError::custom("Casemapping data does not form valid trie"))?;
156 
157         let result = Self { trie, exceptions };
158         result.validate().map_err(DataError::custom)?;
159         Ok(result)
160     }
161 
162     /// Given an existing CaseMapper, validates that the data is
163     /// consistent. A CaseMapper created by the ICU transformer has
164     /// already been validated. Calling this function is only
165     /// necessary if you are concerned about data corruption after
166     /// deserializing.
167     #[cfg(any(feature = "serde", feature = "datagen"))]
168     #[allow(unused)] // is only used in debug mode for serde
validate(&self) -> Result<(), &'static str>169     pub(crate) fn validate(&self) -> Result<(), &'static str> {
170         // First, validate that exception data is well-formed.
171         let valid_exception_indices = self.exceptions.validate()?;
172 
173         let validate_delta = |c: char, delta: i32| -> Result<(), &'static str> {
174             let new_c =
175                 u32::try_from(c as i32 + delta).map_err(|_| "Delta larger than character")?;
176             char::from_u32(new_c).ok_or("Invalid delta")?;
177             Ok(())
178         };
179 
180         for i in 0..char::MAX as u32 {
181             if let Some(c) = char::from_u32(i) {
182                 let data = self.lookup_data(c);
183                 if data.has_exception() {
184                     let idx = data.exception_index();
185                     let exception = self.exceptions.get(idx);
186                     // Verify that the exception index points to a valid exception header.
187                     if !valid_exception_indices.contains(&idx) {
188                         return Err("Invalid exception index in trie data");
189                     }
190                     exception.validate()?;
191                 } else {
192                     validate_delta(c, data.delta() as i32)?;
193                 }
194             }
195         }
196         Ok(())
197     }
198 
lookup_data(&self, c: char) -> CaseMapData199     pub(crate) fn lookup_data(&self, c: char) -> CaseMapData {
200         self.trie.get32(c as u32)
201     }
202 }
203