1 // This file is part of ICU4X. For terms of use, please see the file 2 // called LICENSE at the top level of the ICU4X source tree 3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 4 5 //! \[Unstable\] Data provider struct definitions for this ICU4X component. 6 //! 7 //! <div class="stab unstable"> 8 //! This code is considered unstable; it may change at any time, in breaking or non-breaking ways, 9 //! including in SemVer minor releases. While the serde representation of data structs is guaranteed 10 //! to be stable, their Rust representation might not be. Use with caution. 11 //! </div> 12 //! 13 //! Read more about data providers: [`icu_provider`] 14 15 // Provider structs must be stable 16 #![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)] 17 18 use icu_provider::prelude::*; 19 20 use crate::provider::data::CaseMapData; 21 use crate::provider::exceptions::CaseMapExceptions; 22 use icu_collections::codepointtrie::CodePointTrie; 23 #[cfg(feature = "datagen")] 24 use icu_collections::codepointtrie::CodePointTrieHeader; 25 26 pub mod data; 27 pub mod exception_helpers; 28 pub mod exceptions; 29 #[cfg(feature = "datagen")] 30 mod exceptions_builder; 31 mod unfold; 32 33 #[cfg(feature = "compiled_data")] 34 #[derive(Debug)] 35 /// Baked data 36 /// 37 /// <div class="stab unstable"> 38 /// This code is considered unstable; it may change at any time, in breaking or non-breaking ways, 39 /// including in SemVer minor releases. In particular, the `DataProvider` implementations are only 40 /// guaranteed to match with this version's `*_unstable` providers. Use with caution. 41 /// </div> 42 pub struct Baked; 43 44 #[cfg(feature = "compiled_data")] 45 #[allow(unused_imports)] 46 const _: () = { 47 use icu_casemap_data::*; 48 pub mod icu { 49 pub use crate as casemap; 50 pub use icu_collections as collections; 51 } 52 make_provider!(Baked); 53 impl_case_map_v1!(Baked); 54 impl_case_map_unfold_v1!(Baked); 55 }; 56 57 icu_provider::data_marker!( 58 /// Marker for casemapping data. 59 CaseMapV1, 60 "case/map/v1", 61 CaseMap<'static>, 62 is_singleton = true 63 ); 64 65 icu_provider::data_marker!( 66 /// Reverse case mapping data. 67 CaseMapUnfoldV1, 68 "case/map/unfold/v1", 69 CaseMapUnfold<'static>, 70 is_singleton = true 71 ); 72 73 #[cfg(feature = "datagen")] 74 /// The latest minimum set of markers required by this component. 75 pub const MARKERS: &[DataMarkerInfo] = &[CaseMapUnfoldV1::INFO, CaseMapV1::INFO]; 76 77 pub use self::unfold::CaseMapUnfold; 78 79 /// This type contains all of the casemapping data 80 /// 81 /// The methods in the provider module are primarily about accessing its data, 82 /// however the full algorithms are also implemented as methods on this type in 83 /// the `internals` module of this crate. 84 /// 85 /// <div class="stab unstable"> 86 /// This code is considered unstable; it may change at any time, in breaking or non-breaking ways, 87 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed 88 /// to be stable, their Rust representation might not be. Use with caution. 89 /// </div> 90 #[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] 91 #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] 92 #[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider))] 93 #[yoke(prove_covariance_manually)] 94 /// CaseMapper provides low-level access to the data necessary to 95 /// convert characters and strings to upper, lower, or title case. 96 pub struct CaseMap<'data> { 97 /// Case mapping data 98 pub trie: CodePointTrie<'data, CaseMapData>, 99 /// Exceptions to the case mapping data 100 pub exceptions: CaseMapExceptions<'data>, 101 } 102 103 icu_provider::data_struct!( 104 CaseMap<'_>, 105 #[cfg(feature = "datagen")] 106 ); 107 108 #[cfg(feature = "serde")] 109 impl<'de> serde::Deserialize<'de> for CaseMap<'de> { deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error>110 fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> { 111 #[derive(serde::Deserialize)] 112 pub struct Raw<'data> { 113 #[serde(borrow)] 114 pub trie: CodePointTrie<'data, CaseMapData>, 115 #[serde(borrow)] 116 pub exceptions: CaseMapExceptions<'data>, 117 } 118 119 let Raw { trie, exceptions } = Raw::deserialize(deserializer)?; 120 let result = Self { trie, exceptions }; 121 debug_assert!(result.validate().is_ok()); 122 Ok(result) 123 } 124 } 125 126 impl CaseMap<'_> { 127 /// Creates a new CaseMap using data exported by the 128 // `icuexportdata` tool in ICU4C. Validates that the data is 129 // consistent. 130 #[cfg(feature = "datagen")] try_from_icu( trie_header: CodePointTrieHeader, trie_index: &[u16], trie_data: &[u16], exceptions: &[u16], ) -> Result<Self, DataError>131 pub fn try_from_icu( 132 trie_header: CodePointTrieHeader, 133 trie_index: &[u16], 134 trie_data: &[u16], 135 exceptions: &[u16], 136 ) -> Result<Self, DataError> { 137 use self::exceptions_builder::CaseMapExceptionsBuilder; 138 use zerovec::ZeroVec; 139 let exceptions_builder = CaseMapExceptionsBuilder::new(exceptions); 140 let (exceptions, idx_map) = exceptions_builder.build()?; 141 142 let trie_index = ZeroVec::alloc_from_slice(trie_index); 143 144 #[allow(clippy::unwrap_used)] // datagen only 145 let trie_data = trie_data 146 .iter() 147 .map(|&i| { 148 CaseMapData::try_from_icu_integer(i) 149 .unwrap() 150 .with_updated_exception(&idx_map) 151 }) 152 .collect::<ZeroVec<_>>(); 153 154 let trie = CodePointTrie::try_new(trie_header, trie_index, trie_data) 155 .map_err(|_| DataError::custom("Casemapping data does not form valid trie"))?; 156 157 let result = Self { trie, exceptions }; 158 result.validate().map_err(DataError::custom)?; 159 Ok(result) 160 } 161 162 /// Given an existing CaseMapper, validates that the data is 163 /// consistent. A CaseMapper created by the ICU transformer has 164 /// already been validated. Calling this function is only 165 /// necessary if you are concerned about data corruption after 166 /// deserializing. 167 #[cfg(any(feature = "serde", feature = "datagen"))] 168 #[allow(unused)] // is only used in debug mode for serde validate(&self) -> Result<(), &'static str>169 pub(crate) fn validate(&self) -> Result<(), &'static str> { 170 // First, validate that exception data is well-formed. 171 let valid_exception_indices = self.exceptions.validate()?; 172 173 let validate_delta = |c: char, delta: i32| -> Result<(), &'static str> { 174 let new_c = 175 u32::try_from(c as i32 + delta).map_err(|_| "Delta larger than character")?; 176 char::from_u32(new_c).ok_or("Invalid delta")?; 177 Ok(()) 178 }; 179 180 for i in 0..char::MAX as u32 { 181 if let Some(c) = char::from_u32(i) { 182 let data = self.lookup_data(c); 183 if data.has_exception() { 184 let idx = data.exception_index(); 185 let exception = self.exceptions.get(idx); 186 // Verify that the exception index points to a valid exception header. 187 if !valid_exception_indices.contains(&idx) { 188 return Err("Invalid exception index in trie data"); 189 } 190 exception.validate()?; 191 } else { 192 validate_delta(c, data.delta() as i32)?; 193 } 194 } 195 } 196 Ok(()) 197 } 198 lookup_data(&self, c: char) -> CaseMapData199 pub(crate) fn lookup_data(&self, c: char) -> CaseMapData { 200 self.trie.get32(c as u32) 201 } 202 } 203