1 // This file is part of ICU4X. For terms of use, please see the file 2 // called LICENSE at the top level of the ICU4X source tree 3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 4 5 #[cfg(feature = "alloc")] 6 use crate::code_point_set::CodePointSetData; 7 use crate::props::GeneralCategory; 8 use crate::props::GeneralCategoryGroup; 9 use crate::provider::*; 10 use core::ops::RangeInclusive; 11 use icu_collections::codepointtrie::{CodePointMapRange, CodePointTrie, TrieValue}; 12 use icu_provider::marker::ErasedMarker; 13 use icu_provider::prelude::*; 14 15 /// A wrapper around code point map data. 16 /// 17 /// It is returned by APIs that return Unicode 18 /// property data in a map-like form, ex: enumerated property value data keyed 19 /// by code point. Access its data via the borrowed version, 20 /// [`CodePointMapDataBorrowed`]. 21 #[derive(Debug, Clone)] 22 pub struct CodePointMapData<T: TrieValue> { 23 data: DataPayload<ErasedMarker<PropertyCodePointMap<'static, T>>>, 24 } 25 26 impl<T: TrieValue> CodePointMapData<T> { 27 /// Creates a new [`CodePointMapData`] for a [`EnumeratedProperty`]. 28 /// 29 /// See the documentation on [`EnumeratedProperty`] implementations for details. 30 /// 31 /// ✨ *Enabled with the `compiled_data` Cargo feature.* 32 /// 33 /// [ Help choosing a constructor](icu_provider::constructors) 34 #[cfg(feature = "compiled_data")] 35 #[allow(clippy::new_ret_no_self)] new() -> CodePointMapDataBorrowed<'static, T> where T: EnumeratedProperty,36 pub const fn new() -> CodePointMapDataBorrowed<'static, T> 37 where 38 T: EnumeratedProperty, 39 { 40 CodePointMapDataBorrowed::new() 41 } 42 43 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)] try_new_unstable( provider: &(impl DataProvider<T::DataMarker> + ?Sized), ) -> Result<Self, DataError> where T: EnumeratedProperty,44 pub fn try_new_unstable( 45 provider: &(impl DataProvider<T::DataMarker> + ?Sized), 46 ) -> Result<Self, DataError> 47 where 48 T: EnumeratedProperty, 49 { 50 Ok(Self { 51 data: provider.load(Default::default())?.payload.cast(), 52 }) 53 } 54 55 /// Construct a borrowed version of this type that can be queried. 56 /// 57 /// This avoids a potential small underlying cost per API call (like `get()`) by consolidating it 58 /// up front. 59 /// 60 /// This owned version if returned by functions that use a runtime data provider. 61 #[inline] as_borrowed(&self) -> CodePointMapDataBorrowed<'_, T>62 pub fn as_borrowed(&self) -> CodePointMapDataBorrowed<'_, T> { 63 CodePointMapDataBorrowed { 64 map: self.data.get(), 65 } 66 } 67 68 /// Convert this map to a map around another type 69 /// 70 /// Typically useful for type-erasing maps into maps around integers. 71 /// 72 /// # Panics 73 /// Will panic if T and P are different sizes 74 /// 75 /// # Example 76 /// 77 /// ``` 78 /// use icu::properties::CodePointMapData; 79 /// use icu::properties::props::GeneralCategory; 80 /// 81 /// let data = CodePointMapData::<GeneralCategory>::new().static_to_owned(); 82 /// 83 /// let gc = data.try_into_converted::<u8>().unwrap(); 84 /// let gc = gc.as_borrowed(); 85 /// 86 /// assert_eq!(gc.get('木'), GeneralCategory::OtherLetter as u8); // U+6728 87 /// assert_eq!(gc.get(''), GeneralCategory::OtherSymbol as u8); // U+1F383 JACK-O-LANTERN 88 /// ``` 89 #[cfg(feature = "alloc")] try_into_converted<P>(self) -> Result<CodePointMapData<P>, zerovec::ule::UleError> where P: TrieValue,90 pub fn try_into_converted<P>(self) -> Result<CodePointMapData<P>, zerovec::ule::UleError> 91 where 92 P: TrieValue, 93 { 94 self.data 95 .try_map_project(|data, _| data.try_into_converted()) 96 .map(CodePointMapData::from_data::<ErasedMarker<PropertyCodePointMap<'static, P>>>) 97 } 98 99 /// Construct a new one from loaded data 100 /// 101 /// Typically it is preferable to use getters like [`load_general_category()`] instead from_data<M>(data: DataPayload<M>) -> Self where M: DynamicDataMarker<DataStruct = PropertyCodePointMap<'static, T>>,102 pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self 103 where 104 M: DynamicDataMarker<DataStruct = PropertyCodePointMap<'static, T>>, 105 { 106 Self { data: data.cast() } 107 } 108 109 /// Construct a new one an owned [`CodePointTrie`] from_code_point_trie(trie: CodePointTrie<'static, T>) -> Self110 pub fn from_code_point_trie(trie: CodePointTrie<'static, T>) -> Self { 111 let set = PropertyCodePointMap::from_code_point_trie(trie); 112 CodePointMapData::from_data( 113 DataPayload::<ErasedMarker<PropertyCodePointMap<'static, T>>>::from_owned(set), 114 ) 115 } 116 117 /// Convert this type to a [`CodePointTrie`] as a borrowed value. 118 /// 119 /// The data backing this is extensible and supports multiple implementations. 120 /// Currently it is always [`CodePointTrie`]; however in the future more backends may be 121 /// added, and users may select which at data generation time. 122 /// 123 /// This method returns an `Option` in order to return `None` when the backing data provider 124 /// cannot return a [`CodePointTrie`], or cannot do so within the expected constant time 125 /// constraint. as_code_point_trie(&self) -> Option<&CodePointTrie<'_, T>>126 pub fn as_code_point_trie(&self) -> Option<&CodePointTrie<'_, T>> { 127 self.data.get().as_code_point_trie() 128 } 129 130 /// Convert this type to a [`CodePointTrie`], borrowing if possible, 131 /// otherwise allocating a new [`CodePointTrie`]. 132 /// 133 /// The data backing this is extensible and supports multiple implementations. 134 /// Currently it is always [`CodePointTrie`]; however in the future more backends may be 135 /// added, and users may select which at data generation time. 136 /// 137 /// The performance of the conversion to this specific return type will vary 138 /// depending on the data structure that is backing `self`. to_code_point_trie(&self) -> CodePointTrie<'_, T>139 pub fn to_code_point_trie(&self) -> CodePointTrie<'_, T> { 140 self.data.get().to_code_point_trie() 141 } 142 } 143 144 /// A borrowed wrapper around code point set data, returned by 145 /// [`CodePointSetData::as_borrowed()`]. More efficient to query. 146 #[derive(Clone, Copy, Debug)] 147 pub struct CodePointMapDataBorrowed<'a, T: TrieValue> { 148 map: &'a PropertyCodePointMap<'a, T>, 149 } 150 151 impl<'a, T: TrieValue> CodePointMapDataBorrowed<'a, T> { 152 /// Get the value this map has associated with code point `ch` 153 /// 154 /// # Example 155 /// 156 /// ``` 157 /// use icu::properties::CodePointMapData; 158 /// use icu::properties::props::GeneralCategory; 159 /// 160 /// let gc = CodePointMapData::<GeneralCategory>::new(); 161 /// 162 /// assert_eq!(gc.get('木'), GeneralCategory::OtherLetter); // U+6728 163 /// assert_eq!(gc.get(''), GeneralCategory::OtherSymbol); // U+1F383 JACK-O-LANTERN 164 /// ``` get(self, ch: char) -> T165 pub fn get(self, ch: char) -> T { 166 self.map.get32(ch as u32) 167 } 168 169 /// See [`Self::get`]. get32(self, ch: u32) -> T170 pub fn get32(self, ch: u32) -> T { 171 self.map.get32(ch) 172 } 173 174 /// Get a [`CodePointSetData`] for all elements corresponding to a particular value 175 /// 176 /// # Example 177 /// 178 /// ``` 179 /// use icu::properties::props::GeneralCategory; 180 /// use icu::properties::CodePointMapData; 181 /// 182 /// let gc = CodePointMapData::<GeneralCategory>::new(); 183 /// 184 /// let other_letter_set_data = 185 /// gc.get_set_for_value(GeneralCategory::OtherLetter); 186 /// let other_letter_set = other_letter_set_data.as_borrowed(); 187 /// 188 /// assert!(other_letter_set.contains('木')); // U+6728 189 /// assert!(!other_letter_set.contains('')); // U+1F383 JACK-O-LANTERN 190 /// ``` 191 #[cfg(feature = "alloc")] get_set_for_value(self, value: T) -> CodePointSetData192 pub fn get_set_for_value(self, value: T) -> CodePointSetData { 193 let set = self.map.get_set_for_value(value); 194 CodePointSetData::from_code_point_inversion_list(set) 195 } 196 197 /// Yields an [`Iterator`] returning ranges of consecutive code points that 198 /// share the same value in the [`CodePointMapData`]. 199 /// 200 /// # Examples 201 /// 202 /// ``` 203 /// use icu::properties::props::GeneralCategory; 204 /// use icu::properties::CodePointMapData; 205 /// 206 /// let gc = CodePointMapData::<GeneralCategory>::new(); 207 /// let mut ranges = gc.iter_ranges(); 208 /// let next = ranges.next().unwrap(); 209 /// assert_eq!(next.range, 0..=31); 210 /// assert_eq!(next.value, GeneralCategory::Control); 211 /// let next = ranges.next().unwrap(); 212 /// assert_eq!(next.range, 32..=32); 213 /// assert_eq!(next.value, GeneralCategory::SpaceSeparator); 214 /// ``` iter_ranges(self) -> impl Iterator<Item = CodePointMapRange<T>> + 'a215 pub fn iter_ranges(self) -> impl Iterator<Item = CodePointMapRange<T>> + 'a { 216 self.map.iter_ranges() 217 } 218 219 /// Yields an [`Iterator`] returning ranges of consecutive code points that 220 /// share the same value `v` in the [`CodePointMapData`]. 221 /// 222 /// # Examples 223 /// 224 /// 225 /// ``` 226 /// use icu::properties::props::GeneralCategory; 227 /// use icu::properties::CodePointMapData; 228 /// 229 /// let gc = CodePointMapData::<GeneralCategory>::new(); 230 /// let mut ranges = gc.iter_ranges_for_value(GeneralCategory::UppercaseLetter); 231 /// assert_eq!(ranges.next().unwrap(), 'A' as u32..='Z' as u32); 232 /// assert_eq!(ranges.next().unwrap(), 'À' as u32..='Ö' as u32); 233 /// assert_eq!(ranges.next().unwrap(), 'Ø' as u32..='Þ' as u32); 234 /// ``` iter_ranges_for_value(self, val: T) -> impl Iterator<Item = RangeInclusive<u32>> + 'a235 pub fn iter_ranges_for_value(self, val: T) -> impl Iterator<Item = RangeInclusive<u32>> + 'a { 236 self.map 237 .iter_ranges() 238 .filter(move |r| r.value == val) 239 .map(|r| r.range) 240 } 241 242 /// Yields an [`Iterator`] returning ranges of consecutive code points that 243 /// do *not* have the value `v` in the [`CodePointMapData`]. iter_ranges_for_value_complemented( self, val: T, ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a244 pub fn iter_ranges_for_value_complemented( 245 self, 246 val: T, 247 ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a { 248 self.map 249 .iter_ranges_mapped(move |value| value != val) 250 .filter(|v| v.value) 251 .map(|v| v.range) 252 } 253 254 /// Exposed for FFI needs, could be exposed in general in the future but we should 255 /// have a use case first. 256 /// 257 /// FFI needs this since it operates on erased maps and can't use `iter_ranges_for_group()` 258 #[doc(hidden)] // used by FFI code iter_ranges_mapped<U: Eq + 'a>( self, predicate: impl FnMut(T) -> U + Copy + 'a, ) -> impl Iterator<Item = CodePointMapRange<U>> + 'a259 pub fn iter_ranges_mapped<U: Eq + 'a>( 260 self, 261 predicate: impl FnMut(T) -> U + Copy + 'a, 262 ) -> impl Iterator<Item = CodePointMapRange<U>> + 'a { 263 self.map.iter_ranges_mapped(predicate) 264 } 265 } 266 267 impl CodePointMapDataBorrowed<'_, GeneralCategory> { 268 /// TODO 269 #[cfg(feature = "alloc")] get_set_for_value_group(self, value: GeneralCategoryGroup) -> crate::CodePointSetData270 pub fn get_set_for_value_group(self, value: GeneralCategoryGroup) -> crate::CodePointSetData { 271 let matching_gc_ranges = self 272 .iter_ranges() 273 .filter(|cpm_range| (1 << cpm_range.value as u32) & value.0 != 0) 274 .map(|cpm_range| cpm_range.range); 275 CodePointSetData::from_code_point_inversion_list(matching_gc_ranges.collect()) 276 } 277 } 278 279 #[cfg(feature = "compiled_data")] 280 impl<T: EnumeratedProperty> Default for CodePointMapDataBorrowed<'static, T> { default() -> Self281 fn default() -> Self { 282 Self::new() 283 } 284 } 285 286 impl<T: TrieValue> CodePointMapDataBorrowed<'static, T> { 287 /// Creates a new [`CodePointMapDataBorrowed`] for a [`EnumeratedProperty`]. 288 /// 289 /// See the documentation on [`EnumeratedProperty`] implementations for details. 290 /// 291 /// ✨ *Enabled with the `compiled_data` Cargo feature.* 292 /// 293 /// [ Help choosing a constructor](icu_provider::constructors) 294 #[cfg(feature = "compiled_data")] new() -> Self where T: EnumeratedProperty,295 pub const fn new() -> Self 296 where 297 T: EnumeratedProperty, 298 { 299 CodePointMapDataBorrowed { map: T::SINGLETON } 300 } 301 302 /// Cheaply converts a [`CodePointMapDataBorrowed<'static>`] into a [`CodePointMapData`]. 303 /// 304 /// Note: Due to branching and indirection, using [`CodePointMapData`] might inhibit some 305 /// compile-time optimizations that are possible with [`CodePointMapDataBorrowed`]. static_to_owned(self) -> CodePointMapData<T>306 pub const fn static_to_owned(self) -> CodePointMapData<T> { 307 CodePointMapData { 308 data: DataPayload::from_static_ref(self.map), 309 } 310 } 311 } 312 313 impl<'a> CodePointMapDataBorrowed<'a, GeneralCategory> { 314 /// Yields an [`Iterator`] returning ranges of consecutive code points that 315 /// have a `General_Category` value belonging to the specified [`GeneralCategoryGroup`] 316 /// 317 /// # Examples 318 /// 319 /// ``` 320 /// use icu::properties::props::{GeneralCategory, GeneralCategoryGroup}; 321 /// use icu::properties::CodePointMapData; 322 /// 323 /// let gc = CodePointMapData::<GeneralCategory>::new(); 324 /// let mut ranges = gc.iter_ranges_for_group(GeneralCategoryGroup::Letter); 325 /// assert_eq!(ranges.next().unwrap(), 'A' as u32..='Z' as u32); 326 /// assert_eq!(ranges.next().unwrap(), 'a' as u32..='z' as u32); 327 /// assert_eq!(ranges.next().unwrap(), 'ª' as u32..='ª' as u32); 328 /// assert_eq!(ranges.next().unwrap(), 'µ' as u32..='µ' as u32); 329 /// assert_eq!(ranges.next().unwrap(), 'º' as u32..='º' as u32); 330 /// assert_eq!(ranges.next().unwrap(), 'À' as u32..='Ö' as u32); 331 /// assert_eq!(ranges.next().unwrap(), 'Ø' as u32..='ö' as u32); 332 /// ``` iter_ranges_for_group( self, group: GeneralCategoryGroup, ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a333 pub fn iter_ranges_for_group( 334 self, 335 group: GeneralCategoryGroup, 336 ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a { 337 self.map 338 .iter_ranges_mapped(move |value| group.contains(value)) 339 .filter(|v| v.value) 340 .map(|v| v.range) 341 } 342 } 343 344 /// A Unicode character property that assigns a value to each code point. 345 /// 346 /// The descriptions of most properties are taken from [`TR44`], the documentation for the 347 /// Unicode Character Database. 348 /// 349 /// <div class="stab unstable"> 350 /// This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this 351 /// trait, please consider using a type from the implementors listed below. 352 /// </div> 353 /// 354 /// [`TR44`]: https://www.unicode.org/reports/tr44 355 pub trait EnumeratedProperty: crate::private::Sealed + TrieValue { 356 #[doc(hidden)] 357 type DataMarker: DataMarker<DataStruct = PropertyCodePointMap<'static, Self>>; 358 #[doc(hidden)] 359 #[cfg(feature = "compiled_data")] 360 const SINGLETON: &'static PropertyCodePointMap<'static, Self>; 361 /// The name of this property 362 const NAME: &'static [u8]; 363 /// The abbreviated name of this property, if it exists, otherwise the name 364 const SHORT_NAME: &'static [u8]; 365 366 /// Convenience method for `CodePointMapData::new().get(ch)` 367 /// 368 /// ✨ *Enabled with the `compiled_data` Cargo feature.* 369 #[cfg(feature = "compiled_data")] for_char(ch: char) -> Self370 fn for_char(ch: char) -> Self { 371 CodePointMapData::new().get(ch) 372 } 373 } 374