// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

#[cfg(feature = "alloc")]
use crate::code_point_set::CodePointSetData;
use crate::props::GeneralCategory;
use crate::props::GeneralCategoryGroup;
use crate::provider::*;
use core::ops::RangeInclusive;
use icu_collections::codepointtrie::{CodePointMapRange, CodePointTrie, TrieValue};
use icu_provider::marker::ErasedMarker;
use icu_provider::prelude::*;

/// A wrapper around code point map data.
///
/// It is returned by APIs that return Unicode
/// property data in a map-like form, ex: enumerated property value data keyed
/// by code point. Access its data via the borrowed version,
/// [`CodePointMapDataBorrowed`].
#[derive(Debug, Clone)]
pub struct CodePointMapData<T: TrieValue> {
    data: DataPayload<ErasedMarker<PropertyCodePointMap<'static, T>>>,
}

impl<T: TrieValue> CodePointMapData<T> {
    /// Creates a new [`CodePointMapData`] for a [`EnumeratedProperty`].
    ///
    /// See the documentation on [`EnumeratedProperty`] implementations for details.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    // This constructor deliberately returns the borrowed type rather than `Self`.
    #[allow(clippy::new_ret_no_self)]
    pub const fn new() -> CodePointMapDataBorrowed<'static, T>
    where
        T: EnumeratedProperty,
    {
        CodePointMapDataBorrowed::new()
    }

    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
    pub fn try_new_unstable(
        provider: &(impl DataProvider<T::DataMarker> + ?Sized),
    ) -> Result<Self, DataError>
    where
        T: EnumeratedProperty,
    {
        Ok(Self {
            data: provider.load(Default::default())?.payload.cast(),
        })
    }

    /// Construct a borrowed version of this type that can be queried.
    ///
    /// This avoids a potential small underlying cost per API call (like `get()`) by consolidating it
    /// up front.
    ///
    /// This owned version is returned by functions that use a runtime data provider.
    #[inline]
    pub fn as_borrowed(&self) -> CodePointMapDataBorrowed<'_, T> {
        CodePointMapDataBorrowed {
            map: self.data.get(),
        }
    }

    /// Convert this map to a map around another type
    ///
    /// Typically useful for type-erasing maps into maps around integers.
    ///
    /// # Panics
    /// Will panic if T and P are different sizes
    ///
    /// # Example
    ///
    /// ```
    /// use icu::properties::CodePointMapData;
    /// use icu::properties::props::GeneralCategory;
    ///
    /// let data = CodePointMapData::<GeneralCategory>::new().static_to_owned();
    ///
    /// let gc = data.try_into_converted::<u8>().unwrap();
    /// let gc = gc.as_borrowed();
    ///
    /// assert_eq!(gc.get('木'), GeneralCategory::OtherLetter as u8); // U+6728
    /// assert_eq!(gc.get('🎃'), GeneralCategory::OtherSymbol as u8); // U+1F383 JACK-O-LANTERN
    /// ```
    #[cfg(feature = "alloc")]
    pub fn try_into_converted<P>(self) -> Result<CodePointMapData<P>, zerovec::ule::UleError>
    where
        P: TrieValue,
    {
        self.data
            .try_map_project(|data, _| data.try_into_converted())
            .map(CodePointMapData::from_data::<ErasedMarker<PropertyCodePointMap<'static, P>>>)
    }

    /// Construct a new one from loaded data
    ///
    /// Typically it is preferable to use getters like [`load_general_category()`] instead
    pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self
    where
        M: DynamicDataMarker<DataStruct = PropertyCodePointMap<'static, T>>,
    {
        Self { data: data.cast() }
    }

    /// Construct a new one from an owned [`CodePointTrie`]
    pub fn from_code_point_trie(trie: CodePointTrie<'static, T>) -> Self {
        let set = PropertyCodePointMap::from_code_point_trie(trie);
        CodePointMapData::from_data(
            DataPayload::<ErasedMarker<PropertyCodePointMap<'static, T>>>::from_owned(set),
        )
    }

    /// Convert this type to a [`CodePointTrie`] as a borrowed value.
    ///
    /// The data backing this is extensible and supports multiple implementations.
    /// Currently it is always [`CodePointTrie`]; however in the future more backends may be
    /// added, and users may select which at data generation time.
    ///
    /// This method returns an `Option` in order to return `None` when the backing data provider
    /// cannot return a [`CodePointTrie`], or cannot do so within the expected constant time
    /// constraint.
    pub fn as_code_point_trie(&self) -> Option<&CodePointTrie<'_, T>> {
        self.data.get().as_code_point_trie()
    }

    /// Convert this type to a [`CodePointTrie`], borrowing if possible,
    /// otherwise allocating a new [`CodePointTrie`].
    ///
    /// The data backing this is extensible and supports multiple implementations.
    /// Currently it is always [`CodePointTrie`]; however in the future more backends may be
    /// added, and users may select which at data generation time.
    ///
    /// The performance of the conversion to this specific return type will vary
    /// depending on the data structure that is backing `self`.
    pub fn to_code_point_trie(&self) -> CodePointTrie<'_, T> {
        self.data.get().to_code_point_trie()
    }
}

/// A borrowed wrapper around code point map data, returned by
/// [`CodePointMapData::as_borrowed()`]. More efficient to query.
#[derive(Clone, Copy, Debug)] pub struct CodePointMapDataBorrowed<'a, T: TrieValue> { map: &'a PropertyCodePointMap<'a, T>, } impl<'a, T: TrieValue> CodePointMapDataBorrowed<'a, T> { /// Get the value this map has associated with code point `ch` /// /// # Example /// /// ``` /// use icu::properties::CodePointMapData; /// use icu::properties::props::GeneralCategory; /// /// let gc = CodePointMapData::::new(); /// /// assert_eq!(gc.get('木'), GeneralCategory::OtherLetter); // U+6728 /// assert_eq!(gc.get('πŸŽƒ'), GeneralCategory::OtherSymbol); // U+1F383 JACK-O-LANTERN /// ``` pub fn get(self, ch: char) -> T { self.map.get32(ch as u32) } /// See [`Self::get`]. pub fn get32(self, ch: u32) -> T { self.map.get32(ch) } /// Get a [`CodePointSetData`] for all elements corresponding to a particular value /// /// # Example /// /// ``` /// use icu::properties::props::GeneralCategory; /// use icu::properties::CodePointMapData; /// /// let gc = CodePointMapData::::new(); /// /// let other_letter_set_data = /// gc.get_set_for_value(GeneralCategory::OtherLetter); /// let other_letter_set = other_letter_set_data.as_borrowed(); /// /// assert!(other_letter_set.contains('木')); // U+6728 /// assert!(!other_letter_set.contains('πŸŽƒ')); // U+1F383 JACK-O-LANTERN /// ``` #[cfg(feature = "alloc")] pub fn get_set_for_value(self, value: T) -> CodePointSetData { let set = self.map.get_set_for_value(value); CodePointSetData::from_code_point_inversion_list(set) } /// Yields an [`Iterator`] returning ranges of consecutive code points that /// share the same value in the [`CodePointMapData`]. 
/// /// # Examples /// /// ``` /// use icu::properties::props::GeneralCategory; /// use icu::properties::CodePointMapData; /// /// let gc = CodePointMapData::::new(); /// let mut ranges = gc.iter_ranges(); /// let next = ranges.next().unwrap(); /// assert_eq!(next.range, 0..=31); /// assert_eq!(next.value, GeneralCategory::Control); /// let next = ranges.next().unwrap(); /// assert_eq!(next.range, 32..=32); /// assert_eq!(next.value, GeneralCategory::SpaceSeparator); /// ``` pub fn iter_ranges(self) -> impl Iterator> + 'a { self.map.iter_ranges() } /// Yields an [`Iterator`] returning ranges of consecutive code points that /// share the same value `v` in the [`CodePointMapData`]. /// /// # Examples /// /// /// ``` /// use icu::properties::props::GeneralCategory; /// use icu::properties::CodePointMapData; /// /// let gc = CodePointMapData::::new(); /// let mut ranges = gc.iter_ranges_for_value(GeneralCategory::UppercaseLetter); /// assert_eq!(ranges.next().unwrap(), 'A' as u32..='Z' as u32); /// assert_eq!(ranges.next().unwrap(), 'Γ€' as u32..='Γ–' as u32); /// assert_eq!(ranges.next().unwrap(), 'Ø' as u32..='Þ' as u32); /// ``` pub fn iter_ranges_for_value(self, val: T) -> impl Iterator> + 'a { self.map .iter_ranges() .filter(move |r| r.value == val) .map(|r| r.range) } /// Yields an [`Iterator`] returning ranges of consecutive code points that /// do *not* have the value `v` in the [`CodePointMapData`]. pub fn iter_ranges_for_value_complemented( self, val: T, ) -> impl Iterator> + 'a { self.map .iter_ranges_mapped(move |value| value != val) .filter(|v| v.value) .map(|v| v.range) } /// Exposed for FFI needs, could be exposed in general in the future but we should /// have a use case first. 
/// /// FFI needs this since it operates on erased maps and can't use `iter_ranges_for_group()` #[doc(hidden)] // used by FFI code pub fn iter_ranges_mapped( self, predicate: impl FnMut(T) -> U + Copy + 'a, ) -> impl Iterator> + 'a { self.map.iter_ranges_mapped(predicate) } } impl CodePointMapDataBorrowed<'_, GeneralCategory> { /// TODO #[cfg(feature = "alloc")] pub fn get_set_for_value_group(self, value: GeneralCategoryGroup) -> crate::CodePointSetData { let matching_gc_ranges = self .iter_ranges() .filter(|cpm_range| (1 << cpm_range.value as u32) & value.0 != 0) .map(|cpm_range| cpm_range.range); CodePointSetData::from_code_point_inversion_list(matching_gc_ranges.collect()) } } #[cfg(feature = "compiled_data")] impl Default for CodePointMapDataBorrowed<'static, T> { fn default() -> Self { Self::new() } } impl CodePointMapDataBorrowed<'static, T> { /// Creates a new [`CodePointMapDataBorrowed`] for a [`EnumeratedProperty`]. /// /// See the documentation on [`EnumeratedProperty`] implementations for details. /// /// ✨ *Enabled with the `compiled_data` Cargo feature.* /// /// [πŸ“š Help choosing a constructor](icu_provider::constructors) #[cfg(feature = "compiled_data")] pub const fn new() -> Self where T: EnumeratedProperty, { CodePointMapDataBorrowed { map: T::SINGLETON } } /// Cheaply converts a [`CodePointMapDataBorrowed<'static>`] into a [`CodePointMapData`]. /// /// Note: Due to branching and indirection, using [`CodePointMapData`] might inhibit some /// compile-time optimizations that are possible with [`CodePointMapDataBorrowed`]. 
pub const fn static_to_owned(self) -> CodePointMapData { CodePointMapData { data: DataPayload::from_static_ref(self.map), } } } impl<'a> CodePointMapDataBorrowed<'a, GeneralCategory> { /// Yields an [`Iterator`] returning ranges of consecutive code points that /// have a `General_Category` value belonging to the specified [`GeneralCategoryGroup`] /// /// # Examples /// /// ``` /// use icu::properties::props::{GeneralCategory, GeneralCategoryGroup}; /// use icu::properties::CodePointMapData; /// /// let gc = CodePointMapData::::new(); /// let mut ranges = gc.iter_ranges_for_group(GeneralCategoryGroup::Letter); /// assert_eq!(ranges.next().unwrap(), 'A' as u32..='Z' as u32); /// assert_eq!(ranges.next().unwrap(), 'a' as u32..='z' as u32); /// assert_eq!(ranges.next().unwrap(), 'Βͺ' as u32..='Βͺ' as u32); /// assert_eq!(ranges.next().unwrap(), 'Β΅' as u32..='Β΅' as u32); /// assert_eq!(ranges.next().unwrap(), 'ΒΊ' as u32..='ΒΊ' as u32); /// assert_eq!(ranges.next().unwrap(), 'Γ€' as u32..='Γ–' as u32); /// assert_eq!(ranges.next().unwrap(), 'Ø' as u32..='ΓΆ' as u32); /// ``` pub fn iter_ranges_for_group( self, group: GeneralCategoryGroup, ) -> impl Iterator> + 'a { self.map .iter_ranges_mapped(move |value| group.contains(value)) .filter(|v| v.value) .map(|v| v.range) } } /// A Unicode character property that assigns a value to each code point. /// /// The descriptions of most properties are taken from [`TR44`], the documentation for the /// Unicode Character Database. /// ///

/// 🚫 This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this /// trait, please consider using a type from the implementors listed below. ///
/// /// [`TR44`]: https://www.unicode.org/reports/tr44 pub trait EnumeratedProperty: crate::private::Sealed + TrieValue { #[doc(hidden)] type DataMarker: DataMarker>; #[doc(hidden)] #[cfg(feature = "compiled_data")] const SINGLETON: &'static PropertyCodePointMap<'static, Self>; /// The name of this property const NAME: &'static [u8]; /// The abbreviated name of this property, if it exists, otherwise the name const SHORT_NAME: &'static [u8]; /// Convenience method for `CodePointMapData::new().get(ch)` /// /// ✨ *Enabled with the `compiled_data` Cargo feature.* #[cfg(feature = "compiled_data")] fn for_char(ch: char) -> Self { CodePointMapData::new().get(ch) } }