• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4 
5 #[cfg(feature = "alloc")]
6 use crate::code_point_set::CodePointSetData;
7 use crate::props::GeneralCategory;
8 use crate::props::GeneralCategoryGroup;
9 use crate::provider::*;
10 use core::ops::RangeInclusive;
11 use icu_collections::codepointtrie::{CodePointMapRange, CodePointTrie, TrieValue};
12 use icu_provider::marker::ErasedMarker;
13 use icu_provider::prelude::*;
14 
15 /// A wrapper around code point map data.
16 ///
17 /// It is returned by APIs that return Unicode
18 /// property data in a map-like form, ex: enumerated property value data keyed
19 /// by code point. Access its data via the borrowed version,
20 /// [`CodePointMapDataBorrowed`].
21 #[derive(Debug, Clone)]
22 pub struct CodePointMapData<T: TrieValue> {
23     data: DataPayload<ErasedMarker<PropertyCodePointMap<'static, T>>>,
24 }
25 
26 impl<T: TrieValue> CodePointMapData<T> {
27     /// Creates a new [`CodePointMapData`] for a [`EnumeratedProperty`].
28     ///
29     /// See the documentation on [`EnumeratedProperty`] implementations for details.
30     ///
31     /// ✨ *Enabled with the `compiled_data` Cargo feature.*
32     ///
33     /// [�� Help choosing a constructor](icu_provider::constructors)
34     #[cfg(feature = "compiled_data")]
35     #[allow(clippy::new_ret_no_self)]
new() -> CodePointMapDataBorrowed<'static, T> where T: EnumeratedProperty,36     pub const fn new() -> CodePointMapDataBorrowed<'static, T>
37     where
38         T: EnumeratedProperty,
39     {
40         CodePointMapDataBorrowed::new()
41     }
42 
43     #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
try_new_unstable( provider: &(impl DataProvider<T::DataMarker> + ?Sized), ) -> Result<Self, DataError> where T: EnumeratedProperty,44     pub fn try_new_unstable(
45         provider: &(impl DataProvider<T::DataMarker> + ?Sized),
46     ) -> Result<Self, DataError>
47     where
48         T: EnumeratedProperty,
49     {
50         Ok(Self {
51             data: provider.load(Default::default())?.payload.cast(),
52         })
53     }
54 
55     /// Construct a borrowed version of this type that can be queried.
56     ///
57     /// This avoids a potential small underlying cost per API call (like `get()`) by consolidating it
58     /// up front.
59     ///
60     /// This owned version if returned by functions that use a runtime data provider.
61     #[inline]
as_borrowed(&self) -> CodePointMapDataBorrowed<'_, T>62     pub fn as_borrowed(&self) -> CodePointMapDataBorrowed<'_, T> {
63         CodePointMapDataBorrowed {
64             map: self.data.get(),
65         }
66     }
67 
68     /// Convert this map to a map around another type
69     ///
70     /// Typically useful for type-erasing maps into maps around integers.
71     ///
72     /// # Panics
73     /// Will panic if T and P are different sizes
74     ///
75     /// # Example
76     ///
77     /// ```
78     /// use icu::properties::CodePointMapData;
79     /// use icu::properties::props::GeneralCategory;
80     ///
81     /// let data = CodePointMapData::<GeneralCategory>::new().static_to_owned();
82     ///
83     /// let gc = data.try_into_converted::<u8>().unwrap();
84     /// let gc = gc.as_borrowed();
85     ///
86     /// assert_eq!(gc.get('木'), GeneralCategory::OtherLetter as u8);  // U+6728
87     /// assert_eq!(gc.get('��'), GeneralCategory::OtherSymbol as u8);  // U+1F383 JACK-O-LANTERN
88     /// ```
89     #[cfg(feature = "alloc")]
try_into_converted<P>(self) -> Result<CodePointMapData<P>, zerovec::ule::UleError> where P: TrieValue,90     pub fn try_into_converted<P>(self) -> Result<CodePointMapData<P>, zerovec::ule::UleError>
91     where
92         P: TrieValue,
93     {
94         self.data
95             .try_map_project(|data, _| data.try_into_converted())
96             .map(CodePointMapData::from_data::<ErasedMarker<PropertyCodePointMap<'static, P>>>)
97     }
98 
99     /// Construct a new one from loaded data
100     ///
101     /// Typically it is preferable to use getters like [`load_general_category()`] instead
from_data<M>(data: DataPayload<M>) -> Self where M: DynamicDataMarker<DataStruct = PropertyCodePointMap<'static, T>>,102     pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self
103     where
104         M: DynamicDataMarker<DataStruct = PropertyCodePointMap<'static, T>>,
105     {
106         Self { data: data.cast() }
107     }
108 
109     /// Construct a new one an owned [`CodePointTrie`]
from_code_point_trie(trie: CodePointTrie<'static, T>) -> Self110     pub fn from_code_point_trie(trie: CodePointTrie<'static, T>) -> Self {
111         let set = PropertyCodePointMap::from_code_point_trie(trie);
112         CodePointMapData::from_data(
113             DataPayload::<ErasedMarker<PropertyCodePointMap<'static, T>>>::from_owned(set),
114         )
115     }
116 
117     /// Convert this type to a [`CodePointTrie`] as a borrowed value.
118     ///
119     /// The data backing this is extensible and supports multiple implementations.
120     /// Currently it is always [`CodePointTrie`]; however in the future more backends may be
121     /// added, and users may select which at data generation time.
122     ///
123     /// This method returns an `Option` in order to return `None` when the backing data provider
124     /// cannot return a [`CodePointTrie`], or cannot do so within the expected constant time
125     /// constraint.
as_code_point_trie(&self) -> Option<&CodePointTrie<'_, T>>126     pub fn as_code_point_trie(&self) -> Option<&CodePointTrie<'_, T>> {
127         self.data.get().as_code_point_trie()
128     }
129 
130     /// Convert this type to a [`CodePointTrie`], borrowing if possible,
131     /// otherwise allocating a new [`CodePointTrie`].
132     ///
133     /// The data backing this is extensible and supports multiple implementations.
134     /// Currently it is always [`CodePointTrie`]; however in the future more backends may be
135     /// added, and users may select which at data generation time.
136     ///
137     /// The performance of the conversion to this specific return type will vary
138     /// depending on the data structure that is backing `self`.
to_code_point_trie(&self) -> CodePointTrie<'_, T>139     pub fn to_code_point_trie(&self) -> CodePointTrie<'_, T> {
140         self.data.get().to_code_point_trie()
141     }
142 }
143 
144 /// A borrowed wrapper around code point set data, returned by
145 /// [`CodePointSetData::as_borrowed()`]. More efficient to query.
146 #[derive(Clone, Copy, Debug)]
147 pub struct CodePointMapDataBorrowed<'a, T: TrieValue> {
148     map: &'a PropertyCodePointMap<'a, T>,
149 }
150 
151 impl<'a, T: TrieValue> CodePointMapDataBorrowed<'a, T> {
152     /// Get the value this map has associated with code point `ch`
153     ///
154     /// # Example
155     ///
156     /// ```
157     /// use icu::properties::CodePointMapData;
158     /// use icu::properties::props::GeneralCategory;
159     ///
160     /// let gc = CodePointMapData::<GeneralCategory>::new();
161     ///
162     /// assert_eq!(gc.get('木'), GeneralCategory::OtherLetter);  // U+6728
163     /// assert_eq!(gc.get('��'), GeneralCategory::OtherSymbol);  // U+1F383 JACK-O-LANTERN
164     /// ```
get(self, ch: char) -> T165     pub fn get(self, ch: char) -> T {
166         self.map.get32(ch as u32)
167     }
168 
169     /// See [`Self::get`].
get32(self, ch: u32) -> T170     pub fn get32(self, ch: u32) -> T {
171         self.map.get32(ch)
172     }
173 
174     /// Get a [`CodePointSetData`] for all elements corresponding to a particular value
175     ///
176     /// # Example
177     ///
178     /// ```
179     /// use icu::properties::props::GeneralCategory;
180     /// use icu::properties::CodePointMapData;
181     ///
182     /// let gc = CodePointMapData::<GeneralCategory>::new();
183     ///
184     /// let other_letter_set_data =
185     ///     gc.get_set_for_value(GeneralCategory::OtherLetter);
186     /// let other_letter_set = other_letter_set_data.as_borrowed();
187     ///
188     /// assert!(other_letter_set.contains('木')); // U+6728
189     /// assert!(!other_letter_set.contains('��')); // U+1F383 JACK-O-LANTERN
190     /// ```
191     #[cfg(feature = "alloc")]
get_set_for_value(self, value: T) -> CodePointSetData192     pub fn get_set_for_value(self, value: T) -> CodePointSetData {
193         let set = self.map.get_set_for_value(value);
194         CodePointSetData::from_code_point_inversion_list(set)
195     }
196 
197     /// Yields an [`Iterator`] returning ranges of consecutive code points that
198     /// share the same value in the [`CodePointMapData`].
199     ///
200     /// # Examples
201     ///
202     /// ```
203     /// use icu::properties::props::GeneralCategory;
204     /// use icu::properties::CodePointMapData;
205     ///
206     /// let gc = CodePointMapData::<GeneralCategory>::new();
207     /// let mut ranges = gc.iter_ranges();
208     /// let next = ranges.next().unwrap();
209     /// assert_eq!(next.range, 0..=31);
210     /// assert_eq!(next.value, GeneralCategory::Control);
211     /// let next = ranges.next().unwrap();
212     /// assert_eq!(next.range, 32..=32);
213     /// assert_eq!(next.value, GeneralCategory::SpaceSeparator);
214     /// ```
iter_ranges(self) -> impl Iterator<Item = CodePointMapRange<T>> + 'a215     pub fn iter_ranges(self) -> impl Iterator<Item = CodePointMapRange<T>> + 'a {
216         self.map.iter_ranges()
217     }
218 
219     /// Yields an [`Iterator`] returning ranges of consecutive code points that
220     /// share the same value `v` in the [`CodePointMapData`].
221     ///
222     /// # Examples
223     ///
224     ///
225     /// ```
226     /// use icu::properties::props::GeneralCategory;
227     /// use icu::properties::CodePointMapData;
228     ///
229     /// let gc = CodePointMapData::<GeneralCategory>::new();
230     /// let mut ranges = gc.iter_ranges_for_value(GeneralCategory::UppercaseLetter);
231     /// assert_eq!(ranges.next().unwrap(), 'A' as u32..='Z' as u32);
232     /// assert_eq!(ranges.next().unwrap(), 'À' as u32..='Ö' as u32);
233     /// assert_eq!(ranges.next().unwrap(), 'Ø' as u32..='Þ' as u32);
234     /// ```
iter_ranges_for_value(self, val: T) -> impl Iterator<Item = RangeInclusive<u32>> + 'a235     pub fn iter_ranges_for_value(self, val: T) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
236         self.map
237             .iter_ranges()
238             .filter(move |r| r.value == val)
239             .map(|r| r.range)
240     }
241 
242     /// Yields an [`Iterator`] returning ranges of consecutive code points that
243     /// do *not* have the value `v` in the [`CodePointMapData`].
iter_ranges_for_value_complemented( self, val: T, ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a244     pub fn iter_ranges_for_value_complemented(
245         self,
246         val: T,
247     ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
248         self.map
249             .iter_ranges_mapped(move |value| value != val)
250             .filter(|v| v.value)
251             .map(|v| v.range)
252     }
253 
254     /// Exposed for FFI needs, could be exposed in general in the future but we should
255     /// have a use case first.
256     ///
257     /// FFI needs this since it operates on erased maps and can't use `iter_ranges_for_group()`
258     #[doc(hidden)] // used by FFI code
iter_ranges_mapped<U: Eq + 'a>( self, predicate: impl FnMut(T) -> U + Copy + 'a, ) -> impl Iterator<Item = CodePointMapRange<U>> + 'a259     pub fn iter_ranges_mapped<U: Eq + 'a>(
260         self,
261         predicate: impl FnMut(T) -> U + Copy + 'a,
262     ) -> impl Iterator<Item = CodePointMapRange<U>> + 'a {
263         self.map.iter_ranges_mapped(predicate)
264     }
265 }
266 
267 impl CodePointMapDataBorrowed<'_, GeneralCategory> {
268     /// TODO
269     #[cfg(feature = "alloc")]
get_set_for_value_group(self, value: GeneralCategoryGroup) -> crate::CodePointSetData270     pub fn get_set_for_value_group(self, value: GeneralCategoryGroup) -> crate::CodePointSetData {
271         let matching_gc_ranges = self
272             .iter_ranges()
273             .filter(|cpm_range| (1 << cpm_range.value as u32) & value.0 != 0)
274             .map(|cpm_range| cpm_range.range);
275         CodePointSetData::from_code_point_inversion_list(matching_gc_ranges.collect())
276     }
277 }
278 
#[cfg(feature = "compiled_data")]
impl<T: EnumeratedProperty> Default for CodePointMapDataBorrowed<'static, T> {
    /// Equivalent to [`CodePointMapDataBorrowed::new`].
    fn default() -> Self {
        CodePointMapDataBorrowed::new()
    }
}
285 
286 impl<T: TrieValue> CodePointMapDataBorrowed<'static, T> {
287     /// Creates a new [`CodePointMapDataBorrowed`] for a [`EnumeratedProperty`].
288     ///
289     /// See the documentation on [`EnumeratedProperty`] implementations for details.
290     ///
291     /// ✨ *Enabled with the `compiled_data` Cargo feature.*
292     ///
293     /// [�� Help choosing a constructor](icu_provider::constructors)
294     #[cfg(feature = "compiled_data")]
new() -> Self where T: EnumeratedProperty,295     pub const fn new() -> Self
296     where
297         T: EnumeratedProperty,
298     {
299         CodePointMapDataBorrowed { map: T::SINGLETON }
300     }
301 
302     /// Cheaply converts a [`CodePointMapDataBorrowed<'static>`] into a [`CodePointMapData`].
303     ///
304     /// Note: Due to branching and indirection, using [`CodePointMapData`] might inhibit some
305     /// compile-time optimizations that are possible with [`CodePointMapDataBorrowed`].
static_to_owned(self) -> CodePointMapData<T>306     pub const fn static_to_owned(self) -> CodePointMapData<T> {
307         CodePointMapData {
308             data: DataPayload::from_static_ref(self.map),
309         }
310     }
311 }
312 
313 impl<'a> CodePointMapDataBorrowed<'a, GeneralCategory> {
314     /// Yields an [`Iterator`] returning ranges of consecutive code points that
315     /// have a `General_Category` value belonging to the specified [`GeneralCategoryGroup`]
316     ///
317     /// # Examples
318     ///
319     /// ```
320     /// use icu::properties::props::{GeneralCategory, GeneralCategoryGroup};
321     /// use icu::properties::CodePointMapData;
322     ///
323     /// let gc = CodePointMapData::<GeneralCategory>::new();
324     /// let mut ranges = gc.iter_ranges_for_group(GeneralCategoryGroup::Letter);
325     /// assert_eq!(ranges.next().unwrap(), 'A' as u32..='Z' as u32);
326     /// assert_eq!(ranges.next().unwrap(), 'a' as u32..='z' as u32);
327     /// assert_eq!(ranges.next().unwrap(), 'ª' as u32..='ª' as u32);
328     /// assert_eq!(ranges.next().unwrap(), 'µ' as u32..='µ' as u32);
329     /// assert_eq!(ranges.next().unwrap(), 'º' as u32..='º' as u32);
330     /// assert_eq!(ranges.next().unwrap(), 'À' as u32..='Ö' as u32);
331     /// assert_eq!(ranges.next().unwrap(), 'Ø' as u32..='ö' as u32);
332     /// ```
iter_ranges_for_group( self, group: GeneralCategoryGroup, ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a333     pub fn iter_ranges_for_group(
334         self,
335         group: GeneralCategoryGroup,
336     ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
337         self.map
338             .iter_ranges_mapped(move |value| group.contains(value))
339             .filter(|v| v.value)
340             .map(|v| v.range)
341     }
342 }
343 
344 /// A Unicode character property that assigns a value to each code point.
345 ///
346 /// The descriptions of most properties are taken from [`TR44`], the documentation for the
347 /// Unicode Character Database.
348 ///
349 /// <div class="stab unstable">
350 /// �� This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
351 /// trait, please consider using a type from the implementors listed below.
352 /// </div>
353 ///
354 /// [`TR44`]: https://www.unicode.org/reports/tr44
355 pub trait EnumeratedProperty: crate::private::Sealed + TrieValue {
356     #[doc(hidden)]
357     type DataMarker: DataMarker<DataStruct = PropertyCodePointMap<'static, Self>>;
358     #[doc(hidden)]
359     #[cfg(feature = "compiled_data")]
360     const SINGLETON: &'static PropertyCodePointMap<'static, Self>;
361     /// The name of this property
362     const NAME: &'static [u8];
363     /// The abbreviated name of this property, if it exists, otherwise the name
364     const SHORT_NAME: &'static [u8];
365 
366     /// Convenience method for `CodePointMapData::new().get(ch)`
367     ///
368     /// ✨ *Enabled with the `compiled_data` Cargo feature.*
369     #[cfg(feature = "compiled_data")]
for_char(ch: char) -> Self370     fn for_char(ch: char) -> Self {
371         CodePointMapData::new().get(ch)
372     }
373 }
374