• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4 
5 use crate::provider::*;
6 use core::ops::RangeInclusive;
7 use icu_collections::codepointinvlist::CodePointInversionList;
8 use icu_provider::marker::ErasedMarker;
9 use icu_provider::prelude::*;
10 
11 /// A set of Unicode code points. Access its data via the borrowed version,
12 /// [`CodePointSetDataBorrowed`], obtained with [`CodePointSetData::as_borrowed()`].
13 ///
14 /// # Examples
15 /// ```rust
16 /// use icu::properties::CodePointSetData;
17 /// use icu::properties::props::Alphabetic;
18 ///
19 /// let alphabetic = CodePointSetData::new::<Alphabetic>(); // already the borrowed, queryable view
20 ///
21 /// assert!(!alphabetic.contains('3'));
22 /// assert!(!alphabetic.contains('੩'));  // U+0A69 GURMUKHI DIGIT THREE
23 /// assert!(alphabetic.contains('A'));
24 /// assert!(alphabetic.contains('Ä'));  // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS
25 /// ```
26 #[derive(Debug)]
27 pub struct CodePointSetData {
28     data: DataPayload<ErasedMarker<PropertyCodePointSet<'static>>>,
29 }
30 
31 impl CodePointSetData {
32     /// Creates a new [`CodePointSetDataBorrowed`] for a [`BinaryProperty`], using compiled data.
33     ///
34     /// ✨ *Enabled with the `compiled_data` Cargo feature.*
35     ///
36     /// [📚 Help choosing a constructor](icu_provider::constructors)
37     #[allow(clippy::new_ret_no_self)] // `new` deliberately returns the borrowed type
38     #[cfg(feature = "compiled_data")]
new<P: BinaryProperty>() -> CodePointSetDataBorrowed<'static>39     pub const fn new<P: BinaryProperty>() -> CodePointSetDataBorrowed<'static> {
40         CodePointSetDataBorrowed::new::<P>()
41     }
42 
43     #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
try_new_unstable<P: BinaryProperty>( provider: &(impl DataProvider<P::DataMarker> + ?Sized), ) -> Result<CodePointSetData, DataError>44     pub fn try_new_unstable<P: BinaryProperty>(
45         provider: &(impl DataProvider<P::DataMarker> + ?Sized),
46     ) -> Result<CodePointSetData, DataError> {
47         Ok(CodePointSetData::from_data(
48             provider.load(Default::default())?.payload, // default request; a load error is propagated by `?`
49         ))
50     }
51 
52     /// Construct a borrowed version of this type that can be queried.
53     ///
54     /// This owned version is returned by functions that use a runtime data provider.
55     #[inline]
as_borrowed(&self) -> CodePointSetDataBorrowed<'_>56     pub fn as_borrowed(&self) -> CodePointSetDataBorrowed<'_> {
57         CodePointSetDataBorrowed {
58             set: self.data.get(),
59         }
60     }
61 
62     /// Construct a new one from loaded data.
63     ///
64     /// Typically it is preferable to use a constructor such as [`Self::try_new_unstable`] instead.
from_data<M>(data: DataPayload<M>) -> Self where M: DynamicDataMarker<DataStruct = PropertyCodePointSet<'static>>,65     pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self
66     where
67         M: DynamicDataMarker<DataStruct = PropertyCodePointSet<'static>>,
68     {
69         Self { data: data.cast() } // re-tag the payload with the erased marker used by the `data` field
70     }
71 
72     /// Construct a new [`CodePointSetData`] that owns the given [`CodePointInversionList`].
from_code_point_inversion_list(set: CodePointInversionList<'static>) -> Self73     pub fn from_code_point_inversion_list(set: CodePointInversionList<'static>) -> Self {
74         let set = PropertyCodePointSet::from_code_point_inversion_list(set);
75         CodePointSetData::from_data(
76             DataPayload::<ErasedMarker<PropertyCodePointSet<'static>>>::from_owned(set),
77         )
78     }
79 
80     /// Convert this type to a [`CodePointInversionList`] as a borrowed value.
81     ///
82     /// The data backing this is extensible and supports multiple implementations.
83     /// Currently it is always [`CodePointInversionList`]; however in the future more backends may be
84     /// added, and users may select which at data generation time.
85     ///
86     /// This method returns an `Option` in order to return `None` when the backing data provider
87     /// cannot return a [`CodePointInversionList`], or cannot do so within the expected constant time
88     /// constraint.
as_code_point_inversion_list(&self) -> Option<&CodePointInversionList<'_>>89     pub fn as_code_point_inversion_list(&self) -> Option<&CodePointInversionList<'_>> {
90         self.data.get().as_code_point_inversion_list()
91     }
92 
93     /// Convert this type to a [`CodePointInversionList`], borrowing if possible,
94     /// otherwise allocating a new [`CodePointInversionList`].
95     ///
96     /// The data backing this is extensible and supports multiple implementations.
97     /// Currently it is always [`CodePointInversionList`]; however in the future more backends may be
98     /// added, and users may select which at data generation time.
99     ///
100     /// The performance of the conversion to this specific return type will vary
101     /// depending on the data structure that is backing `self`.
to_code_point_inversion_list(&self) -> CodePointInversionList<'_>102     pub fn to_code_point_inversion_list(&self) -> CodePointInversionList<'_> {
103         self.data.get().to_code_point_inversion_list()
104     }
105 }
106 
107 /// A borrowed wrapper around code point set data, returned by
108 /// [`CodePointSetData::as_borrowed()`]. More efficient to query, and `Copy`, so it is passed by value.
109 #[derive(Clone, Copy, Debug)]
110 pub struct CodePointSetDataBorrowed<'a> {
111     set: &'a PropertyCodePointSet<'a>,
112 }
113 
114 impl CodePointSetDataBorrowed<'static> {
115     /// Creates a new [`CodePointSetDataBorrowed`] for a [`BinaryProperty`], using compiled data.
116     ///
117     /// ✨ *Enabled with the `compiled_data` Cargo feature.*
118     ///
119     /// [📚 Help choosing a constructor](icu_provider::constructors)
120     #[inline]
121     #[cfg(feature = "compiled_data")]
new<P: BinaryProperty>() -> Self122     pub const fn new<P: BinaryProperty>() -> Self {
123         CodePointSetDataBorrowed { set: P::SINGLETON }
124     }
125     /// Cheaply converts a [`CodePointSetDataBorrowed<'static>`] into a [`CodePointSetData`].
126     ///
127     /// Note: Due to branching and indirection, using [`CodePointSetData`] might inhibit some
128     /// compile-time optimizations that are possible with [`CodePointSetDataBorrowed`].
static_to_owned(self) -> CodePointSetData129     pub const fn static_to_owned(self) -> CodePointSetData {
130         CodePointSetData {
131             data: DataPayload::from_static_ref(self.set),
132         }
133     }
134 }
135 
136 impl<'a> CodePointSetDataBorrowed<'a> {
137     /// Check if the set contains a character.
138     ///
139     /// ```rust
140     /// use icu::properties::CodePointSetData;
141     /// use icu::properties::props::Alphabetic;
142     ///
143     /// let alphabetic = CodePointSetData::new::<Alphabetic>();
144     ///
145     /// assert!(!alphabetic.contains('3'));
146     /// assert!(!alphabetic.contains('੩'));  // U+0A69 GURMUKHI DIGIT THREE
147     /// assert!(alphabetic.contains('A'));
148     /// assert!(alphabetic.contains('Ä'));  // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS
149     /// ```
150     #[inline]
contains(self, ch: char) -> bool151     pub fn contains(self, ch: char) -> bool {
152         self.set.contains(ch)
153     }
154 
155     /// See [`Self::contains`]. This variant takes the code point as a `u32`.
156     #[inline]
contains32(self, ch: u32) -> bool157     pub fn contains32(self, ch: u32) -> bool {
158         self.set.contains32(ch)
159     }
160 
161     /// Yields an [`Iterator`] returning the ranges of the code points that are
162     /// included in the [`CodePointSetData`]
163     ///
164     /// Ranges are returned as [`RangeInclusive`], which is inclusive of its
165     /// `end` bound value. An end-inclusive behavior matches the ICU4C/J
166     /// behavior of ranges, ex: `UnicodeSet::contains(UChar32 start, UChar32 end)`.
167     ///
168     /// # Example
169     ///
170     /// ```
171     /// use icu::properties::props::Alphabetic;
172     /// use icu::properties::CodePointSetData;
173     ///
174     /// let alphabetic = CodePointSetData::new::<Alphabetic>();
175     /// let mut ranges = alphabetic.iter_ranges();
176     ///
177     /// assert_eq!(Some(0x0041..=0x005A), ranges.next()); // 'A'..'Z'
178     /// assert_eq!(Some(0x0061..=0x007A), ranges.next()); // 'a'..'z'
179     /// ```
180     #[inline]
iter_ranges(self) -> impl Iterator<Item = RangeInclusive<u32>> + 'a181     pub fn iter_ranges(self) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
182         self.set.iter_ranges()
183     }
184 
185     /// Yields an [`Iterator`] returning the ranges of the code points that are
186     /// *not* included in the [`CodePointSetData`]
187     ///
188     /// Ranges are returned as [`RangeInclusive`], which is inclusive of its
189     /// `end` bound value. An end-inclusive behavior matches the ICU4C/J
190     /// behavior of ranges, ex: `UnicodeSet::contains(UChar32 start, UChar32 end)`.
191     ///
192     /// # Example
193     ///
194     /// ```
195     /// use icu::properties::props::Alphabetic;
196     /// use icu::properties::CodePointSetData;
197     ///
198     /// let alphabetic = CodePointSetData::new::<Alphabetic>();
199     /// let mut ranges = alphabetic.iter_ranges_complemented();
200     ///
201     /// assert_eq!(Some(0x0000..=0x0040), ranges.next()); // everything before 'A'
202     /// assert_eq!(Some(0x005B..=0x0060), ranges.next()); // the gap between 'Z' and 'a'
203     /// ```
204     #[inline]
iter_ranges_complemented(self) -> impl Iterator<Item = RangeInclusive<u32>> + 'a205     pub fn iter_ranges_complemented(self) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
206         self.set.iter_ranges_complemented()
207     }
208 }
209 
210 /// A binary Unicode character property.
211 ///
212 /// The descriptions of most properties are taken from [`TR44`], the documentation for the
213 /// Unicode Character Database.  Some properties are instead defined in [`TR18`], the
214 /// documentation for Unicode regular expressions. In particular, Annex C of this document
215 /// defines properties for POSIX compatibility.
216 ///
217 /// <div class="stab unstable">
218 /// 🚫 This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
219 /// trait, please consider using a type from the implementors listed below.
220 /// </div>
221 ///
222 /// [`TR44`]: https://www.unicode.org/reports/tr44
223 /// [`TR18`]: https://www.unicode.org/reports/tr18
224 pub trait BinaryProperty: crate::private::Sealed + Sized {
225     #[doc(hidden)]
226     type DataMarker: DataMarker<DataStruct = PropertyCodePointSet<'static>>;
227     #[doc(hidden)]
228     #[cfg(feature = "compiled_data")]
229     const SINGLETON: &'static PropertyCodePointSet<'static>;
230     /// The name of this property
231     const NAME: &'static [u8];
232     /// The abbreviated name of this property, if it exists, otherwise the name
233     const SHORT_NAME: &'static [u8];
234 
235     /// Convenience method for `CodePointSetData::new::<Self>().contains(ch)`
236     ///
237     /// ✨ *Enabled with the `compiled_data` Cargo feature.*
238     #[cfg(feature = "compiled_data")]
for_char(ch: char) -> bool239     fn for_char(ch: char) -> bool {
240         CodePointSetData::new::<Self>().contains(ch)
241     }
242 }
243 
244 #[cfg(test)]
245 mod tests {
246     #[test]
test_general_category()247     fn test_general_category() {
248         use icu::properties::props::GeneralCategory;
249         use icu::properties::props::GeneralCategoryGroup;
250         use icu::properties::CodePointMapData;
251 
252         let digits_data = CodePointMapData::<GeneralCategory>::new()
253             .get_set_for_value_group(GeneralCategoryGroup::Number);
254         let digits = digits_data.as_borrowed();
255 
256         assert!(digits.contains('5'));
257         assert!(digits.contains('\u{0665}')); // U+0665 ARABIC-INDIC DIGIT FIVE
258         assert!(digits.contains('\u{096b}')); // U+096B DEVANAGARI DIGIT FIVE
259 
260         assert!(!digits.contains('A'));
261     }
262 
263     #[test]
test_script()264     fn test_script() {
265         use icu::properties::props::Script;
266         use icu::properties::CodePointMapData;
267 
268         let thai_data = CodePointMapData::<Script>::new().get_set_for_value(Script::Thai);
269         let thai = thai_data.as_borrowed();
270 
271         assert!(thai.contains('\u{0e01}')); // U+0E01 THAI CHARACTER KO KAI
272         assert!(thai.contains('\u{0e50}')); // U+0E50 THAI DIGIT ZERO
273 
274         assert!(!thai.contains('A'));
275         assert!(!thai.contains('\u{0e3f}')); // U+0E3F THAI CURRENCY SYMBOL BAHT
276     }
277 
278     #[test]
test_gc_groupings()279     fn test_gc_groupings() {
280         use icu::properties::props::{GeneralCategory, GeneralCategoryGroup};
281         use icu::properties::CodePointMapData;
282         use icu_collections::codepointinvlist::CodePointInversionListBuilder;
283 
284         let test_group = |category: GeneralCategoryGroup, subcategories: &[GeneralCategory]| {
285             let category_set =
286                 CodePointMapData::<GeneralCategory>::new().get_set_for_value_group(category);
287             let category_set = category_set
288                 .as_code_point_inversion_list()
289                 .expect("The data should be valid");
290 
291             let mut builder = CodePointInversionListBuilder::new();
292             for &subcategory in subcategories {
293                 let gc_set_data =
294                     CodePointMapData::<GeneralCategory>::new().get_set_for_value(subcategory);
295                 let gc_set = gc_set_data.as_borrowed();
296                 for range in gc_set.iter_ranges() {
297                     builder.add_range32(range);
298                 }
299             }
300             let combined_set = builder.build();
301             println!("{category:?} {subcategories:?}");
302             assert_eq!(
303                 category_set.get_inversion_list_vec(),
304                 combined_set.get_inversion_list_vec()
305             );
306         };
307 
308         test_group(
309             GeneralCategoryGroup::Letter,
310             &[
311                 GeneralCategory::UppercaseLetter,
312                 GeneralCategory::LowercaseLetter,
313                 GeneralCategory::TitlecaseLetter,
314                 GeneralCategory::ModifierLetter,
315                 GeneralCategory::OtherLetter,
316             ],
317         );
318         test_group(
319             GeneralCategoryGroup::Other,
320             &[
321                 GeneralCategory::Control,
322                 GeneralCategory::Format,
323                 GeneralCategory::Unassigned,
324                 GeneralCategory::PrivateUse,
325                 GeneralCategory::Surrogate,
326             ],
327         );
328         test_group(
329             GeneralCategoryGroup::Mark,
330             &[
331                 GeneralCategory::SpacingMark,
332                 GeneralCategory::EnclosingMark,
333                 GeneralCategory::NonspacingMark,
334             ],
335         );
336         test_group(
337             GeneralCategoryGroup::Number,
338             &[
339                 GeneralCategory::DecimalNumber,
340                 GeneralCategory::LetterNumber,
341                 GeneralCategory::OtherNumber,
342             ],
343         );
344         test_group(
345             GeneralCategoryGroup::Punctuation,
346             &[
347                 GeneralCategory::ConnectorPunctuation,
348                 GeneralCategory::DashPunctuation,
349                 GeneralCategory::ClosePunctuation,
350                 GeneralCategory::FinalPunctuation,
351                 GeneralCategory::InitialPunctuation,
352                 GeneralCategory::OtherPunctuation,
353                 GeneralCategory::OpenPunctuation,
354             ],
355         );
356         test_group(
357             GeneralCategoryGroup::Symbol,
358             &[
359                 GeneralCategory::CurrencySymbol,
360                 GeneralCategory::ModifierSymbol,
361                 GeneralCategory::MathSymbol,
362                 GeneralCategory::OtherSymbol,
363             ],
364         );
365         test_group(
366             GeneralCategoryGroup::Separator,
367             &[
368                 GeneralCategory::LineSeparator,
369                 GeneralCategory::ParagraphSeparator,
370                 GeneralCategory::SpaceSeparator,
371             ],
372         );
373     }
374 
375     #[test]
test_gc_surrogate()376     fn test_gc_surrogate() {
377         use icu::properties::props::GeneralCategory;
378         use icu::properties::CodePointMapData;
379 
380         let surrogates_data = CodePointMapData::<GeneralCategory>::new()
381             .get_set_for_value(GeneralCategory::Surrogate);
382         let surrogates = surrogates_data.as_borrowed();
383 
384         assert!(surrogates.contains32(0xd800));
385         assert!(surrogates.contains32(0xd900));
386         assert!(surrogates.contains32(0xdfff));
387 
388         assert!(!surrogates.contains('A'));
389     }
390 }
391