1 // This file is part of ICU4X. For terms of use, please see the file 2 // called LICENSE at the top level of the ICU4X source tree 3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 4 5 use crate::provider::*; 6 use core::ops::RangeInclusive; 7 use icu_collections::codepointinvlist::CodePointInversionList; 8 use icu_provider::marker::ErasedMarker; 9 use icu_provider::prelude::*; 10 11 /// A set of Unicode code points. Access its data via the borrowed version, 12 /// [`CodePointSetDataBorrowed`]. 13 /// 14 /// # Example 15 /// ```rust 16 /// use icu::properties::CodePointSetData; 17 /// use icu::properties::props::Alphabetic; 18 /// 19 /// let alphabetic = CodePointSetData::new::<Alphabetic>(); 20 /// 21 /// assert!(!alphabetic.contains('3')); 22 /// assert!(!alphabetic.contains('੩')); // U+0A69 GURMUKHI DIGIT THREE 23 /// assert!(alphabetic.contains('A')); 24 /// assert!(alphabetic.contains('Ä')); // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS 25 /// ``` 26 #[derive(Debug)] 27 pub struct CodePointSetData { 28 data: DataPayload<ErasedMarker<PropertyCodePointSet<'static>>>, 29 } 30 31 impl CodePointSetData { 32 /// Creates a new [`CodePointSetDataBorrowed`] for a [`BinaryProperty`]. 33 /// 34 /// ✨ *Enabled with the `compiled_data` Cargo feature.* 35 /// 36 /// [ Help choosing a constructor](icu_provider::constructors) 37 #[allow(clippy::new_ret_no_self)] 38 #[cfg(feature = "compiled_data")] new<P: BinaryProperty>() -> CodePointSetDataBorrowed<'static>39 pub const fn new<P: BinaryProperty>() -> CodePointSetDataBorrowed<'static> { 40 CodePointSetDataBorrowed::new::<P>() 41 } 42 43 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)] try_new_unstable<P: BinaryProperty>( provider: &(impl DataProvider<P::DataMarker> + ?Sized), ) -> Result<CodePointSetData, DataError>44 pub fn try_new_unstable<P: BinaryProperty>( 45 provider: &(impl DataProvider<P::DataMarker> + ?Sized), 46 ) -> Result<CodePointSetData, DataError> { 47 Ok(CodePointSetData::from_data( 48 provider.load(Default::default())?.payload, 49 )) 50 } 51 52 /// Construct a borrowed version of this type that can be queried. 53 /// 54 /// This owned version if returned by functions that use a runtime data provider. 55 #[inline] as_borrowed(&self) -> CodePointSetDataBorrowed<'_>56 pub fn as_borrowed(&self) -> CodePointSetDataBorrowed<'_> { 57 CodePointSetDataBorrowed { 58 set: self.data.get(), 59 } 60 } 61 62 /// Construct a new one from loaded data 63 /// 64 /// Typically it is preferable to use getters like [`load_ascii_hex_digit()`] instead from_data<M>(data: DataPayload<M>) -> Self where M: DynamicDataMarker<DataStruct = PropertyCodePointSet<'static>>,65 pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self 66 where 67 M: DynamicDataMarker<DataStruct = PropertyCodePointSet<'static>>, 68 { 69 Self { data: data.cast() } 70 } 71 72 /// Construct a new owned [`CodePointInversionList`] from_code_point_inversion_list(set: CodePointInversionList<'static>) -> Self73 pub fn from_code_point_inversion_list(set: CodePointInversionList<'static>) -> Self { 74 let set = PropertyCodePointSet::from_code_point_inversion_list(set); 75 CodePointSetData::from_data( 76 DataPayload::<ErasedMarker<PropertyCodePointSet<'static>>>::from_owned(set), 77 ) 78 } 79 80 /// Convert this type to a [`CodePointInversionList`] as a borrowed value. 81 /// 82 /// The data backing this is extensible and supports multiple implementations. 83 /// Currently it is always [`CodePointInversionList`]; however in the future more backends may be 84 /// added, and users may select which at data generation time. 85 /// 86 /// This method returns an `Option` in order to return `None` when the backing data provider 87 /// cannot return a [`CodePointInversionList`], or cannot do so within the expected constant time 88 /// constraint. as_code_point_inversion_list(&self) -> Option<&CodePointInversionList<'_>>89 pub fn as_code_point_inversion_list(&self) -> Option<&CodePointInversionList<'_>> { 90 self.data.get().as_code_point_inversion_list() 91 } 92 93 /// Convert this type to a [`CodePointInversionList`], borrowing if possible, 94 /// otherwise allocating a new [`CodePointInversionList`]. 95 /// 96 /// The data backing this is extensible and supports multiple implementations. 97 /// Currently it is always [`CodePointInversionList`]; however in the future more backends may be 98 /// added, and users may select which at data generation time. 99 /// 100 /// The performance of the conversion to this specific return type will vary 101 /// depending on the data structure that is backing `self`. to_code_point_inversion_list(&self) -> CodePointInversionList<'_>102 pub fn to_code_point_inversion_list(&self) -> CodePointInversionList<'_> { 103 self.data.get().to_code_point_inversion_list() 104 } 105 } 106 107 /// A borrowed wrapper around code point set data, returned by 108 /// [`CodePointSetData::as_borrowed()`]. More efficient to query. 109 #[derive(Clone, Copy, Debug)] 110 pub struct CodePointSetDataBorrowed<'a> { 111 set: &'a PropertyCodePointSet<'a>, 112 } 113 114 impl CodePointSetDataBorrowed<'static> { 115 /// Creates a new [`CodePointSetData`] for a [`BinaryProperty`]. 116 /// 117 /// ✨ *Enabled with the `compiled_data` Cargo feature.* 118 /// 119 /// [ Help choosing a constructor](icu_provider::constructors) 120 #[inline] 121 #[cfg(feature = "compiled_data")] new<P: BinaryProperty>() -> Self122 pub const fn new<P: BinaryProperty>() -> Self { 123 CodePointSetDataBorrowed { set: P::SINGLETON } 124 } 125 /// Cheaply converts a [`CodePointSetDataBorrowed<'static>`] into a [`CodePointSetData`]. 126 /// 127 /// Note: Due to branching and indirection, using [`CodePointSetData`] might inhibit some 128 /// compile-time optimizations that are possible with [`CodePointSetDataBorrowed`]. static_to_owned(self) -> CodePointSetData129 pub const fn static_to_owned(self) -> CodePointSetData { 130 CodePointSetData { 131 data: DataPayload::from_static_ref(self.set), 132 } 133 } 134 } 135 136 impl<'a> CodePointSetDataBorrowed<'a> { 137 /// Check if the set contains a character 138 /// 139 /// ```rust 140 /// use icu::properties::CodePointSetData; 141 /// use icu::properties::props::Alphabetic; 142 /// 143 /// let alphabetic = CodePointSetData::new::<Alphabetic>(); 144 /// 145 /// assert!(!alphabetic.contains('3')); 146 /// assert!(!alphabetic.contains('੩')); // U+0A69 GURMUKHI DIGIT THREE 147 /// assert!(alphabetic.contains('A')); 148 /// assert!(alphabetic.contains('Ä')); // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS 149 /// ``` 150 #[inline] contains(self, ch: char) -> bool151 pub fn contains(self, ch: char) -> bool { 152 self.set.contains(ch) 153 } 154 155 /// See [`Self::contains`]. 156 #[inline] contains32(self, ch: u32) -> bool157 pub fn contains32(self, ch: u32) -> bool { 158 self.set.contains32(ch) 159 } 160 161 // Yields an [`Iterator`] returning the ranges of the code points that are 162 /// included in the [`CodePointSetData`] 163 /// 164 /// Ranges are returned as [`RangeInclusive`], which is inclusive of its 165 /// `end` bound value. An end-inclusive behavior matches the ICU4C/J 166 /// behavior of ranges, ex: `UnicodeSet::contains(UChar32 start, UChar32 end)`. 167 /// 168 /// # Example 169 /// 170 /// ``` 171 /// use icu::properties::props::Alphabetic; 172 /// use icu::properties::CodePointSetData; 173 /// 174 /// let alphabetic = CodePointSetData::new::<Alphabetic>(); 175 /// let mut ranges = alphabetic.iter_ranges(); 176 /// 177 /// assert_eq!(Some(0x0041..=0x005A), ranges.next()); // 'A'..'Z' 178 /// assert_eq!(Some(0x0061..=0x007A), ranges.next()); // 'a'..'z' 179 /// ``` 180 #[inline] iter_ranges(self) -> impl Iterator<Item = RangeInclusive<u32>> + 'a181 pub fn iter_ranges(self) -> impl Iterator<Item = RangeInclusive<u32>> + 'a { 182 self.set.iter_ranges() 183 } 184 185 // Yields an [`Iterator`] returning the ranges of the code points that are 186 /// *not* included in the [`CodePointSetData`] 187 /// 188 /// Ranges are returned as [`RangeInclusive`], which is inclusive of its 189 /// `end` bound value. An end-inclusive behavior matches the ICU4C/J 190 /// behavior of ranges, ex: `UnicodeSet::contains(UChar32 start, UChar32 end)`. 191 /// 192 /// # Example 193 /// 194 /// ``` 195 /// use icu::properties::props::Alphabetic; 196 /// use icu::properties::CodePointSetData; 197 /// 198 /// let alphabetic = CodePointSetData::new::<Alphabetic>(); 199 /// let mut ranges = alphabetic.iter_ranges(); 200 /// 201 /// assert_eq!(Some(0x0041..=0x005A), ranges.next()); // 'A'..'Z' 202 /// assert_eq!(Some(0x0061..=0x007A), ranges.next()); // 'a'..'z' 203 /// ``` 204 #[inline] iter_ranges_complemented(self) -> impl Iterator<Item = RangeInclusive<u32>> + 'a205 pub fn iter_ranges_complemented(self) -> impl Iterator<Item = RangeInclusive<u32>> + 'a { 206 self.set.iter_ranges_complemented() 207 } 208 } 209 210 /// A binary Unicode character property. 211 /// 212 /// The descriptions of most properties are taken from [`TR44`], the documentation for the 213 /// Unicode Character Database. Some properties are instead defined in [`TR18`], the 214 /// documentation for Unicode regular expressions. In particular, Annex C of this document 215 /// defines properties for POSIX compatibility. 216 /// 217 /// <div class="stab unstable"> 218 /// This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this 219 /// trait, please consider using a type from the implementors listed below. 220 /// </div> 221 /// 222 /// [`TR44`]: https://www.unicode.org/reports/tr44 223 /// [`TR18`]: https://www.unicode.org/reports/tr18 224 pub trait BinaryProperty: crate::private::Sealed + Sized { 225 #[doc(hidden)] 226 type DataMarker: DataMarker<DataStruct = PropertyCodePointSet<'static>>; 227 #[doc(hidden)] 228 #[cfg(feature = "compiled_data")] 229 const SINGLETON: &'static PropertyCodePointSet<'static>; 230 /// The name of this property 231 const NAME: &'static [u8]; 232 /// The abbreviated name of this property, if it exists, otherwise the name 233 const SHORT_NAME: &'static [u8]; 234 235 /// Convenience method for `CodePointSetData::new().contains(ch)` 236 /// 237 /// ✨ *Enabled with the `compiled_data` Cargo feature.* 238 #[cfg(feature = "compiled_data")] for_char(ch: char) -> bool239 fn for_char(ch: char) -> bool { 240 CodePointSetData::new::<Self>().contains(ch) 241 } 242 } 243 244 #[cfg(test)] 245 mod tests { 246 #[test] test_general_category()247 fn test_general_category() { 248 use icu::properties::props::GeneralCategory; 249 use icu::properties::props::GeneralCategoryGroup; 250 use icu::properties::CodePointMapData; 251 252 let digits_data = CodePointMapData::<GeneralCategory>::new() 253 .get_set_for_value_group(GeneralCategoryGroup::Number); 254 let digits = digits_data.as_borrowed(); 255 256 assert!(digits.contains('5')); 257 assert!(digits.contains('\u{0665}')); // U+0665 ARABIC-INDIC DIGIT FIVE 258 assert!(digits.contains('\u{096b}')); // U+0969 DEVANAGARI DIGIT FIVE 259 260 assert!(!digits.contains('A')); 261 } 262 263 #[test] test_script()264 fn test_script() { 265 use icu::properties::props::Script; 266 use icu::properties::CodePointMapData; 267 268 let thai_data = CodePointMapData::<Script>::new().get_set_for_value(Script::Thai); 269 let thai = thai_data.as_borrowed(); 270 271 assert!(thai.contains('\u{0e01}')); // U+0E01 THAI CHARACTER KO KAI 272 assert!(thai.contains('\u{0e50}')); // U+0E50 THAI DIGIT ZERO 273 274 assert!(!thai.contains('A')); 275 assert!(!thai.contains('\u{0e3f}')); // U+0E50 THAI CURRENCY SYMBOL BAHT 276 } 277 278 #[test] test_gc_groupings()279 fn test_gc_groupings() { 280 use icu::properties::props::{GeneralCategory, GeneralCategoryGroup}; 281 use icu::properties::CodePointMapData; 282 use icu_collections::codepointinvlist::CodePointInversionListBuilder; 283 284 let test_group = |category: GeneralCategoryGroup, subcategories: &[GeneralCategory]| { 285 let category_set = 286 CodePointMapData::<GeneralCategory>::new().get_set_for_value_group(category); 287 let category_set = category_set 288 .as_code_point_inversion_list() 289 .expect("The data should be valid"); 290 291 let mut builder = CodePointInversionListBuilder::new(); 292 for &subcategory in subcategories { 293 let gc_set_data = 294 CodePointMapData::<GeneralCategory>::new().get_set_for_value(subcategory); 295 let gc_set = gc_set_data.as_borrowed(); 296 for range in gc_set.iter_ranges() { 297 builder.add_range32(range); 298 } 299 } 300 let combined_set = builder.build(); 301 println!("{category:?} {subcategories:?}"); 302 assert_eq!( 303 category_set.get_inversion_list_vec(), 304 combined_set.get_inversion_list_vec() 305 ); 306 }; 307 308 test_group( 309 GeneralCategoryGroup::Letter, 310 &[ 311 GeneralCategory::UppercaseLetter, 312 GeneralCategory::LowercaseLetter, 313 GeneralCategory::TitlecaseLetter, 314 GeneralCategory::ModifierLetter, 315 GeneralCategory::OtherLetter, 316 ], 317 ); 318 test_group( 319 GeneralCategoryGroup::Other, 320 &[ 321 GeneralCategory::Control, 322 GeneralCategory::Format, 323 GeneralCategory::Unassigned, 324 GeneralCategory::PrivateUse, 325 GeneralCategory::Surrogate, 326 ], 327 ); 328 test_group( 329 GeneralCategoryGroup::Mark, 330 &[ 331 GeneralCategory::SpacingMark, 332 GeneralCategory::EnclosingMark, 333 GeneralCategory::NonspacingMark, 334 ], 335 ); 336 test_group( 337 GeneralCategoryGroup::Number, 338 &[ 339 GeneralCategory::DecimalNumber, 340 GeneralCategory::LetterNumber, 341 GeneralCategory::OtherNumber, 342 ], 343 ); 344 test_group( 345 GeneralCategoryGroup::Punctuation, 346 &[ 347 GeneralCategory::ConnectorPunctuation, 348 GeneralCategory::DashPunctuation, 349 GeneralCategory::ClosePunctuation, 350 GeneralCategory::FinalPunctuation, 351 GeneralCategory::InitialPunctuation, 352 GeneralCategory::OtherPunctuation, 353 GeneralCategory::OpenPunctuation, 354 ], 355 ); 356 test_group( 357 GeneralCategoryGroup::Symbol, 358 &[ 359 GeneralCategory::CurrencySymbol, 360 GeneralCategory::ModifierSymbol, 361 GeneralCategory::MathSymbol, 362 GeneralCategory::OtherSymbol, 363 ], 364 ); 365 test_group( 366 GeneralCategoryGroup::Separator, 367 &[ 368 GeneralCategory::LineSeparator, 369 GeneralCategory::ParagraphSeparator, 370 GeneralCategory::SpaceSeparator, 371 ], 372 ); 373 } 374 375 #[test] test_gc_surrogate()376 fn test_gc_surrogate() { 377 use icu::properties::props::GeneralCategory; 378 use icu::properties::CodePointMapData; 379 380 let surrogates_data = CodePointMapData::<GeneralCategory>::new() 381 .get_set_for_value(GeneralCategory::Surrogate); 382 let surrogates = surrogates_data.as_borrowed(); 383 384 assert!(surrogates.contains32(0xd800)); 385 assert!(surrogates.contains32(0xd900)); 386 assert!(surrogates.contains32(0xdfff)); 387 388 assert!(!surrogates.contains('A')); 389 } 390 } 391