1 // This file is part of ICU4X. For terms of use, please see the file 2 // called LICENSE at the top level of the ICU4X source tree 3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 4 5 //! Data and APIs for supporting Script_Extensions property 6 //! values in an efficient structure. 7 8 use crate::props::Script; 9 use crate::provider::*; 10 11 #[cfg(feature = "alloc")] 12 use core::iter::FromIterator; 13 use core::ops::RangeInclusive; 14 #[cfg(feature = "alloc")] 15 use icu_collections::codepointinvlist::CodePointInversionList; 16 use icu_provider::prelude::*; 17 use zerovec::{ule::AsULE, ZeroSlice}; 18 19 /// The number of bits at the low-end of a `ScriptWithExt` value used for 20 /// storing the `Script` value (or `extensions` index). 21 const SCRIPT_VAL_LENGTH: u16 = 10; 22 23 /// The bit mask necessary to retrieve the `Script` value (or `extensions` index) 24 /// from a `ScriptWithExt` value. 25 const SCRIPT_X_SCRIPT_VAL: u16 = (1 << SCRIPT_VAL_LENGTH) - 1; 26 27 /// An internal-use only pseudo-property that represents the values stored in 28 /// the trie of the special data structure [`ScriptWithExtensionsProperty`]. 29 /// 30 /// Note: The will assume a 12-bit layout. The 2 higher order bits in positions 31 /// 11..10 will indicate how to deduce the Script value and Script_Extensions, 32 /// and the lower 10 bits 9..0 indicate either the Script value or the index 33 /// into the `extensions` structure. 34 #[derive(Copy, Clone, Debug, Eq, PartialEq)] 35 #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] 36 #[cfg_attr(feature = "datagen", derive(databake::Bake))] 37 #[cfg_attr(feature = "datagen", databake(path = icu_properties::script))] 38 #[repr(transparent)] 39 #[doc(hidden)] 40 // `ScriptWithExt` not intended as public-facing but for `ScriptWithExtensionsProperty` constructor 41 #[allow(clippy::exhaustive_structs)] // this type is stable 42 pub struct ScriptWithExt(pub u16); 43 44 #[allow(missing_docs)] // These constants don't need individual documentation. 45 #[allow(non_upper_case_globals)] 46 #[doc(hidden)] // `ScriptWithExt` not intended as public-facing but for `ScriptWithExtensionsProperty` constructor 47 impl ScriptWithExt { 48 pub const Unknown: ScriptWithExt = ScriptWithExt(0); 49 } 50 51 impl AsULE for ScriptWithExt { 52 type ULE = <u16 as AsULE>::ULE; 53 54 #[inline] to_unaligned(self) -> Self::ULE55 fn to_unaligned(self) -> Self::ULE { 56 Script(self.0).to_unaligned() 57 } 58 59 #[inline] from_unaligned(unaligned: Self::ULE) -> Self60 fn from_unaligned(unaligned: Self::ULE) -> Self { 61 ScriptWithExt(Script::from_unaligned(unaligned).0) 62 } 63 } 64 65 #[doc(hidden)] // `ScriptWithExt` not intended as public-facing but for `ScriptWithExtensionsProperty` constructor 66 impl ScriptWithExt { 67 /// Returns whether the [`ScriptWithExt`] value has Script_Extensions and 68 /// also indicates a Script value of [`Script::Common`]. 69 /// 70 /// # Examples 71 /// 72 /// ``` 73 /// use icu::properties::script::ScriptWithExt; 74 /// 75 /// assert!(ScriptWithExt(0x04FF).is_common()); 76 /// assert!(ScriptWithExt(0x0400).is_common()); 77 /// 78 /// assert!(!ScriptWithExt(0x08FF).is_common()); 79 /// assert!(!ScriptWithExt(0x0800).is_common()); 80 /// 81 /// assert!(!ScriptWithExt(0x0CFF).is_common()); 82 /// assert!(!ScriptWithExt(0x0C00).is_common()); 83 /// 84 /// assert!(!ScriptWithExt(0xFF).is_common()); 85 /// assert!(!ScriptWithExt(0x0).is_common()); 86 /// ``` is_common(&self) -> bool87 pub fn is_common(&self) -> bool { 88 self.0 >> SCRIPT_VAL_LENGTH == 1 89 } 90 91 /// Returns whether the [`ScriptWithExt`] value has Script_Extensions and 92 /// also indicates a Script value of [`Script::Inherited`]. 93 /// 94 /// # Examples 95 /// 96 /// ``` 97 /// use icu::properties::script::ScriptWithExt; 98 /// 99 /// assert!(!ScriptWithExt(0x04FF).is_inherited()); 100 /// assert!(!ScriptWithExt(0x0400).is_inherited()); 101 /// 102 /// assert!(ScriptWithExt(0x08FF).is_inherited()); 103 /// assert!(ScriptWithExt(0x0800).is_inherited()); 104 /// 105 /// assert!(!ScriptWithExt(0x0CFF).is_inherited()); 106 /// assert!(!ScriptWithExt(0x0C00).is_inherited()); 107 /// 108 /// assert!(!ScriptWithExt(0xFF).is_inherited()); 109 /// assert!(!ScriptWithExt(0x0).is_inherited()); 110 /// ``` is_inherited(&self) -> bool111 pub fn is_inherited(&self) -> bool { 112 self.0 >> SCRIPT_VAL_LENGTH == 2 113 } 114 115 /// Returns whether the [`ScriptWithExt`] value has Script_Extensions and 116 /// also indicates that the Script value is neither [`Script::Common`] nor 117 /// [`Script::Inherited`]. 118 /// 119 /// # Examples 120 /// 121 /// ``` 122 /// use icu::properties::script::ScriptWithExt; 123 /// 124 /// assert!(!ScriptWithExt(0x04FF).is_other()); 125 /// assert!(!ScriptWithExt(0x0400).is_other()); 126 /// 127 /// assert!(!ScriptWithExt(0x08FF).is_other()); 128 /// assert!(!ScriptWithExt(0x0800).is_other()); 129 /// 130 /// assert!(ScriptWithExt(0x0CFF).is_other()); 131 /// assert!(ScriptWithExt(0x0C00).is_other()); 132 /// 133 /// assert!(!ScriptWithExt(0xFF).is_other()); 134 /// assert!(!ScriptWithExt(0x0).is_other()); 135 /// ``` is_other(&self) -> bool136 pub fn is_other(&self) -> bool { 137 self.0 >> SCRIPT_VAL_LENGTH == 3 138 } 139 140 /// Returns whether the [`ScriptWithExt`] value has Script_Extensions. 141 /// 142 /// # Examples 143 /// 144 /// ``` 145 /// use icu::properties::script::ScriptWithExt; 146 /// 147 /// assert!(ScriptWithExt(0x04FF).has_extensions()); 148 /// assert!(ScriptWithExt(0x0400).has_extensions()); 149 /// 150 /// assert!(ScriptWithExt(0x08FF).has_extensions()); 151 /// assert!(ScriptWithExt(0x0800).has_extensions()); 152 /// 153 /// assert!(ScriptWithExt(0x0CFF).has_extensions()); 154 /// assert!(ScriptWithExt(0x0C00).has_extensions()); 155 /// 156 /// assert!(!ScriptWithExt(0xFF).has_extensions()); 157 /// assert!(!ScriptWithExt(0x0).has_extensions()); 158 /// ``` has_extensions(&self) -> bool159 pub fn has_extensions(&self) -> bool { 160 let high_order_bits = self.0 >> SCRIPT_VAL_LENGTH; 161 high_order_bits > 0 162 } 163 } 164 165 impl From<ScriptWithExt> for u32 { from(swe: ScriptWithExt) -> Self166 fn from(swe: ScriptWithExt) -> Self { 167 swe.0 as u32 168 } 169 } 170 171 impl From<ScriptWithExt> for Script { from(swe: ScriptWithExt) -> Self172 fn from(swe: ScriptWithExt) -> Self { 173 Script(swe.0) 174 } 175 } 176 177 /// A struct that wraps a [`Script`] array, such as in the return value for 178 /// [`get_script_extensions_val()`](ScriptWithExtensionsBorrowed::get_script_extensions_val). 179 #[derive(Copy, Clone, Debug, Eq, PartialEq)] 180 pub struct ScriptExtensionsSet<'a> { 181 values: &'a ZeroSlice<Script>, 182 } 183 184 impl<'a> ScriptExtensionsSet<'a> { 185 /// Returns whether this set contains the given script. 186 /// 187 /// # Example 188 /// 189 /// ``` 190 /// use icu::properties::props::Script; 191 /// use icu::properties::script::ScriptWithExtensions; 192 /// let swe = ScriptWithExtensions::new(); 193 /// 194 /// assert!(swe 195 /// .get_script_extensions_val('\u{11303}') // GRANTHA SIGN VISARGA 196 /// .contains(&Script::Grantha)); 197 /// ``` contains(&self, x: &Script) -> bool198 pub fn contains(&self, x: &Script) -> bool { 199 ZeroSlice::binary_search(self.values, x).is_ok() 200 } 201 202 /// Gets an iterator over the elements. 203 /// 204 /// # Example 205 /// 206 /// ``` 207 /// use icu::properties::props::Script; 208 /// use icu::properties::script::ScriptWithExtensions; 209 /// let swe = ScriptWithExtensions::new(); 210 /// 211 /// assert_eq!( 212 /// swe.get_script_extensions_val('௫') // U+0BEB TAMIL DIGIT FIVE 213 /// .iter() 214 /// .collect::<Vec<_>>(), 215 /// [Script::Tamil, Script::Grantha] 216 /// ); 217 /// ``` iter(&self) -> impl DoubleEndedIterator<Item = Script> + 'a218 pub fn iter(&self) -> impl DoubleEndedIterator<Item = Script> + 'a { 219 ZeroSlice::iter(self.values) 220 } 221 222 /// For accessing this set as an array instead of an iterator 223 #[doc(hidden)] // used by FFI code array_len(&self) -> usize224 pub fn array_len(&self) -> usize { 225 self.values.len() 226 } 227 /// For accessing this set as an array instead of an iterator 228 #[doc(hidden)] // used by FFI code array_get(&self, index: usize) -> Option<Script>229 pub fn array_get(&self, index: usize) -> Option<Script> { 230 self.values.get(index) 231 } 232 } 233 234 /// A struct that represents the data for the Script and Script_Extensions properties. 235 /// 236 /// ✨ *Enabled with the `compiled_data` Cargo feature.* 237 /// 238 /// [ Help choosing a constructor](icu_provider::constructors) 239 /// 240 /// Most useful methods are on [`ScriptWithExtensionsBorrowed`] obtained by calling [`ScriptWithExtensions::as_borrowed()`] 241 /// 242 /// # Examples 243 /// 244 /// ``` 245 /// use icu::properties::script::ScriptWithExtensions; 246 /// use icu::properties::props::Script; 247 /// let swe = ScriptWithExtensions::new(); 248 /// 249 /// // get the `Script` property value 250 /// assert_eq!(swe.get_script_val('ـ'), Script::Common); // U+0640 ARABIC TATWEEL 251 /// assert_eq!(swe.get_script_val('\u{0650}'), Script::Inherited); // U+0650 ARABIC KASRA 252 /// assert_eq!(swe.get_script_val('٠'), Script::Arabic); // // U+0660 ARABIC-INDIC DIGIT ZERO 253 /// assert_eq!(swe.get_script_val('ﷲ'), Script::Arabic); // U+FDF2 ARABIC LIGATURE ALLAH ISOLATED FORM 254 /// 255 /// // get the `Script_Extensions` property value 256 /// assert_eq!( 257 /// swe.get_script_extensions_val('ـ') // U+0640 ARABIC TATWEEL 258 /// .iter().collect::<Vec<_>>(), 259 /// [Script::Arabic, Script::Syriac, Script::Mandaic, Script::Manichaean, 260 /// Script::PsalterPahlavi, Script::Adlam, Script::HanifiRohingya, Script::Sogdian, 261 /// Script::OldUyghur] 262 /// ); 263 /// assert_eq!( 264 /// swe.get_script_extensions_val('') // U+1F973 FACE WITH PARTY HORN AND PARTY HAT 265 /// .iter().collect::<Vec<_>>(), 266 /// [Script::Common] 267 /// ); 268 /// assert_eq!( 269 /// swe.get_script_extensions_val('\u{200D}') // ZERO WIDTH JOINER 270 /// .iter().collect::<Vec<_>>(), 271 /// [Script::Inherited] 272 /// ); 273 /// assert_eq!( 274 /// swe.get_script_extensions_val('௫') // U+0BEB TAMIL DIGIT FIVE 275 /// .iter().collect::<Vec<_>>(), 276 /// [Script::Tamil, Script::Grantha] 277 /// ); 278 /// 279 /// // check containment of a `Script` value in the `Script_Extensions` value 280 /// // U+0650 ARABIC KASRA 281 /// assert!(!swe.has_script('\u{0650}', Script::Inherited)); // main Script value 282 /// assert!(swe.has_script('\u{0650}', Script::Arabic)); 283 /// assert!(swe.has_script('\u{0650}', Script::Syriac)); 284 /// assert!(!swe.has_script('\u{0650}', Script::Thaana)); 285 /// 286 /// // get a `CodePointInversionList` for when `Script` value is contained in `Script_Extensions` value 287 /// let syriac = swe.get_script_extensions_set(Script::Syriac); 288 /// assert!(syriac.contains('\u{0650}')); // ARABIC KASRA 289 /// assert!(!syriac.contains('٠')); // ARABIC-INDIC DIGIT ZERO 290 /// assert!(!syriac.contains('ﷲ')); // ARABIC LIGATURE ALLAH ISOLATED FORM 291 /// assert!(syriac.contains('܀')); // SYRIAC END OF PARAGRAPH 292 /// assert!(syriac.contains('\u{074A}')); // SYRIAC BARREKH 293 /// ``` 294 #[derive(Debug)] 295 pub struct ScriptWithExtensions { 296 data: DataPayload<ScriptWithExtensionsPropertyV1>, 297 } 298 299 /// A borrowed wrapper around script extension data, returned by 300 /// [`ScriptWithExtensions::as_borrowed()`]. More efficient to query. 301 #[derive(Clone, Copy, Debug)] 302 pub struct ScriptWithExtensionsBorrowed<'a> { 303 data: &'a ScriptWithExtensionsProperty<'a>, 304 } 305 306 impl ScriptWithExtensions { 307 /// Creates a new instance of `ScriptWithExtensionsBorrowed` using compiled data. 308 /// 309 /// ✨ *Enabled with the `compiled_data` Cargo feature.* 310 /// 311 /// [ Help choosing a constructor](icu_provider::constructors) 312 #[cfg(feature = "compiled_data")] 313 #[allow(clippy::new_ret_no_self)] new() -> ScriptWithExtensionsBorrowed<'static>314 pub fn new() -> ScriptWithExtensionsBorrowed<'static> { 315 ScriptWithExtensionsBorrowed::new() 316 } 317 318 icu_provider::gen_buffer_data_constructors!( 319 () -> result: Result<ScriptWithExtensions, DataError>, 320 functions: [ 321 new: skip, 322 try_new_with_buffer_provider, 323 try_new_unstable, 324 Self, 325 ] 326 ); 327 328 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)] try_new_unstable( provider: &(impl DataProvider<ScriptWithExtensionsPropertyV1> + ?Sized), ) -> Result<Self, DataError>329 pub fn try_new_unstable( 330 provider: &(impl DataProvider<ScriptWithExtensionsPropertyV1> + ?Sized), 331 ) -> Result<Self, DataError> { 332 Ok(ScriptWithExtensions::from_data( 333 provider.load(Default::default())?.payload, 334 )) 335 } 336 337 /// Construct a borrowed version of this type that can be queried. 338 /// 339 /// This avoids a potential small underlying cost per API call (ex: `contains()`) by consolidating it 340 /// up front. 341 #[inline] as_borrowed(&self) -> ScriptWithExtensionsBorrowed<'_>342 pub fn as_borrowed(&self) -> ScriptWithExtensionsBorrowed<'_> { 343 ScriptWithExtensionsBorrowed { 344 data: self.data.get(), 345 } 346 } 347 348 /// Construct a new one from loaded data 349 /// 350 /// Typically it is preferable to use getters like [`load_script_with_extensions_unstable()`] instead from_data(data: DataPayload<ScriptWithExtensionsPropertyV1>) -> Self351 pub(crate) fn from_data(data: DataPayload<ScriptWithExtensionsPropertyV1>) -> Self { 352 Self { data } 353 } 354 } 355 356 impl<'a> ScriptWithExtensionsBorrowed<'a> { 357 /// Returns the `Script` property value for this code point. 358 /// 359 /// # Examples 360 /// 361 /// ``` 362 /// use icu::properties::script::ScriptWithExtensions; 363 /// use icu::properties::props::Script; 364 /// 365 /// let swe = ScriptWithExtensions::new(); 366 /// 367 /// // U+0640 ARABIC TATWEEL 368 /// assert_eq!(swe.get_script_val('ـ'), Script::Common); // main Script value 369 /// assert_ne!(swe.get_script_val('ـ'), Script::Arabic); 370 /// assert_ne!(swe.get_script_val('ـ'), Script::Syriac); 371 /// assert_ne!(swe.get_script_val('ـ'), Script::Thaana); 372 /// 373 /// // U+0650 ARABIC KASRA 374 /// assert_eq!(swe.get_script_val('\u{0650}'), Script::Inherited); // main Script value 375 /// assert_ne!(swe.get_script_val('\u{0650}'), Script::Arabic); 376 /// assert_ne!(swe.get_script_val('\u{0650}'), Script::Syriac); 377 /// assert_ne!(swe.get_script_val('\u{0650}'), Script::Thaana); 378 /// 379 /// // U+0660 ARABIC-INDIC DIGIT ZERO 380 /// assert_ne!(swe.get_script_val('٠'), Script::Common); 381 /// assert_eq!(swe.get_script_val('٠'), Script::Arabic); // main Script value 382 /// assert_ne!(swe.get_script_val('٠'), Script::Syriac); 383 /// assert_ne!(swe.get_script_val('٠'), Script::Thaana); 384 /// 385 /// // U+FDF2 ARABIC LIGATURE ALLAH ISOLATED FORM 386 /// assert_ne!(swe.get_script_val('ﷲ'), Script::Common); 387 /// assert_eq!(swe.get_script_val('ﷲ'), Script::Arabic); // main Script value 388 /// assert_ne!(swe.get_script_val('ﷲ'), Script::Syriac); 389 /// assert_ne!(swe.get_script_val('ﷲ'), Script::Thaana); 390 /// ``` get_script_val(self, ch: char) -> Script391 pub fn get_script_val(self, ch: char) -> Script { 392 self.get_script_val32(ch as u32) 393 } 394 395 /// See [`Self::get_script_val`]. get_script_val32(self, code_point: u32) -> Script396 pub fn get_script_val32(self, code_point: u32) -> Script { 397 let sc_with_ext = self.data.trie.get32(code_point); 398 399 if sc_with_ext.is_other() { 400 let ext_idx = sc_with_ext.0 & SCRIPT_X_SCRIPT_VAL; 401 let scx_val = self.data.extensions.get(ext_idx as usize); 402 let scx_first_sc = scx_val.and_then(|scx| scx.get(0)); 403 404 let default_sc_val = Script::Unknown; 405 406 scx_first_sc.unwrap_or(default_sc_val) 407 } else if sc_with_ext.is_common() { 408 Script::Common 409 } else if sc_with_ext.is_inherited() { 410 Script::Inherited 411 } else { 412 let script_val = sc_with_ext.0; 413 Script(script_val) 414 } 415 } 416 // Returns the Script_Extensions value for a code_point when the trie value 417 // is already known. 418 // This private helper method exists to prevent code duplication in callers like 419 // `get_script_extensions_val`, `get_script_extensions_set`, and `has_script`. get_scx_val_using_trie_val( self, sc_with_ext_ule: &'a <ScriptWithExt as AsULE>::ULE, ) -> &'a ZeroSlice<Script>420 fn get_scx_val_using_trie_val( 421 self, 422 sc_with_ext_ule: &'a <ScriptWithExt as AsULE>::ULE, 423 ) -> &'a ZeroSlice<Script> { 424 let sc_with_ext = ScriptWithExt::from_unaligned(*sc_with_ext_ule); 425 if sc_with_ext.is_other() { 426 let ext_idx = sc_with_ext.0 & SCRIPT_X_SCRIPT_VAL; 427 let ext_subarray = self.data.extensions.get(ext_idx as usize); 428 // In the OTHER case, where the 2 higher-order bits of the 429 // `ScriptWithExt` value in the trie doesn't indicate the Script value, 430 // the Script value is copied/inserted into the first position of the 431 // `extensions` array. So we must remove it to return the actual scx array val. 432 let scx_slice = ext_subarray 433 .and_then(|zslice| zslice.as_ule_slice().get(1..)) 434 .unwrap_or_default(); 435 ZeroSlice::from_ule_slice(scx_slice) 436 } else if sc_with_ext.is_common() || sc_with_ext.is_inherited() { 437 let ext_idx = sc_with_ext.0 & SCRIPT_X_SCRIPT_VAL; 438 let scx_val = self.data.extensions.get(ext_idx as usize); 439 scx_val.unwrap_or_default() 440 } else { 441 // Note: `Script` and `ScriptWithExt` are both represented as the same 442 // u16 value when the `ScriptWithExt` has no higher-order bits set. 443 let script_ule_slice = core::slice::from_ref(sc_with_ext_ule); 444 ZeroSlice::from_ule_slice(script_ule_slice) 445 } 446 } 447 /// Return the `Script_Extensions` property value for this code point. 448 /// 449 /// If `code_point` has Script_Extensions, then return the Script codes in 450 /// the Script_Extensions. In this case, the Script property value 451 /// (normally Common or Inherited) is not included in the [`ScriptExtensionsSet`]. 452 /// 453 /// If c does not have Script_Extensions, then the one Script code is put 454 /// into the [`ScriptExtensionsSet`] and also returned. 455 /// 456 /// If c is not a valid code point, then return an empty [`ScriptExtensionsSet`]. 457 /// 458 /// # Examples 459 /// 460 /// ``` 461 /// use icu::properties::script::ScriptWithExtensions; 462 /// use icu::properties::props::Script; 463 /// 464 /// let swe = ScriptWithExtensions::new(); 465 /// 466 /// assert_eq!( 467 /// swe.get_script_extensions_val('') // U+104D0 OSAGE CAPITAL LETTER KHA 468 /// .iter() 469 /// .collect::<Vec<_>>(), 470 /// [Script::Osage] 471 /// ); 472 /// assert_eq!( 473 /// swe.get_script_extensions_val('') // U+1F973 FACE WITH PARTY HORN AND PARTY HAT 474 /// .iter() 475 /// .collect::<Vec<_>>(), 476 /// [Script::Common] 477 /// ); 478 /// assert_eq!( 479 /// swe.get_script_extensions_val('\u{200D}') // ZERO WIDTH JOINER 480 /// .iter() 481 /// .collect::<Vec<_>>(), 482 /// [Script::Inherited] 483 /// ); 484 /// assert_eq!( 485 /// swe.get_script_extensions_val('௫') // U+0BEB TAMIL DIGIT FIVE 486 /// .iter() 487 /// .collect::<Vec<_>>(), 488 /// [Script::Tamil, Script::Grantha] 489 /// ); 490 /// ``` get_script_extensions_val(self, ch: char) -> ScriptExtensionsSet<'a>491 pub fn get_script_extensions_val(self, ch: char) -> ScriptExtensionsSet<'a> { 492 self.get_script_extensions_val32(ch as u32) 493 } 494 495 /// See [`Self::get_script_extensions_val`]. get_script_extensions_val32(self, code_point: u32) -> ScriptExtensionsSet<'a>496 pub fn get_script_extensions_val32(self, code_point: u32) -> ScriptExtensionsSet<'a> { 497 let sc_with_ext_ule = self.data.trie.get32_ule(code_point); 498 499 ScriptExtensionsSet { 500 values: match sc_with_ext_ule { 501 Some(ule_ref) => self.get_scx_val_using_trie_val(ule_ref), 502 None => ZeroSlice::from_ule_slice(&[]), 503 }, 504 } 505 } 506 507 /// Returns whether `script` is contained in the Script_Extensions 508 /// property value if the code_point has Script_Extensions, otherwise 509 /// if the code point does not have Script_Extensions then returns 510 /// whether the Script property value matches. 511 /// 512 /// Some characters are commonly used in multiple scripts. For more information, 513 /// see UAX #24: <http://www.unicode.org/reports/tr24/>. 514 /// 515 /// # Examples 516 /// 517 /// ``` 518 /// use icu::properties::script::ScriptWithExtensions; 519 /// use icu::properties::props::Script; 520 /// 521 /// let swe = ScriptWithExtensions::new(); 522 /// 523 /// // U+0650 ARABIC KASRA 524 /// assert!(!swe.has_script('\u{0650}', Script::Inherited)); // main Script value 525 /// assert!(swe.has_script('\u{0650}', Script::Arabic)); 526 /// assert!(swe.has_script('\u{0650}', Script::Syriac)); 527 /// assert!(!swe.has_script('\u{0650}', Script::Thaana)); 528 /// 529 /// // U+0660 ARABIC-INDIC DIGIT ZERO 530 /// assert!(!swe.has_script('٠', Script::Common)); // main Script value 531 /// assert!(swe.has_script('٠', Script::Arabic)); 532 /// assert!(!swe.has_script('٠', Script::Syriac)); 533 /// assert!(swe.has_script('٠', Script::Thaana)); 534 /// 535 /// // U+FDF2 ARABIC LIGATURE ALLAH ISOLATED FORM 536 /// assert!(!swe.has_script('ﷲ', Script::Common)); 537 /// assert!(swe.has_script('ﷲ', Script::Arabic)); // main Script value 538 /// assert!(!swe.has_script('ﷲ', Script::Syriac)); 539 /// assert!(swe.has_script('ﷲ', Script::Thaana)); 540 /// ``` has_script(self, ch: char, script: Script) -> bool541 pub fn has_script(self, ch: char, script: Script) -> bool { 542 self.has_script32(ch as u32, script) 543 } 544 545 /// See [`Self::has_script`]. has_script32(self, code_point: u32, script: Script) -> bool546 pub fn has_script32(self, code_point: u32, script: Script) -> bool { 547 let sc_with_ext_ule = if let Some(scwe_ule) = self.data.trie.get32_ule(code_point) { 548 scwe_ule 549 } else { 550 return false; 551 }; 552 let sc_with_ext = <ScriptWithExt as AsULE>::from_unaligned(*sc_with_ext_ule); 553 554 if !sc_with_ext.has_extensions() { 555 let script_val = sc_with_ext.0; 556 script == Script(script_val) 557 } else { 558 let scx_val = self.get_scx_val_using_trie_val(sc_with_ext_ule); 559 let script_find = scx_val.iter().find(|&sc| sc == script); 560 script_find.is_some() 561 } 562 } 563 564 /// Returns all of the matching `CodePointMapRange`s for the given [`Script`] 565 /// in which `has_script` will return true for all of the contained code points. 566 /// 567 /// # Examples 568 /// 569 /// ``` 570 /// use icu::properties::props::Script; 571 /// use icu::properties::script::ScriptWithExtensions; 572 /// 573 /// let swe = ScriptWithExtensions::new(); 574 /// 575 /// let syriac_script_extensions_ranges = 576 /// swe.get_script_extensions_ranges(Script::Syriac); 577 /// 578 /// let exp_ranges = [ 579 /// 0x0303..=0x0304, // COMBINING TILDE..COMBINING MACRON 580 /// 0x0307..=0x0308, // COMBINING DOT ABOVE..COMBINING DIAERESIS 581 /// 0x030A..=0x030A, // COMBINING RING ABOVE 582 /// 0x0320..=0x0320, // COMBINING MINUS SIGN BELOW 583 /// 0x0323..=0x0325, // COMBINING DOT BELOW..COMBINING RING BELOW 584 /// 0x032D..=0x032E, // COMBINING CIRCUMFLEX ACCENT BELOW..COMBINING BREVE BELOW 585 /// 0x0330..=0x0330, // COMBINING TILDE BELOW 586 /// 0x060C..=0x060C, // ARABIC COMMA 587 /// 0x061B..=0x061C, // ARABIC SEMICOLON, ARABIC LETTER MARK 588 /// 0x061F..=0x061F, // ARABIC QUESTION MARK 589 /// 0x0640..=0x0640, // ARABIC TATWEEL 590 /// 0x064B..=0x0655, // ARABIC FATHATAN..ARABIC HAMZA BELOW 591 /// 0x0670..=0x0670, // ARABIC LETTER SUPERSCRIPT ALEF 592 /// 0x0700..=0x070D, // Syriac block begins at U+0700 593 /// 0x070F..=0x074A, // Syriac block 594 /// 0x074D..=0x074F, // Syriac block ends at U+074F 595 /// 0x0860..=0x086A, // Syriac Supplement block is U+0860..=U+086F 596 /// 0x1DF8..=0x1DF8, // COMBINING DOT ABOVE LEFT 597 /// 0x1DFA..=0x1DFA, // COMBINING DOT BELOW LEFT 598 /// ]; 599 /// 600 /// assert_eq!( 601 /// syriac_script_extensions_ranges.collect::<Vec<_>>(), 602 /// exp_ranges 603 /// ); 604 /// ``` get_script_extensions_ranges( self, script: Script, ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a605 pub fn get_script_extensions_ranges( 606 self, 607 script: Script, 608 ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a { 609 self.data 610 .trie 611 .iter_ranges_mapped(move |value| { 612 let sc_with_ext = ScriptWithExt(value.0); 613 if sc_with_ext.has_extensions() { 614 self.get_scx_val_using_trie_val(&sc_with_ext.to_unaligned()) 615 .iter() 616 .any(|sc| sc == script) 617 } else { 618 script == sc_with_ext.into() 619 } 620 }) 621 .filter(|v| v.value) 622 .map(|v| v.range) 623 } 624 625 /// Returns a [`CodePointInversionList`] for the given [`Script`] which represents all 626 /// code points for which `has_script` will return true. 627 /// 628 /// # Examples 629 /// 630 /// ``` 631 /// use icu::properties::script::ScriptWithExtensions; 632 /// use icu::properties::props::Script; 633 /// 634 /// let swe = ScriptWithExtensions::new(); 635 /// 636 /// let syriac = swe.get_script_extensions_set(Script::Syriac); 637 /// 638 /// assert!(!syriac.contains('؞')); // ARABIC TRIPLE DOT PUNCTUATION MARK 639 /// assert!(syriac.contains('؟')); // ARABIC QUESTION MARK 640 /// assert!(!syriac.contains('ؠ')); // ARABIC LETTER KASHMIRI YEH 641 /// 642 /// assert!(syriac.contains('܀')); // SYRIAC END OF PARAGRAPH 643 /// assert!(syriac.contains('\u{074A}')); // SYRIAC BARREKH 644 /// assert!(!syriac.contains('\u{074B}')); // unassigned 645 /// assert!(syriac.contains('ݏ')); // SYRIAC LETTER SOGDIAN FE 646 /// assert!(!syriac.contains('ݐ')); // ARABIC LETTER BEH WITH THREE DOTS HORIZONTALLY BELOW 647 /// 648 /// assert!(syriac.contains('\u{1DF8}')); // COMBINING DOT ABOVE LEFT 649 /// assert!(!syriac.contains('\u{1DF9}')); // COMBINING WIDE INVERTED BRIDGE BELOW 650 /// assert!(syriac.contains('\u{1DFA}')); // COMBINING DOT BELOW LEFT 651 /// assert!(!syriac.contains('\u{1DFB}')); // COMBINING DELETION MARK 652 /// ``` 653 #[cfg(feature = "alloc")] get_script_extensions_set(self, script: Script) -> CodePointInversionList<'a>654 pub fn get_script_extensions_set(self, script: Script) -> CodePointInversionList<'a> { 655 CodePointInversionList::from_iter(self.get_script_extensions_ranges(script)) 656 } 657 } 658 659 #[cfg(feature = "compiled_data")] 660 impl Default for ScriptWithExtensionsBorrowed<'static> { default() -> Self661 fn default() -> Self { 662 Self::new() 663 } 664 } 665 666 impl ScriptWithExtensionsBorrowed<'static> { 667 /// Creates a new instance of `ScriptWithExtensionsBorrowed` using compiled data. 668 /// 669 /// ✨ *Enabled with the `compiled_data` Cargo feature.* 670 /// 671 /// [ Help choosing a constructor](icu_provider::constructors) 672 #[cfg(feature = "compiled_data")] new() -> Self673 pub fn new() -> Self { 674 Self { 675 data: crate::provider::Baked::SINGLETON_SCRIPT_WITH_EXTENSIONS_PROPERTY_V1, 676 } 677 } 678 679 /// Cheaply converts a [`ScriptWithExtensionsBorrowed<'static>`] into a [`ScriptWithExtensions`]. 680 /// 681 /// Note: Due to branching and indirection, using [`ScriptWithExtensions`] might inhibit some 682 /// compile-time optimizations that are possible with [`ScriptWithExtensionsBorrowed`]. static_to_owned(self) -> ScriptWithExtensions683 pub const fn static_to_owned(self) -> ScriptWithExtensions { 684 ScriptWithExtensions { 685 data: DataPayload::from_static_ref(self.data), 686 } 687 } 688 } 689 690 #[cfg(test)] 691 mod tests { 692 use super::*; 693 #[test] 694 /// Regression test for https://github.com/unicode-org/icu4x/issues/6041 test_scx_regression_6041()695 fn test_scx_regression_6041() { 696 let scripts = ScriptWithExtensions::new() 697 .get_script_extensions_val('\u{2bc}') 698 .iter() 699 .collect::<Vec<_>>(); 700 assert_eq!( 701 scripts, 702 [ 703 Script::Bengali, 704 Script::Cyrillic, 705 Script::Devanagari, 706 Script::Latin, 707 Script::Thai, 708 Script::Lisu, 709 Script::Toto 710 ] 711 ); 712 } 713 } 714