• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4 
5 //! Data and APIs for supporting Script_Extensions property
6 //! values in an efficient structure.
7 
8 use crate::props::Script;
9 use crate::provider::*;
10 
11 #[cfg(feature = "alloc")]
12 use core::iter::FromIterator;
13 use core::ops::RangeInclusive;
14 #[cfg(feature = "alloc")]
15 use icu_collections::codepointinvlist::CodePointInversionList;
16 use icu_provider::prelude::*;
17 use zerovec::{ule::AsULE, ZeroSlice};
18 
/// Width, in bits, of the low-order field of a `ScriptWithExt` value. That
/// field stores the `Script` value (or the `extensions` index).
const SCRIPT_VAL_LENGTH: u16 = 10;

/// Bit mask selecting the low `SCRIPT_VAL_LENGTH` bits of a `ScriptWithExt`
/// value, i.e. the `Script` value (or `extensions` index).
const SCRIPT_X_SCRIPT_VAL: u16 = !(u16::MAX << SCRIPT_VAL_LENGTH);
26 
/// An internal-use only pseudo-property that represents the values stored in
/// the trie of the special data structure [`ScriptWithExtensionsProperty`].
///
/// Note: This type assumes a 12-bit layout. The 2 higher order bits in positions
/// 11..10 indicate how to deduce the Script value and Script_Extensions,
/// and the lower 10 bits 9..0 indicate either the Script value or the index
/// into the `extensions` structure.
///
/// The predicates on this type interpret the higher order bits as follows:
/// `0` means no Script_Extensions, `1` means Script_Extensions with Script
/// `Common`, `2` means Script_Extensions with Script `Inherited`, and `3`
/// means Script_Extensions with any other Script.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "datagen", derive(databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::script))]
#[repr(transparent)]
#[doc(hidden)]
// `ScriptWithExt` not intended as public-facing but for `ScriptWithExtensionsProperty` constructor
#[allow(clippy::exhaustive_structs)] // this type is stable
pub struct ScriptWithExt(pub u16);
43 
44 #[allow(missing_docs)] // These constants don't need individual documentation.
45 #[allow(non_upper_case_globals)]
46 #[doc(hidden)] // `ScriptWithExt` not intended as public-facing but for `ScriptWithExtensionsProperty` constructor
47 impl ScriptWithExt {
48     pub const Unknown: ScriptWithExt = ScriptWithExt(0);
49 }
50 
51 impl AsULE for ScriptWithExt {
52     type ULE = <u16 as AsULE>::ULE;
53 
54     #[inline]
to_unaligned(self) -> Self::ULE55     fn to_unaligned(self) -> Self::ULE {
56         Script(self.0).to_unaligned()
57     }
58 
59     #[inline]
from_unaligned(unaligned: Self::ULE) -> Self60     fn from_unaligned(unaligned: Self::ULE) -> Self {
61         ScriptWithExt(Script::from_unaligned(unaligned).0)
62     }
63 }
64 
65 #[doc(hidden)] // `ScriptWithExt` not intended as public-facing but for `ScriptWithExtensionsProperty` constructor
66 impl ScriptWithExt {
67     /// Returns whether the [`ScriptWithExt`] value has Script_Extensions and
68     /// also indicates a Script value of [`Script::Common`].
69     ///
70     /// # Examples
71     ///
72     /// ```
73     /// use icu::properties::script::ScriptWithExt;
74     ///
75     /// assert!(ScriptWithExt(0x04FF).is_common());
76     /// assert!(ScriptWithExt(0x0400).is_common());
77     ///
78     /// assert!(!ScriptWithExt(0x08FF).is_common());
79     /// assert!(!ScriptWithExt(0x0800).is_common());
80     ///
81     /// assert!(!ScriptWithExt(0x0CFF).is_common());
82     /// assert!(!ScriptWithExt(0x0C00).is_common());
83     ///
84     /// assert!(!ScriptWithExt(0xFF).is_common());
85     /// assert!(!ScriptWithExt(0x0).is_common());
86     /// ```
is_common(&self) -> bool87     pub fn is_common(&self) -> bool {
88         self.0 >> SCRIPT_VAL_LENGTH == 1
89     }
90 
91     /// Returns whether the [`ScriptWithExt`] value has Script_Extensions and
92     /// also indicates a Script value of [`Script::Inherited`].
93     ///
94     /// # Examples
95     ///
96     /// ```
97     /// use icu::properties::script::ScriptWithExt;
98     ///
99     /// assert!(!ScriptWithExt(0x04FF).is_inherited());
100     /// assert!(!ScriptWithExt(0x0400).is_inherited());
101     ///
102     /// assert!(ScriptWithExt(0x08FF).is_inherited());
103     /// assert!(ScriptWithExt(0x0800).is_inherited());
104     ///
105     /// assert!(!ScriptWithExt(0x0CFF).is_inherited());
106     /// assert!(!ScriptWithExt(0x0C00).is_inherited());
107     ///
108     /// assert!(!ScriptWithExt(0xFF).is_inherited());
109     /// assert!(!ScriptWithExt(0x0).is_inherited());
110     /// ```
is_inherited(&self) -> bool111     pub fn is_inherited(&self) -> bool {
112         self.0 >> SCRIPT_VAL_LENGTH == 2
113     }
114 
115     /// Returns whether the [`ScriptWithExt`] value has Script_Extensions and
116     /// also indicates that the Script value is neither [`Script::Common`] nor
117     /// [`Script::Inherited`].
118     ///
119     /// # Examples
120     ///
121     /// ```
122     /// use icu::properties::script::ScriptWithExt;
123     ///
124     /// assert!(!ScriptWithExt(0x04FF).is_other());
125     /// assert!(!ScriptWithExt(0x0400).is_other());
126     ///
127     /// assert!(!ScriptWithExt(0x08FF).is_other());
128     /// assert!(!ScriptWithExt(0x0800).is_other());
129     ///
130     /// assert!(ScriptWithExt(0x0CFF).is_other());
131     /// assert!(ScriptWithExt(0x0C00).is_other());
132     ///
133     /// assert!(!ScriptWithExt(0xFF).is_other());
134     /// assert!(!ScriptWithExt(0x0).is_other());
135     /// ```
is_other(&self) -> bool136     pub fn is_other(&self) -> bool {
137         self.0 >> SCRIPT_VAL_LENGTH == 3
138     }
139 
140     /// Returns whether the [`ScriptWithExt`] value has Script_Extensions.
141     ///
142     /// # Examples
143     ///
144     /// ```
145     /// use icu::properties::script::ScriptWithExt;
146     ///
147     /// assert!(ScriptWithExt(0x04FF).has_extensions());
148     /// assert!(ScriptWithExt(0x0400).has_extensions());
149     ///
150     /// assert!(ScriptWithExt(0x08FF).has_extensions());
151     /// assert!(ScriptWithExt(0x0800).has_extensions());
152     ///
153     /// assert!(ScriptWithExt(0x0CFF).has_extensions());
154     /// assert!(ScriptWithExt(0x0C00).has_extensions());
155     ///
156     /// assert!(!ScriptWithExt(0xFF).has_extensions());
157     /// assert!(!ScriptWithExt(0x0).has_extensions());
158     /// ```
has_extensions(&self) -> bool159     pub fn has_extensions(&self) -> bool {
160         let high_order_bits = self.0 >> SCRIPT_VAL_LENGTH;
161         high_order_bits > 0
162     }
163 }
164 
165 impl From<ScriptWithExt> for u32 {
from(swe: ScriptWithExt) -> Self166     fn from(swe: ScriptWithExt) -> Self {
167         swe.0 as u32
168     }
169 }
170 
171 impl From<ScriptWithExt> for Script {
from(swe: ScriptWithExt) -> Self172     fn from(swe: ScriptWithExt) -> Self {
173         Script(swe.0)
174     }
175 }
176 
/// A struct that wraps a [`Script`] array, such as in the return value for
/// [`get_script_extensions_val()`](ScriptWithExtensionsBorrowed::get_script_extensions_val).
///
/// The set borrows its elements; it is `Copy` and does not own any data.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub struct ScriptExtensionsSet<'a> {
    // Borrowed slice holding the scripts that make up this set.
    values: &'a ZeroSlice<Script>,
}
183 
184 impl<'a> ScriptExtensionsSet<'a> {
185     /// Returns whether this set contains the given script.
186     ///
187     /// # Example
188     ///
189     /// ```
190     /// use icu::properties::props::Script;
191     /// use icu::properties::script::ScriptWithExtensions;
192     /// let swe = ScriptWithExtensions::new();
193     ///
194     /// assert!(swe
195     ///     .get_script_extensions_val('\u{11303}') // GRANTHA SIGN VISARGA
196     ///     .contains(&Script::Grantha));
197     /// ```
contains(&self, x: &Script) -> bool198     pub fn contains(&self, x: &Script) -> bool {
199         ZeroSlice::binary_search(self.values, x).is_ok()
200     }
201 
202     /// Gets an iterator over the elements.
203     ///
204     /// # Example
205     ///
206     /// ```
207     /// use icu::properties::props::Script;
208     /// use icu::properties::script::ScriptWithExtensions;
209     /// let swe = ScriptWithExtensions::new();
210     ///
211     /// assert_eq!(
212     ///     swe.get_script_extensions_val('௫') // U+0BEB TAMIL DIGIT FIVE
213     ///         .iter()
214     ///         .collect::<Vec<_>>(),
215     ///     [Script::Tamil, Script::Grantha]
216     /// );
217     /// ```
iter(&self) -> impl DoubleEndedIterator<Item = Script> + 'a218     pub fn iter(&self) -> impl DoubleEndedIterator<Item = Script> + 'a {
219         ZeroSlice::iter(self.values)
220     }
221 
222     /// For accessing this set as an array instead of an iterator
223     #[doc(hidden)] // used by FFI code
array_len(&self) -> usize224     pub fn array_len(&self) -> usize {
225         self.values.len()
226     }
227     /// For accessing this set as an array instead of an iterator
228     #[doc(hidden)] // used by FFI code
array_get(&self, index: usize) -> Option<Script>229     pub fn array_get(&self, index: usize) -> Option<Script> {
230         self.values.get(index)
231     }
232 }
233 
/// A struct that represents the data for the Script and Script_Extensions properties.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
///
/// Most useful methods are on [`ScriptWithExtensionsBorrowed`] obtained by calling [`ScriptWithExtensions::as_borrowed()`]
///
/// # Examples
///
/// ```
/// use icu::properties::script::ScriptWithExtensions;
/// use icu::properties::props::Script;
/// let swe = ScriptWithExtensions::new();
///
/// // get the `Script` property value
/// assert_eq!(swe.get_script_val('ـ'), Script::Common); // U+0640 ARABIC TATWEEL
/// assert_eq!(swe.get_script_val('\u{0650}'), Script::Inherited); // U+0650 ARABIC KASRA
/// assert_eq!(swe.get_script_val('٠'), Script::Arabic); // U+0660 ARABIC-INDIC DIGIT ZERO
/// assert_eq!(swe.get_script_val('ﷲ'), Script::Arabic); // U+FDF2 ARABIC LIGATURE ALLAH ISOLATED FORM
///
/// // get the `Script_Extensions` property value
/// assert_eq!(
///     swe.get_script_extensions_val('ـ') // U+0640 ARABIC TATWEEL
///         .iter().collect::<Vec<_>>(),
///     [Script::Arabic, Script::Syriac, Script::Mandaic, Script::Manichaean,
///          Script::PsalterPahlavi, Script::Adlam, Script::HanifiRohingya, Script::Sogdian,
///          Script::OldUyghur]
/// );
/// assert_eq!(
///     swe.get_script_extensions_val('\u{1F973}') // U+1F973 FACE WITH PARTY HORN AND PARTY HAT
///         .iter().collect::<Vec<_>>(),
///     [Script::Common]
/// );
/// assert_eq!(
///     swe.get_script_extensions_val('\u{200D}') // ZERO WIDTH JOINER
///         .iter().collect::<Vec<_>>(),
///     [Script::Inherited]
/// );
/// assert_eq!(
///     swe.get_script_extensions_val('௫') // U+0BEB TAMIL DIGIT FIVE
///         .iter().collect::<Vec<_>>(),
///     [Script::Tamil, Script::Grantha]
/// );
///
/// // check containment of a `Script` value in the `Script_Extensions` value
/// // U+0650 ARABIC KASRA
/// assert!(!swe.has_script('\u{0650}', Script::Inherited)); // main Script value
/// assert!(swe.has_script('\u{0650}', Script::Arabic));
/// assert!(swe.has_script('\u{0650}', Script::Syriac));
/// assert!(!swe.has_script('\u{0650}', Script::Thaana));
///
/// // get a `CodePointInversionList` for when `Script` value is contained in `Script_Extensions` value
/// let syriac = swe.get_script_extensions_set(Script::Syriac);
/// assert!(syriac.contains('\u{0650}')); // ARABIC KASRA
/// assert!(!syriac.contains('٠')); // ARABIC-INDIC DIGIT ZERO
/// assert!(!syriac.contains('ﷲ')); // ARABIC LIGATURE ALLAH ISOLATED FORM
/// assert!(syriac.contains('܀')); // SYRIAC END OF PARAGRAPH
/// assert!(syriac.contains('\u{074A}')); // SYRIAC BARREKH
/// ```
#[derive(Debug)]
pub struct ScriptWithExtensions {
    // Payload holding the Script / Script_Extensions property data.
    data: DataPayload<ScriptWithExtensionsPropertyV1>,
}
298 
/// A borrowed wrapper around script extension data, returned by
/// [`ScriptWithExtensions::as_borrowed()`]. More efficient to query.
///
/// The wrapper is `Copy`, so its query methods take `self` by value.
#[derive(Clone, Copy, Debug)]
pub struct ScriptWithExtensionsBorrowed<'a> {
    // Reference to the property data that the query methods read from.
    data: &'a ScriptWithExtensionsProperty<'a>,
}
305 
306 impl ScriptWithExtensions {
307     /// Creates a new instance of `ScriptWithExtensionsBorrowed` using compiled data.
308     ///
309     /// ✨ *Enabled with the `compiled_data` Cargo feature.*
310     ///
311     /// [�� Help choosing a constructor](icu_provider::constructors)
312     #[cfg(feature = "compiled_data")]
313     #[allow(clippy::new_ret_no_self)]
new() -> ScriptWithExtensionsBorrowed<'static>314     pub fn new() -> ScriptWithExtensionsBorrowed<'static> {
315         ScriptWithExtensionsBorrowed::new()
316     }
317 
318     icu_provider::gen_buffer_data_constructors!(
319         () -> result: Result<ScriptWithExtensions, DataError>,
320         functions: [
321             new: skip,
322                         try_new_with_buffer_provider,
323             try_new_unstable,
324             Self,
325         ]
326     );
327 
328     #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
try_new_unstable( provider: &(impl DataProvider<ScriptWithExtensionsPropertyV1> + ?Sized), ) -> Result<Self, DataError>329     pub fn try_new_unstable(
330         provider: &(impl DataProvider<ScriptWithExtensionsPropertyV1> + ?Sized),
331     ) -> Result<Self, DataError> {
332         Ok(ScriptWithExtensions::from_data(
333             provider.load(Default::default())?.payload,
334         ))
335     }
336 
337     /// Construct a borrowed version of this type that can be queried.
338     ///
339     /// This avoids a potential small underlying cost per API call (ex: `contains()`) by consolidating it
340     /// up front.
341     #[inline]
as_borrowed(&self) -> ScriptWithExtensionsBorrowed<'_>342     pub fn as_borrowed(&self) -> ScriptWithExtensionsBorrowed<'_> {
343         ScriptWithExtensionsBorrowed {
344             data: self.data.get(),
345         }
346     }
347 
348     /// Construct a new one from loaded data
349     ///
350     /// Typically it is preferable to use getters like [`load_script_with_extensions_unstable()`] instead
from_data(data: DataPayload<ScriptWithExtensionsPropertyV1>) -> Self351     pub(crate) fn from_data(data: DataPayload<ScriptWithExtensionsPropertyV1>) -> Self {
352         Self { data }
353     }
354 }
355 
356 impl<'a> ScriptWithExtensionsBorrowed<'a> {
357     /// Returns the `Script` property value for this code point.
358     ///
359     /// # Examples
360     ///
361     /// ```
362     /// use icu::properties::script::ScriptWithExtensions;
363     /// use icu::properties::props::Script;
364     ///
365     /// let swe = ScriptWithExtensions::new();
366     ///
367     /// // U+0640 ARABIC TATWEEL
368     /// assert_eq!(swe.get_script_val('ـ'), Script::Common); // main Script value
369     /// assert_ne!(swe.get_script_val('ـ'), Script::Arabic);
370     /// assert_ne!(swe.get_script_val('ـ'), Script::Syriac);
371     /// assert_ne!(swe.get_script_val('ـ'), Script::Thaana);
372     ///
373     /// // U+0650 ARABIC KASRA
374     /// assert_eq!(swe.get_script_val('\u{0650}'), Script::Inherited); // main Script value
375     /// assert_ne!(swe.get_script_val('\u{0650}'), Script::Arabic);
376     /// assert_ne!(swe.get_script_val('\u{0650}'), Script::Syriac);
377     /// assert_ne!(swe.get_script_val('\u{0650}'), Script::Thaana);
378     ///
379     /// // U+0660 ARABIC-INDIC DIGIT ZERO
380     /// assert_ne!(swe.get_script_val('٠'), Script::Common);
381     /// assert_eq!(swe.get_script_val('٠'), Script::Arabic); // main Script value
382     /// assert_ne!(swe.get_script_val('٠'), Script::Syriac);
383     /// assert_ne!(swe.get_script_val('٠'), Script::Thaana);
384     ///
385     /// // U+FDF2 ARABIC LIGATURE ALLAH ISOLATED FORM
386     /// assert_ne!(swe.get_script_val('ﷲ'), Script::Common);
387     /// assert_eq!(swe.get_script_val('ﷲ'), Script::Arabic); // main Script value
388     /// assert_ne!(swe.get_script_val('ﷲ'), Script::Syriac);
389     /// assert_ne!(swe.get_script_val('ﷲ'), Script::Thaana);
390     /// ```
get_script_val(self, ch: char) -> Script391     pub fn get_script_val(self, ch: char) -> Script {
392         self.get_script_val32(ch as u32)
393     }
394 
395     /// See [`Self::get_script_val`].
get_script_val32(self, code_point: u32) -> Script396     pub fn get_script_val32(self, code_point: u32) -> Script {
397         let sc_with_ext = self.data.trie.get32(code_point);
398 
399         if sc_with_ext.is_other() {
400             let ext_idx = sc_with_ext.0 & SCRIPT_X_SCRIPT_VAL;
401             let scx_val = self.data.extensions.get(ext_idx as usize);
402             let scx_first_sc = scx_val.and_then(|scx| scx.get(0));
403 
404             let default_sc_val = Script::Unknown;
405 
406             scx_first_sc.unwrap_or(default_sc_val)
407         } else if sc_with_ext.is_common() {
408             Script::Common
409         } else if sc_with_ext.is_inherited() {
410             Script::Inherited
411         } else {
412             let script_val = sc_with_ext.0;
413             Script(script_val)
414         }
415     }
416     // Returns the Script_Extensions value for a code_point when the trie value
417     // is already known.
418     // This private helper method exists to prevent code duplication in callers like
419     // `get_script_extensions_val`, `get_script_extensions_set`, and `has_script`.
get_scx_val_using_trie_val( self, sc_with_ext_ule: &'a <ScriptWithExt as AsULE>::ULE, ) -> &'a ZeroSlice<Script>420     fn get_scx_val_using_trie_val(
421         self,
422         sc_with_ext_ule: &'a <ScriptWithExt as AsULE>::ULE,
423     ) -> &'a ZeroSlice<Script> {
424         let sc_with_ext = ScriptWithExt::from_unaligned(*sc_with_ext_ule);
425         if sc_with_ext.is_other() {
426             let ext_idx = sc_with_ext.0 & SCRIPT_X_SCRIPT_VAL;
427             let ext_subarray = self.data.extensions.get(ext_idx as usize);
428             // In the OTHER case, where the 2 higher-order bits of the
429             // `ScriptWithExt` value in the trie doesn't indicate the Script value,
430             // the Script value is copied/inserted into the first position of the
431             // `extensions` array. So we must remove it to return the actual scx array val.
432             let scx_slice = ext_subarray
433                 .and_then(|zslice| zslice.as_ule_slice().get(1..))
434                 .unwrap_or_default();
435             ZeroSlice::from_ule_slice(scx_slice)
436         } else if sc_with_ext.is_common() || sc_with_ext.is_inherited() {
437             let ext_idx = sc_with_ext.0 & SCRIPT_X_SCRIPT_VAL;
438             let scx_val = self.data.extensions.get(ext_idx as usize);
439             scx_val.unwrap_or_default()
440         } else {
441             // Note: `Script` and `ScriptWithExt` are both represented as the same
442             // u16 value when the `ScriptWithExt` has no higher-order bits set.
443             let script_ule_slice = core::slice::from_ref(sc_with_ext_ule);
444             ZeroSlice::from_ule_slice(script_ule_slice)
445         }
446     }
447     /// Return the `Script_Extensions` property value for this code point.
448     ///
    /// If the code point has Script_Extensions, then return the Script codes in
    /// the Script_Extensions. In this case, the Script property value
    /// (normally Common or Inherited) is not included in the [`ScriptExtensionsSet`].
    ///
    /// If the code point does not have Script_Extensions, then its one Script
    /// code is the only element of the returned [`ScriptExtensionsSet`].
    ///
    /// If the code point is not valid, then return an empty [`ScriptExtensionsSet`].
457     ///
458     /// # Examples
459     ///
460     /// ```
461     /// use icu::properties::script::ScriptWithExtensions;
462     /// use icu::properties::props::Script;
463     ///
464     /// let swe = ScriptWithExtensions::new();
465     ///
466     /// assert_eq!(
    ///     swe.get_script_extensions_val('\u{104D0}') // U+104D0 OSAGE CAPITAL LETTER KHA
468     ///         .iter()
469     ///         .collect::<Vec<_>>(),
470     ///     [Script::Osage]
471     /// );
472     /// assert_eq!(
    ///     swe.get_script_extensions_val('\u{1F973}') // U+1F973 FACE WITH PARTY HORN AND PARTY HAT
474     ///         .iter()
475     ///         .collect::<Vec<_>>(),
476     ///     [Script::Common]
477     /// );
478     /// assert_eq!(
479     ///     swe.get_script_extensions_val('\u{200D}') // ZERO WIDTH JOINER
480     ///         .iter()
481     ///         .collect::<Vec<_>>(),
482     ///     [Script::Inherited]
483     /// );
484     /// assert_eq!(
485     ///     swe.get_script_extensions_val('௫') // U+0BEB TAMIL DIGIT FIVE
486     ///         .iter()
487     ///         .collect::<Vec<_>>(),
488     ///     [Script::Tamil, Script::Grantha]
489     /// );
490     /// ```
get_script_extensions_val(self, ch: char) -> ScriptExtensionsSet<'a>491     pub fn get_script_extensions_val(self, ch: char) -> ScriptExtensionsSet<'a> {
492         self.get_script_extensions_val32(ch as u32)
493     }
494 
495     /// See [`Self::get_script_extensions_val`].
get_script_extensions_val32(self, code_point: u32) -> ScriptExtensionsSet<'a>496     pub fn get_script_extensions_val32(self, code_point: u32) -> ScriptExtensionsSet<'a> {
497         let sc_with_ext_ule = self.data.trie.get32_ule(code_point);
498 
499         ScriptExtensionsSet {
500             values: match sc_with_ext_ule {
501                 Some(ule_ref) => self.get_scx_val_using_trie_val(ule_ref),
502                 None => ZeroSlice::from_ule_slice(&[]),
503             },
504         }
505     }
506 
507     /// Returns whether `script` is contained in the Script_Extensions
508     /// property value if the code_point has Script_Extensions, otherwise
509     /// if the code point does not have Script_Extensions then returns
510     /// whether the Script property value matches.
511     ///
512     /// Some characters are commonly used in multiple scripts. For more information,
513     /// see UAX #24: <http://www.unicode.org/reports/tr24/>.
514     ///
515     /// # Examples
516     ///
517     /// ```
518     /// use icu::properties::script::ScriptWithExtensions;
519     /// use icu::properties::props::Script;
520     ///
521     /// let swe = ScriptWithExtensions::new();
522     ///
523     /// // U+0650 ARABIC KASRA
524     /// assert!(!swe.has_script('\u{0650}', Script::Inherited)); // main Script value
525     /// assert!(swe.has_script('\u{0650}', Script::Arabic));
526     /// assert!(swe.has_script('\u{0650}', Script::Syriac));
527     /// assert!(!swe.has_script('\u{0650}', Script::Thaana));
528     ///
529     /// // U+0660 ARABIC-INDIC DIGIT ZERO
530     /// assert!(!swe.has_script('٠', Script::Common)); // main Script value
531     /// assert!(swe.has_script('٠', Script::Arabic));
532     /// assert!(!swe.has_script('٠', Script::Syriac));
533     /// assert!(swe.has_script('٠', Script::Thaana));
534     ///
535     /// // U+FDF2 ARABIC LIGATURE ALLAH ISOLATED FORM
536     /// assert!(!swe.has_script('ﷲ', Script::Common));
537     /// assert!(swe.has_script('ﷲ', Script::Arabic)); // main Script value
538     /// assert!(!swe.has_script('ﷲ', Script::Syriac));
539     /// assert!(swe.has_script('ﷲ', Script::Thaana));
540     /// ```
has_script(self, ch: char, script: Script) -> bool541     pub fn has_script(self, ch: char, script: Script) -> bool {
542         self.has_script32(ch as u32, script)
543     }
544 
545     /// See [`Self::has_script`].
has_script32(self, code_point: u32, script: Script) -> bool546     pub fn has_script32(self, code_point: u32, script: Script) -> bool {
547         let sc_with_ext_ule = if let Some(scwe_ule) = self.data.trie.get32_ule(code_point) {
548             scwe_ule
549         } else {
550             return false;
551         };
552         let sc_with_ext = <ScriptWithExt as AsULE>::from_unaligned(*sc_with_ext_ule);
553 
554         if !sc_with_ext.has_extensions() {
555             let script_val = sc_with_ext.0;
556             script == Script(script_val)
557         } else {
558             let scx_val = self.get_scx_val_using_trie_val(sc_with_ext_ule);
559             let script_find = scx_val.iter().find(|&sc| sc == script);
560             script_find.is_some()
561         }
562     }
563 
    /// Returns all of the code point ranges (as `RangeInclusive<u32>`) for the given [`Script`]
    /// in which `has_script` will return true for all of the contained code points.
566     ///
567     /// # Examples
568     ///
569     /// ```
570     /// use icu::properties::props::Script;
571     /// use icu::properties::script::ScriptWithExtensions;
572     ///
573     /// let swe = ScriptWithExtensions::new();
574     ///
575     /// let syriac_script_extensions_ranges =
576     ///     swe.get_script_extensions_ranges(Script::Syriac);
577     ///
578     /// let exp_ranges = [
579     ///     0x0303..=0x0304, // COMBINING TILDE..COMBINING MACRON
580     ///     0x0307..=0x0308, // COMBINING DOT ABOVE..COMBINING DIAERESIS
581     ///     0x030A..=0x030A, // COMBINING RING ABOVE
582     ///     0x0320..=0x0320, // COMBINING MINUS SIGN BELOW
583     ///     0x0323..=0x0325, // COMBINING DOT BELOW..COMBINING RING BELOW
584     ///     0x032D..=0x032E, // COMBINING CIRCUMFLEX ACCENT BELOW..COMBINING BREVE BELOW
585     ///     0x0330..=0x0330, // COMBINING TILDE BELOW
586     ///     0x060C..=0x060C, // ARABIC COMMA
587     ///     0x061B..=0x061C, // ARABIC SEMICOLON, ARABIC LETTER MARK
588     ///     0x061F..=0x061F, // ARABIC QUESTION MARK
589     ///     0x0640..=0x0640, // ARABIC TATWEEL
590     ///     0x064B..=0x0655, // ARABIC FATHATAN..ARABIC HAMZA BELOW
591     ///     0x0670..=0x0670, // ARABIC LETTER SUPERSCRIPT ALEF
592     ///     0x0700..=0x070D, // Syriac block begins at U+0700
593     ///     0x070F..=0x074A, // Syriac block
594     ///     0x074D..=0x074F, // Syriac block ends at U+074F
595     ///     0x0860..=0x086A, // Syriac Supplement block is U+0860..=U+086F
596     ///     0x1DF8..=0x1DF8, // COMBINING DOT ABOVE LEFT
597     ///     0x1DFA..=0x1DFA, // COMBINING DOT BELOW LEFT
598     /// ];
599     ///
600     /// assert_eq!(
601     ///     syriac_script_extensions_ranges.collect::<Vec<_>>(),
602     ///     exp_ranges
603     /// );
604     /// ```
get_script_extensions_ranges( self, script: Script, ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a605     pub fn get_script_extensions_ranges(
606         self,
607         script: Script,
608     ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
609         self.data
610             .trie
611             .iter_ranges_mapped(move |value| {
612                 let sc_with_ext = ScriptWithExt(value.0);
613                 if sc_with_ext.has_extensions() {
614                     self.get_scx_val_using_trie_val(&sc_with_ext.to_unaligned())
615                         .iter()
616                         .any(|sc| sc == script)
617                 } else {
618                     script == sc_with_ext.into()
619                 }
620             })
621             .filter(|v| v.value)
622             .map(|v| v.range)
623     }
624 
625     /// Returns a [`CodePointInversionList`] for the given [`Script`] which represents all
626     /// code points for which `has_script` will return true.
627     ///
628     /// # Examples
629     ///
630     /// ```
631     /// use icu::properties::script::ScriptWithExtensions;
632     /// use icu::properties::props::Script;
633     ///
634     /// let swe = ScriptWithExtensions::new();
635     ///
636     /// let syriac = swe.get_script_extensions_set(Script::Syriac);
637     ///
638     /// assert!(!syriac.contains('؞')); // ARABIC TRIPLE DOT PUNCTUATION MARK
639     /// assert!(syriac.contains('؟')); // ARABIC QUESTION MARK
640     /// assert!(!syriac.contains('ؠ')); // ARABIC LETTER KASHMIRI YEH
641     ///
642     /// assert!(syriac.contains('܀')); // SYRIAC END OF PARAGRAPH
643     /// assert!(syriac.contains('\u{074A}')); // SYRIAC BARREKH
644     /// assert!(!syriac.contains('\u{074B}')); // unassigned
645     /// assert!(syriac.contains('ݏ')); // SYRIAC LETTER SOGDIAN FE
646     /// assert!(!syriac.contains('ݐ')); // ARABIC LETTER BEH WITH THREE DOTS HORIZONTALLY BELOW
647     ///
648     /// assert!(syriac.contains('\u{1DF8}')); // COMBINING DOT ABOVE LEFT
649     /// assert!(!syriac.contains('\u{1DF9}')); // COMBINING WIDE INVERTED BRIDGE BELOW
650     /// assert!(syriac.contains('\u{1DFA}')); // COMBINING DOT BELOW LEFT
651     /// assert!(!syriac.contains('\u{1DFB}')); // COMBINING DELETION MARK
652     /// ```
653     #[cfg(feature = "alloc")]
get_script_extensions_set(self, script: Script) -> CodePointInversionList<'a>654     pub fn get_script_extensions_set(self, script: Script) -> CodePointInversionList<'a> {
655         CodePointInversionList::from_iter(self.get_script_extensions_ranges(script))
656     }
657 }
658 
#[cfg(feature = "compiled_data")]
impl Default for ScriptWithExtensionsBorrowed<'static> {
    /// Equivalent to [`ScriptWithExtensionsBorrowed::new`].
    fn default() -> Self {
        ScriptWithExtensionsBorrowed::new()
    }
}
665 
666 impl ScriptWithExtensionsBorrowed<'static> {
667     /// Creates a new instance of `ScriptWithExtensionsBorrowed` using compiled data.
668     ///
669     /// ✨ *Enabled with the `compiled_data` Cargo feature.*
670     ///
671     /// [�� Help choosing a constructor](icu_provider::constructors)
672     #[cfg(feature = "compiled_data")]
new() -> Self673     pub fn new() -> Self {
674         Self {
675             data: crate::provider::Baked::SINGLETON_SCRIPT_WITH_EXTENSIONS_PROPERTY_V1,
676         }
677     }
678 
679     /// Cheaply converts a [`ScriptWithExtensionsBorrowed<'static>`] into a [`ScriptWithExtensions`].
680     ///
681     /// Note: Due to branching and indirection, using [`ScriptWithExtensions`] might inhibit some
682     /// compile-time optimizations that are possible with [`ScriptWithExtensionsBorrowed`].
static_to_owned(self) -> ScriptWithExtensions683     pub const fn static_to_owned(self) -> ScriptWithExtensions {
684         ScriptWithExtensions {
685             data: DataPayload::from_static_ref(self.data),
686         }
687     }
688 }
689 
#[cfg(test)]
mod tests {
    use super::*;

    /// Regression test for https://github.com/unicode-org/icu4x/issues/6041
    #[test]
    fn test_scx_regression_6041() {
        let expected = [
            Script::Bengali,
            Script::Cyrillic,
            Script::Devanagari,
            Script::Latin,
            Script::Thai,
            Script::Lisu,
            Script::Toto,
        ];

        let swe = ScriptWithExtensions::new();
        let scripts: Vec<Script> = swe.get_script_extensions_val('\u{2bc}').iter().collect();

        assert_eq!(scripts, expected);
    }
}
714