• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4 
5 #[cfg(feature = "serde")]
6 use alloc::format;
7 #[cfg(feature = "serde")]
8 use alloc::string::String;
9 #[cfg(feature = "alloc")]
10 use alloc::vec::Vec;
11 use core::{char, ops::RangeBounds, ops::RangeInclusive};
12 use potential_utf::PotentialCodePoint;
13 use yoke::Yokeable;
14 use zerofrom::ZeroFrom;
15 use zerovec::{ule::AsULE, zerovec, ZeroVec};
16 
17 use super::InvalidSetError;
18 use crate::codepointinvlist::utils::{deconstruct_range, is_valid_zv};
19 
20 /// Represents the end code point of the Basic Multilingual Plane range, starting from code point 0, inclusive
21 const BMP_MAX: u32 = 0xFFFF;
22 
23 /// Represents the inversion list for a set of all code points in the Basic Multilingual Plane.
24 const BMP_INV_LIST_VEC: ZeroVec<PotentialCodePoint> = zerovec!(PotentialCodePoint; PotentialCodePoint::to_unaligned; [PotentialCodePoint::from_u24(0x0), PotentialCodePoint::from_u24(BMP_MAX + 1)]);
25 
26 /// Represents the inversion list for all of the code points in the Unicode range.
27 const ALL_VEC: ZeroVec<PotentialCodePoint> = zerovec!(PotentialCodePoint; PotentialCodePoint::to_unaligned; [PotentialCodePoint::from_u24(0x0), PotentialCodePoint::from_u24((char::MAX as u32) + 1)]);
28 
29 /// A membership wrapper for [`CodePointInversionList`].
30 ///
31 /// Provides exposure to membership functions and constructors from serialized `CodePointSet`s (sets of code points)
32 /// and predefined ranges.
33 #[zerovec::make_varule(CodePointInversionListULE)]
34 #[zerovec::skip_derive(Ord)]
35 #[zerovec::derive(Debug)]
36 #[derive(Debug, Eq, PartialEq, Clone, Yokeable, ZeroFrom)]
37 #[cfg_attr(not(feature = "alloc"), zerovec::skip_derive(ZeroMapKV, ToOwned))]
38 pub struct CodePointInversionList<'data> {
39     // If we wanted to use an array to keep the memory on the stack, there is an unsafe nightly feature
40     // https://doc.rust-lang.org/nightly/core/array/trait.FixedSizeArray.html
41     // Allows for traits of fixed size arrays
42 
43     // Implements an [inversion list.](https://en.wikipedia.org/wiki/Inversion_list)
44     inv_list: ZeroVec<'data, PotentialCodePoint>,
45     size: u32,
46 }
47 
48 #[cfg(feature = "serde")]
49 impl<'de: 'a, 'a> serde::Deserialize<'de> for CodePointInversionList<'a> {
deserialize<D>(deserializer: D) -> Result<Self, D::Error> where D: serde::Deserializer<'de>,50     fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
51     where
52         D: serde::Deserializer<'de>,
53     {
54         use serde::de::Error;
55 
56         let parsed_inv_list = if deserializer.is_human_readable() {
57             let parsed_strings = Vec::<alloc::borrow::Cow<'de, str>>::deserialize(deserializer)?;
58             let mut inv_list = ZeroVec::new_owned(Vec::with_capacity(parsed_strings.len() * 2));
59             for range in parsed_strings {
60                 fn internal(range: &str) -> Option<(u32, u32)> {
61                     let (start, range) = UnicodeCodePoint::parse(range)?;
62                     if range.is_empty() {
63                         return Some((start.0, start.0));
64                     }
65                     let (hyphen, range) = UnicodeCodePoint::parse(range)?;
66                     if hyphen.0 != '-' as u32 {
67                         return None;
68                     }
69                     let (end, range) = UnicodeCodePoint::parse(range)?;
70                     range.is_empty().then_some((start.0, end.0))
71                 }
72                 let (start, end) = internal(&range).ok_or_else(|| Error::custom(format!(
73                     "Cannot deserialize invalid inversion list for CodePointInversionList: {range:?}"
74                 )))?;
75                 inv_list.with_mut(|v| {
76                     v.push(PotentialCodePoint::from_u24(start).to_unaligned());
77                     v.push(PotentialCodePoint::from_u24(end + 1).to_unaligned());
78                 });
79             }
80             inv_list
81         } else {
82             ZeroVec::<PotentialCodePoint>::deserialize(deserializer)?
83         };
84         CodePointInversionList::try_from_inversion_list(parsed_inv_list).map_err(|e| {
85             Error::custom(format!(
86                 "Cannot deserialize invalid inversion list for CodePointInversionList: {e:?}"
87             ))
88         })
89     }
90 }
91 
92 #[cfg(feature = "databake")]
93 impl databake::Bake for CodePointInversionList<'_> {
bake(&self, env: &databake::CrateEnv) -> databake::TokenStream94     fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
95         env.insert("icu_collections");
96         let inv_list = self.inv_list.bake(env);
97         let size = self.size.bake(env);
98         // Safe because our parts are safe.
99         databake::quote! { unsafe {
100             #[allow(unused_unsafe)]
101             icu_collections::codepointinvlist::CodePointInversionList::from_parts_unchecked(#inv_list, #size)
102         }}
103     }
104 }
105 
106 #[cfg(feature = "databake")]
107 impl databake::BakeSize for CodePointInversionList<'_> {
borrows_size(&self) -> usize108     fn borrows_size(&self) -> usize {
109         self.inv_list.borrows_size()
110     }
111 }
112 
113 #[cfg(feature = "serde")]
114 #[derive(Debug, Copy, Clone)]
115 struct UnicodeCodePoint(u32);
116 
117 #[cfg(feature = "serde")]
118 impl UnicodeCodePoint {
from_u32(cp: u32) -> Result<Self, String>119     fn from_u32(cp: u32) -> Result<Self, String> {
120         if cp <= char::MAX as u32 {
121             Ok(Self(cp))
122         } else {
123             Err(format!("Not a Unicode code point {}", cp))
124         }
125     }
126 
parse(value: &str) -> Option<(Self, &str)>127     fn parse(value: &str) -> Option<(Self, &str)> {
128         Some(if let Some(hex) = value.strip_prefix("U+") {
129             let (escape, remainder) = (hex.get(..4)?, hex.get(4..)?);
130             (Self(u32::from_str_radix(escape, 16).ok()?), remainder)
131         } else {
132             let c = value.chars().next()?;
133             (Self(c as u32), value.get(c.len_utf8()..)?)
134         })
135     }
136 }
137 
138 #[cfg(feature = "serde")]
139 impl core::fmt::Display for UnicodeCodePoint {
fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result140     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
141         match self.0 {
142             s @ 0xD800..=0xDFFF => write!(f, "U+{s:X}"),
143             // char should be in range by construction but this code is not so performance-sensitive
144             // so we just use the replacement character
145             c => write!(
146                 f,
147                 "{}",
148                 char::from_u32(c).unwrap_or(char::REPLACEMENT_CHARACTER)
149             ),
150         }
151     }
152 }
153 
154 #[cfg(feature = "serde")]
155 impl serde::Serialize for CodePointInversionList<'_> {
serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> where S: serde::Serializer,156     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
157     where
158         S: serde::Serializer,
159     {
160         if serializer.is_human_readable() {
161             use serde::ser::Error;
162             use serde::ser::SerializeSeq;
163             let mut seq = serializer.serialize_seq(Some(self.inv_list.len() / 2))?;
164             for range in self.iter_ranges() {
165                 let start = UnicodeCodePoint::from_u32(*range.start()).map_err(S::Error::custom)?;
166                 if range.start() == range.end() {
167                     seq.serialize_element(&format!("{start}"))?;
168                 } else {
169                     let end = UnicodeCodePoint::from_u32(*range.end()).map_err(S::Error::custom)?;
170                     seq.serialize_element(&format!("{start}-{end}",))?;
171                 }
172             }
173             seq.end()
174         } else {
175             // Note: serde(flatten) currently does not promote a struct field of type Vec
176             // to replace the struct when serializing. The error message from the default
177             // serialization is: "can only flatten structs and maps (got a sequence)".
178             self.inv_list.serialize(serializer)
179         }
180     }
181 }
182 
183 impl<'data> CodePointInversionList<'data> {
184     /// Returns a new [`CodePointInversionList`] from an [inversion list](https://en.wikipedia.org/wiki/Inversion_list)
185     /// represented as a [`ZeroVec`]`<`[`PotentialCodePoint`]`>` of code points.
186     ///
187     /// The inversion list must be of even length, sorted ascending non-overlapping,
188     /// and within the bounds of `0x0 -> 0x10FFFF` inclusive, and end points being exclusive.
189     ///
190     /// # Examples
191     ///
192     /// ```
193     /// use icu::collections::codepointinvlist::CodePointInversionList;
194     /// use icu::collections::codepointinvlist::InvalidSetError;
195     /// use potential_utf::PotentialCodePoint;
196     /// use zerovec::ZeroVec;
197     ///
198     /// let valid = [0x0, 0x10000];
199     /// let inv_list: ZeroVec<PotentialCodePoint> = valid
200     ///     .into_iter()
201     ///     .map(PotentialCodePoint::from_u24)
202     ///     .collect();
203     /// let result = CodePointInversionList::try_from_inversion_list(inv_list);
204     /// assert!(matches!(result, CodePointInversionList));
205     ///
206     /// let invalid = vec![0x0, 0x80, 0x3];
207     /// let inv_list: ZeroVec<PotentialCodePoint> = invalid
208     ///     .iter()
209     ///     .copied()
210     ///     .map(PotentialCodePoint::from_u24)
211     ///     .collect();
212     /// let result = CodePointInversionList::try_from_inversion_list(inv_list);
213     /// assert!(matches!(result, Err(InvalidSetError(_))));
214     /// if let Err(InvalidSetError(actual)) = result {
215     ///     assert_eq!(
216     ///         &invalid,
217     ///         &actual.into_iter().map(u32::from).collect::<Vec<_>>()
218     ///     );
219     /// }
220     /// ```
try_from_inversion_list( inv_list: ZeroVec<'data, PotentialCodePoint>, ) -> Result<Self, InvalidSetError>221     pub fn try_from_inversion_list(
222         inv_list: ZeroVec<'data, PotentialCodePoint>,
223     ) -> Result<Self, InvalidSetError> {
224         #[allow(clippy::indexing_slicing)] // chunks
225         if is_valid_zv(&inv_list) {
226             let size = inv_list
227                 .as_ule_slice()
228                 .chunks(2)
229                 .map(|end_points| {
230                     u32::from(<PotentialCodePoint as AsULE>::from_unaligned(end_points[1]))
231                         - u32::from(<PotentialCodePoint as AsULE>::from_unaligned(end_points[0]))
232                 })
233                 .sum::<u32>();
234             Ok(Self { inv_list, size })
235         } else {
236             Err(InvalidSetError(
237                 #[cfg(feature = "alloc")]
238                 inv_list.to_vec(),
239             ))
240         }
241     }
242 
243     /// Safety: no actual safety invariants, however has correctness invariants
244     #[doc(hidden)] // databake internal
from_parts_unchecked( inv_list: ZeroVec<'data, PotentialCodePoint>, size: u32, ) -> Self245     pub const unsafe fn from_parts_unchecked(
246         inv_list: ZeroVec<'data, PotentialCodePoint>,
247         size: u32,
248     ) -> Self {
249         Self { inv_list, size }
250     }
251 
252     /// Returns a new, fully-owned [`CodePointInversionList`] by cloning an [inversion list](https://en.wikipedia.org/wiki/Inversion_list)
253     /// represented as a slice of [`PotentialCodePoint`] code points.
254     ///
255     /// The inversion list must be of even length, sorted ascending non-overlapping,
256     /// and within the bounds of `0x0 -> 0x10FFFF` inclusive, and end points being exclusive.
257     ///
258     /// # Examples
259     ///
260     /// ```
261     /// use icu::collections::codepointinvlist::CodePointInversionList;
262     ///
263     /// let bmp_list = &[0x0, 0x10000];
264     /// let smp_list = &[0x10000, 0x20000];
265     /// let sip_list = &[0x20000, 0x30000];
266     ///
267     /// let lists: Vec<CodePointInversionList> =
268     ///     [&bmp_list[..], smp_list, sip_list]
269     ///         .into_iter()
270     ///         .map(|l| {
271     ///             CodePointInversionList::try_from_u32_inversion_list_slice(l)
272     ///                 .unwrap()
273     ///         })
274     ///         .collect();
275     ///
276     /// let bmp = &lists[0];
277     /// assert!(bmp.contains32(0xFFFF));
278     /// assert!(!bmp.contains32(0x10000));
279     ///
280     /// assert!(!lists.iter().any(|set| set.contains32(0x40000)));
281     /// ```
282     #[cfg(feature = "alloc")]
try_from_u32_inversion_list_slice(inv_list: &[u32]) -> Result<Self, InvalidSetError>283     pub fn try_from_u32_inversion_list_slice(inv_list: &[u32]) -> Result<Self, InvalidSetError> {
284         let inv_list_zv: ZeroVec<PotentialCodePoint> = inv_list
285             .iter()
286             .copied()
287             .map(PotentialCodePoint::from_u24)
288             .collect();
289         CodePointInversionList::try_from_inversion_list(inv_list_zv)
290     }
291 
292     /// Attempts to convert this list into a fully-owned one. No-op if already fully owned
293     #[cfg(feature = "alloc")]
into_owned(self) -> CodePointInversionList<'static>294     pub fn into_owned(self) -> CodePointInversionList<'static> {
295         CodePointInversionList {
296             inv_list: self.inv_list.into_owned(),
297             size: self.size,
298         }
299     }
300 
301     /// Returns an owned inversion list representing the current [`CodePointInversionList`]
302     #[cfg(feature = "alloc")]
get_inversion_list_vec(&self) -> Vec<u32>303     pub fn get_inversion_list_vec(&self) -> Vec<u32> {
304         self.as_inversion_list().iter().map(u32::from).collect()
305     }
306 
307     /// Returns [`CodePointInversionList`] spanning entire Unicode range
308     ///
309     /// The range spans from `0x0 -> 0x10FFFF` inclusive.
310     ///
311     /// # Examples
312     ///
313     /// ```
314     /// use icu::collections::codepointinvlist::CodePointInversionList;
315     ///
316     /// let expected = [0x0, (char::MAX as u32) + 1];
317     /// assert_eq!(
318     ///     CodePointInversionList::all().get_inversion_list_vec(),
319     ///     expected
320     /// );
321     /// assert_eq!(
322     ///     CodePointInversionList::all().size(),
323     ///     (expected[1] - expected[0]) as usize
324     /// );
325     /// ```
all() -> Self326     pub fn all() -> Self {
327         Self {
328             inv_list: ALL_VEC,
329             size: (char::MAX as u32) + 1,
330         }
331     }
332 
333     /// Returns [`CodePointInversionList`] spanning BMP range
334     ///
335     /// The range spans from `0x0 -> 0xFFFF` inclusive.
336     ///
337     /// # Examples
338     ///
339     /// ```
340     /// use icu::collections::codepointinvlist::CodePointInversionList;
341     ///
342     /// const BMP_MAX: u32 = 0xFFFF;
343     ///
344     /// let expected = [0x0, BMP_MAX + 1];
345     /// assert_eq!(
346     ///     CodePointInversionList::bmp().get_inversion_list_vec(),
347     ///     expected
348     /// );
349     /// assert_eq!(
350     ///     CodePointInversionList::bmp().size(),
351     ///     (expected[1] - expected[0]) as usize
352     /// );
353     /// ```
bmp() -> Self354     pub fn bmp() -> Self {
355         Self {
356             inv_list: BMP_INV_LIST_VEC,
357             size: BMP_MAX + 1,
358         }
359     }
360 
361     /// Returns the inversion list as a slice
362     ///
363     /// Public only to the crate, not exposed to public
364     #[cfg(feature = "alloc")]
as_inversion_list(&self) -> &ZeroVec<PotentialCodePoint>365     pub(crate) fn as_inversion_list(&self) -> &ZeroVec<PotentialCodePoint> {
366         &self.inv_list
367     }
368 
369     /// Yields an [`Iterator`] going through the character set in the [`CodePointInversionList`]
370     ///
371     /// # Examples
372     ///
373     /// ```
374     /// use icu::collections::codepointinvlist::CodePointInversionList;
375     /// let example_list = [0x41, 0x44, 0x45, 0x46];
376     /// let example = CodePointInversionList::try_from_u32_inversion_list_slice(
377     ///     &example_list,
378     /// )
379     /// .unwrap();
380     /// let mut ex_iter_chars = example.iter_chars();
381     /// assert_eq!(Some('A'), ex_iter_chars.next());
382     /// assert_eq!(Some('B'), ex_iter_chars.next());
383     /// assert_eq!(Some('C'), ex_iter_chars.next());
384     /// assert_eq!(Some('E'), ex_iter_chars.next());
385     /// assert_eq!(None, ex_iter_chars.next());
386     /// ```
iter_chars(&self) -> impl Iterator<Item = char> + '_387     pub fn iter_chars(&self) -> impl Iterator<Item = char> + '_ {
388         #[allow(clippy::indexing_slicing)] // chunks
389         self.inv_list
390             .as_ule_slice()
391             .chunks(2)
392             .flat_map(|pair| {
393                 u32::from(PotentialCodePoint::from_unaligned(pair[0]))
394                     ..u32::from(PotentialCodePoint::from_unaligned(pair[1]))
395             })
396             .filter_map(char::from_u32)
397     }
398 
399     /// Yields an [`Iterator`] returning the ranges of the code points that are
400     /// included in the [`CodePointInversionList`]
401     ///
402     /// Ranges are returned as [`RangeInclusive`], which is inclusive of its
403     /// `end` bound value. An end-inclusive behavior matches the ICU4C/J
404     /// behavior of ranges, ex: `CodePointInversionList::contains(UChar32 start, UChar32 end)`.
405     ///
406     /// # Example
407     ///
408     /// ```
409     /// use icu::collections::codepointinvlist::CodePointInversionList;
410     /// let example_list = [0x41, 0x44, 0x45, 0x46];
411     /// let example = CodePointInversionList::try_from_u32_inversion_list_slice(
412     ///     &example_list,
413     /// )
414     /// .unwrap();
415     /// let mut example_iter_ranges = example.iter_ranges();
416     /// assert_eq!(Some(0x41..=0x43), example_iter_ranges.next());
417     /// assert_eq!(Some(0x45..=0x45), example_iter_ranges.next());
418     /// assert_eq!(None, example_iter_ranges.next());
419     /// ```
iter_ranges(&self) -> impl ExactSizeIterator<Item = RangeInclusive<u32>> + '_420     pub fn iter_ranges(&self) -> impl ExactSizeIterator<Item = RangeInclusive<u32>> + '_ {
421         #[allow(clippy::indexing_slicing)] // chunks
422         self.inv_list.as_ule_slice().chunks(2).map(|pair| {
423             let range_start = u32::from(PotentialCodePoint::from_unaligned(pair[0]));
424             let range_limit = u32::from(PotentialCodePoint::from_unaligned(pair[1]));
425             range_start..=(range_limit - 1)
426         })
427     }
428 
429     /// Yields an [`Iterator`] returning the ranges of the code points that are
430     /// *not* included in the [`CodePointInversionList`]
431     ///
432     /// Ranges are returned as [`RangeInclusive`], which is inclusive of its
433     /// `end` bound value. An end-inclusive behavior matches the ICU4C/J
434     /// behavior of ranges, ex: `CodePointInversionList::contains(UChar32 start, UChar32 end)`.
435     ///
436     /// # Example
437     ///
438     /// ```
439     /// use icu::collections::codepointinvlist::CodePointInversionList;
440     /// let example_list = [0x41, 0x44, 0x45, 0x46];
441     /// let example = CodePointInversionList::try_from_u32_inversion_list_slice(
442     ///     &example_list,
443     /// )
444     /// .unwrap();
445     /// let mut example_iter_ranges = example.iter_ranges_complemented();
446     /// assert_eq!(Some(0..=0x40), example_iter_ranges.next());
447     /// assert_eq!(Some(0x44..=0x44), example_iter_ranges.next());
448     /// assert_eq!(Some(0x46..=char::MAX as u32), example_iter_ranges.next());
449     /// assert_eq!(None, example_iter_ranges.next());
450     /// ```
iter_ranges_complemented(&self) -> impl Iterator<Item = RangeInclusive<u32>> + '_451     pub fn iter_ranges_complemented(&self) -> impl Iterator<Item = RangeInclusive<u32>> + '_ {
452         let inv_ule = self.inv_list.as_ule_slice();
453         let middle = inv_ule.get(1..inv_ule.len() - 1).unwrap_or(&[]);
454         let beginning = if let Some(first) = self.inv_list.first() {
455             let first = u32::from(first);
456             if first == 0 {
457                 None
458             } else {
459                 Some(0..=first - 1)
460             }
461         } else {
462             None
463         };
464         let end = if let Some(last) = self.inv_list.last() {
465             let last = u32::from(last);
466             if last == char::MAX as u32 {
467                 None
468             } else {
469                 Some(last..=char::MAX as u32)
470             }
471         } else {
472             None
473         };
474         #[allow(clippy::indexing_slicing)] // chunks
475         let chunks = middle.chunks(2).map(|pair| {
476             let range_start = u32::from(PotentialCodePoint::from_unaligned(pair[0]));
477             let range_limit = u32::from(PotentialCodePoint::from_unaligned(pair[1]));
478             range_start..=(range_limit - 1)
479         });
480         beginning.into_iter().chain(chunks).chain(end)
481     }
482 
483     /// Returns the number of ranges contained in this [`CodePointInversionList`]
get_range_count(&self) -> usize484     pub fn get_range_count(&self) -> usize {
485         self.inv_list.len() / 2
486     }
487 
488     /// Returns a specific range contained in this [`CodePointInversionList`] by index.
489     /// Intended for use in FFI.
get_nth_range(&self, idx: usize) -> Option<RangeInclusive<u32>>490     pub fn get_nth_range(&self, idx: usize) -> Option<RangeInclusive<u32>> {
491         let start_idx = idx * 2;
492         let end_idx = start_idx + 1;
493         let start = u32::from(self.inv_list.get(start_idx)?);
494         let end = u32::from(self.inv_list.get(end_idx)?);
495         Some(start..=(end - 1))
496     }
497 
498     /// Returns the number of elements of the [`CodePointInversionList`]
size(&self) -> usize499     pub fn size(&self) -> usize {
500         if self.is_empty() {
501             return 0;
502         }
503         self.size as usize
504     }
505 
506     /// Returns whether or not the [`CodePointInversionList`] is empty
is_empty(&self) -> bool507     pub fn is_empty(&self) -> bool {
508         self.inv_list.is_empty()
509     }
510 
511     /// Wrapper for contains
512     ///
513     /// Returns an [`Option`] as to whether or not it is possible for the query to be contained.
514     /// The value in the [`Option`] is the start index of the range that contains the query.
contains_query(&self, query: u32) -> Option<usize>515     fn contains_query(&self, query: u32) -> Option<usize> {
516         let query = PotentialCodePoint::try_from(query).ok()?;
517         match self.inv_list.binary_search(&query) {
518             Ok(pos) => {
519                 if pos % 2 == 0 {
520                     Some(pos)
521                 } else {
522                     None
523                 }
524             }
525             Err(pos) => {
526                 if pos % 2 != 0 && pos < self.inv_list.len() {
527                     Some(pos - 1)
528                 } else {
529                     None
530                 }
531             }
532         }
533     }
534 
535     /// Checks to see the query is in the [`CodePointInversionList`]
536     ///
537     /// Runs a binary search in `O(log(n))` where `n` is the number of start and end points
538     /// in the set using [`core`] implementation
539     ///
540     /// # Examples
541     ///
542     /// ```
543     /// use icu::collections::codepointinvlist::CodePointInversionList;
544     /// let example_list = [0x41, 0x43, 0x44, 0x45];
545     /// let example = CodePointInversionList::try_from_u32_inversion_list_slice(
546     ///     &example_list,
547     /// )
548     /// .unwrap();
549     /// assert!(example.contains('A'));
550     /// assert!(!example.contains('C'));
551     /// ```
contains(&self, query: char) -> bool552     pub fn contains(&self, query: char) -> bool {
553         self.contains_query(query as u32).is_some()
554     }
555 
556     /// Checks to see the unsigned int is in the [`CodePointInversionList::all()`](CodePointInversionList::all())
557     ///
558     /// Note: Even though [`u32`] and [`prim@char`] in Rust are non-negative 4-byte
559     /// values, there is an important difference. A [`u32`] can take values up to
560     /// a very large integer value, while a [`prim@char`] in Rust is defined to be in
561     /// the range from 0 to the maximum valid Unicode Scalar Value.
562     ///
563     /// Runs a binary search in `O(log(n))` where `n` is the number of start and end points
564     /// in the set using [`core`] implementation
565     ///
566     /// # Examples
567     ///
568     /// ```
569     /// use icu::collections::codepointinvlist::CodePointInversionList;
570     /// let example_list = [0x41, 0x43, 0x44, 0x45];
571     /// let example = CodePointInversionList::try_from_u32_inversion_list_slice(
572     ///     &example_list,
573     /// )
574     /// .unwrap();
575     /// assert!(example.contains32(0x41));
576     /// assert!(!example.contains32(0x43));
577     /// ```
contains32(&self, query: u32) -> bool578     pub fn contains32(&self, query: u32) -> bool {
579         self.contains_query(query).is_some()
580     }
581 
582     /// Checks to see if the range is in the [`CodePointInversionList`]
583     ///
584     /// Runs a binary search in `O(log(n))` where `n` is the number of start and end points
585     /// in the set using [`Vec`] implementation. Only runs the search once on the `start`
586     /// parameter, while the `end` parameter is checked in a single `O(1)` step.
587     ///
588     /// # Examples
589     ///
590     /// ```
591     /// use icu::collections::codepointinvlist::CodePointInversionList;
592     /// let example_list = [0x41, 0x43, 0x44, 0x45];
593     /// let example = CodePointInversionList::try_from_u32_inversion_list_slice(
594     ///     &example_list,
595     /// )
596     /// .unwrap();
597     /// assert!(example.contains_range('A'..'C'));
598     /// assert!(example.contains_range('A'..='B'));
599     /// assert!(!example.contains_range('A'..='C'));
600     /// ```
601     ///
602     /// Surrogate points (`0xD800 -> 0xDFFF`) will return [`false`] if the Range contains them but the
603     /// [`CodePointInversionList`] does not.
604     ///
605     /// Note: when comparing to ICU4C/J, keep in mind that `Range`s in Rust are
606     /// constructed inclusive of start boundary and exclusive of end boundary.
607     /// The ICU4C/J `CodePointInversionList::contains(UChar32 start, UChar32 end)` method
608     /// differs by including the end boundary.
609     ///
610     /// # Examples
611     ///
612     /// ```
613     /// use icu::collections::codepointinvlist::CodePointInversionList;
614     /// use std::char;
615     /// let check =
616     ///     char::from_u32(0xD7FE).unwrap()..char::from_u32(0xE001).unwrap();
617     /// let example_list = [0xD7FE, 0xD7FF, 0xE000, 0xE001];
618     /// let example = CodePointInversionList::try_from_u32_inversion_list_slice(
619     ///     &example_list,
620     /// )
621     /// .unwrap();
622     /// assert!(!example.contains_range(check));
623     /// ```
contains_range(&self, range: impl RangeBounds<char>) -> bool624     pub fn contains_range(&self, range: impl RangeBounds<char>) -> bool {
625         let (from, till) = deconstruct_range(range);
626         if from >= till {
627             return false;
628         }
629         match self.contains_query(from) {
630             Some(pos) => {
631                 if let Some(x) = self.inv_list.get(pos + 1) {
632                     (till) <= x.into()
633                 } else {
634                     debug_assert!(
635                         false,
636                         "Inversion list query should not return out of bounds index"
637                     );
638                     false
639                 }
640             }
641             None => false,
642         }
643     }
644 
645     /// Check if the calling [`CodePointInversionList`] contains all the characters of the given [`CodePointInversionList`]
646     ///
647     /// # Examples
648     ///
649     /// ```
650     /// use icu::collections::codepointinvlist::CodePointInversionList;
651     /// let example_list = [0x41, 0x46, 0x55, 0x5B]; // A - E, U - Z
652     /// let example = CodePointInversionList::try_from_u32_inversion_list_slice(
653     ///     &example_list,
654     /// )
655     /// .unwrap();
656     /// let a_to_d = CodePointInversionList::try_from_u32_inversion_list_slice(&[
657     ///     0x41, 0x45,
658     /// ])
659     /// .unwrap();
660     /// let f_to_t = CodePointInversionList::try_from_u32_inversion_list_slice(&[
661     ///     0x46, 0x55,
662     /// ])
663     /// .unwrap();
664     /// let r_to_x = CodePointInversionList::try_from_u32_inversion_list_slice(&[
665     ///     0x52, 0x58,
666     /// ])
667     /// .unwrap();
668     /// assert!(example.contains_set(&a_to_d)); // contains all
669     /// assert!(!example.contains_set(&f_to_t)); // contains none
670     /// assert!(!example.contains_set(&r_to_x)); // contains some
671     /// ```
contains_set(&self, set: &Self) -> bool672     pub fn contains_set(&self, set: &Self) -> bool {
673         if set.size() > self.size() {
674             return false;
675         }
676 
677         let mut set_ranges = set.iter_ranges();
678         let mut check_elem = set_ranges.next();
679 
680         let ranges = self.iter_ranges();
681         for range in ranges {
682             match check_elem {
683                 Some(ref check_range) => {
684                     if check_range.start() >= range.start()
685                         && check_range.end() <= &(range.end() + 1)
686                     {
687                         check_elem = set_ranges.next();
688                     }
689                 }
690                 _ => break,
691             }
692         }
693         check_elem.is_none()
694     }
695 
696     /// Returns the end of the initial substring where the characters are either contained/not contained
697     /// in the set.
698     ///
699     /// # Examples
700     ///
701     /// ```
702     /// use icu::collections::codepointinvlist::CodePointInversionList;
703     /// let example_list = [0x41, 0x44]; // {A, B, C}
704     /// let example = CodePointInversionList::try_from_u32_inversion_list_slice(
705     ///     &example_list,
706     /// )
707     /// .unwrap();
708     /// assert_eq!(example.span("CABXYZ", true), 3);
709     /// assert_eq!(example.span("XYZC", false), 3);
710     /// assert_eq!(example.span("XYZ", true), 0);
711     /// assert_eq!(example.span("ABC", false), 0);
712     /// ```
span(&self, span_str: &str, contained: bool) -> usize713     pub fn span(&self, span_str: &str, contained: bool) -> usize {
714         span_str
715             .chars()
716             .take_while(|&x| self.contains(x) == contained)
717             .count()
718     }
719 
720     /// Returns the start of the trailing substring (starting from end of string) where the characters are
721     /// either contained/not contained in the set. Returns the length of the string if no valid return.
722     ///
723     /// # Examples
724     ///
725     /// ```
726     /// use icu::collections::codepointinvlist::CodePointInversionList;
727     /// let example_list = [0x41, 0x44]; // {A, B, C}
728     /// let example = CodePointInversionList::try_from_u32_inversion_list_slice(
729     ///     &example_list,
730     /// )
731     /// .unwrap();
732     /// assert_eq!(example.span_back("XYZCAB", true), 3);
733     /// assert_eq!(example.span_back("ABCXYZ", true), 6);
734     /// assert_eq!(example.span_back("CABXYZ", false), 3);
735     /// ```
span_back(&self, span_str: &str, contained: bool) -> usize736     pub fn span_back(&self, span_str: &str, contained: bool) -> usize {
737         span_str.len()
738             - span_str
739                 .chars()
740                 .rev()
741                 .take_while(|&x| self.contains(x) == contained)
742                 .count()
743     }
744 }
745 
746 #[cfg(test)]
747 mod tests {
748     use super::{CodePointInversionList, InvalidSetError};
749     use std::{char, vec::Vec};
750     use zerovec::ZeroVec;
751 
752     #[test]
test_codepointinversionlist_try_from_vec()753     fn test_codepointinversionlist_try_from_vec() {
754         let ex = vec![0x2, 0x3, 0x4, 0x5];
755         let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
756         assert_eq!(ex, check.get_inversion_list_vec());
757         assert_eq!(2, check.size());
758     }
759 
760     #[test]
test_codepointinversionlist_try_from_vec_error()761     fn test_codepointinversionlist_try_from_vec_error() {
762         let check = vec![0x1, 0x1, 0x2, 0x3, 0x4];
763         let set = CodePointInversionList::try_from_u32_inversion_list_slice(&check);
764         assert!(matches!(set, Err(InvalidSetError(_))));
765         if let Err(InvalidSetError(actual)) = set {
766             assert_eq!(
767                 &check,
768                 &actual.into_iter().map(u32::from).collect::<Vec<_>>()
769             );
770         }
771     }
772 
773     // CodePointInversionList membership functions
774     #[test]
test_codepointinversionlist_contains_query()775     fn test_codepointinversionlist_contains_query() {
776         let ex = vec![0x41, 0x46, 0x4B, 0x55];
777         let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
778         assert!(check.contains_query(0x40).is_none());
779         assert_eq!(check.contains_query(0x41).unwrap(), 0);
780         assert_eq!(check.contains_query(0x44).unwrap(), 0);
781         assert!(check.contains_query(0x46).is_none());
782         assert_eq!(check.contains_query(0x4C).unwrap(), 2);
783         assert!(check.contains_query(0x56).is_none());
784     }
785 
786     #[test]
test_codepointinversionlist_contains()787     fn test_codepointinversionlist_contains() {
788         let ex = vec![0x2, 0x5, 0xA, 0xF];
789         let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
790         assert!(check.contains(0x2 as char));
791         assert!(check.contains(0x4 as char));
792         assert!(check.contains(0xA as char));
793         assert!(check.contains(0xE as char));
794     }
795 
796     #[test]
test_codepointinversionlist_contains_false()797     fn test_codepointinversionlist_contains_false() {
798         let ex = vec![0x2, 0x5, 0xA, 0xF];
799         let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
800         assert!(!check.contains(0x1 as char));
801         assert!(!check.contains(0x5 as char));
802         assert!(!check.contains(0x9 as char));
803         assert!(!check.contains(0xF as char));
804         assert!(!check.contains(0x10 as char));
805     }
806 
807     #[test]
test_codepointinversionlist_contains_range()808     fn test_codepointinversionlist_contains_range() {
809         let ex = vec![0x41, 0x46, 0x4B, 0x55];
810         let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
811         assert!(check.contains_range('A'..='E')); // 65 - 69
812         assert!(check.contains_range('C'..'D')); // 67 - 67
813         assert!(check.contains_range('L'..'P')); // 76 - 80
814         assert!(!check.contains_range('L'..='U')); // 76 - 85
815     }
816 
817     #[test]
test_codepointinversionlist_contains_range_false()818     fn test_codepointinversionlist_contains_range_false() {
819         let ex = vec![0x41, 0x46, 0x4B, 0x55];
820         let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
821         assert!(!check.contains_range('!'..'A')); // 33 - 65
822         assert!(!check.contains_range('F'..'K')); // 70 - 74
823         assert!(!check.contains_range('U'..)); // 85 - ..
824     }
825 
826     #[test]
test_codepointinversionlist_contains_range_invalid()827     fn test_codepointinversionlist_contains_range_invalid() {
828         let check = CodePointInversionList::all();
829         assert!(!check.contains_range('A'..'!')); // 65 - 33
830         assert!(!check.contains_range('A'..'A')); // 65 - 65
831     }
832 
833     #[test]
test_codepointinversionlist_contains_set_u()834     fn test_codepointinversionlist_contains_set_u() {
835         let ex = vec![0xA, 0x14, 0x28, 0x32, 0x46, 0x50, 0x64, 0x6E];
836         let u = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
837         let inside = vec![0xF, 0x14, 0x2C, 0x31, 0x46, 0x50, 0x64, 0x6D];
838         let s = CodePointInversionList::try_from_u32_inversion_list_slice(&inside).unwrap();
839         assert!(u.contains_set(&s));
840     }
841 
842     #[test]
test_codepointinversionlist_contains_set_u_false()843     fn test_codepointinversionlist_contains_set_u_false() {
844         let ex = vec![0xA, 0x14, 0x28, 0x32, 0x46, 0x50, 0x64, 0x78];
845         let u = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
846         let outside = vec![0x0, 0xA, 0x16, 0x2C, 0x32, 0x46, 0x4F, 0x51, 0x6D, 0x6F];
847         let s = CodePointInversionList::try_from_u32_inversion_list_slice(&outside).unwrap();
848         assert!(!u.contains_set(&s));
849     }
850 
851     #[test]
test_codepointinversionlist_size()852     fn test_codepointinversionlist_size() {
853         let ex = vec![0x2, 0x5, 0xA, 0xF];
854         let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
855         assert_eq!(8, check.size());
856         let check = CodePointInversionList::all();
857         let expected = (char::MAX as u32) + 1;
858         assert_eq!(expected as usize, check.size());
859         let inv_list_vec = vec![];
860         let check = CodePointInversionList {
861             inv_list: ZeroVec::from_slice_or_alloc(&inv_list_vec),
862             size: 0,
863         };
864         assert_eq!(check.size(), 0);
865     }
866 
867     #[test]
test_codepointinversionlist_is_empty()868     fn test_codepointinversionlist_is_empty() {
869         let inv_list_vec = vec![];
870         let check = CodePointInversionList {
871             inv_list: ZeroVec::from_slice_or_alloc(&inv_list_vec),
872             size: 0,
873         };
874         assert!(check.is_empty());
875     }
876 
877     #[test]
test_codepointinversionlist_is_not_empty()878     fn test_codepointinversionlist_is_not_empty() {
879         let check = CodePointInversionList::all();
880         assert!(!check.is_empty());
881     }
882 
883     #[test]
test_codepointinversionlist_iter_chars()884     fn test_codepointinversionlist_iter_chars() {
885         let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801];
886         let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
887         let mut iter = check.iter_chars();
888         assert_eq!(Some('A'), iter.next());
889         assert_eq!(Some('B'), iter.next());
890         assert_eq!(Some('C'), iter.next());
891         assert_eq!(Some('E'), iter.next());
892         assert_eq!(None, iter.next());
893     }
894 
895     #[test]
test_codepointinversionlist_iter_ranges()896     fn test_codepointinversionlist_iter_ranges() {
897         let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801];
898         let set = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
899         let mut ranges = set.iter_ranges();
900         assert_eq!(Some(0x41..=0x43), ranges.next());
901         assert_eq!(Some(0x45..=0x45), ranges.next());
902         assert_eq!(Some(0xD800..=0xD800), ranges.next());
903         assert_eq!(None, ranges.next());
904     }
905 
906     #[test]
test_codepointinversionlist_iter_ranges_exactsizeiter_trait()907     fn test_codepointinversionlist_iter_ranges_exactsizeiter_trait() {
908         let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801];
909         let set = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
910         let ranges = set.iter_ranges();
911         assert_eq!(3, ranges.len());
912     }
913 
914     #[test]
test_codepointinversionlist_range_count()915     fn test_codepointinversionlist_range_count() {
916         let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801];
917         let set = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
918         assert_eq!(3, set.get_range_count());
919     }
920 
921     #[test]
test_codepointinversionlist_get_nth_range()922     fn test_codepointinversionlist_get_nth_range() {
923         let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801];
924         let set = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
925         assert_eq!(Some(0x41..=0x43), set.get_nth_range(0));
926         assert_eq!(Some(0x45..=0x45), set.get_nth_range(1));
927         assert_eq!(Some(0xD800..=0xD800), set.get_nth_range(2));
928         assert_eq!(None, set.get_nth_range(3));
929     }
930 
931     // Range<char> cannot represent the upper bound (non-inclusive) for
932     // char::MAX, whereas Range<u32> can.
933     #[test]
test_codepointinversionlist_iter_ranges_with_max_code_point()934     fn test_codepointinversionlist_iter_ranges_with_max_code_point() {
935         let ex = vec![0x80, (char::MAX as u32) + 1];
936         let set = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
937         let mut ranges = set.iter_ranges();
938         assert_eq!(Some(0x80..=(char::MAX as u32)), ranges.next());
939         assert_eq!(None, ranges.next());
940     }
941 
942     #[test]
test_codepointinversionlist_span_contains()943     fn test_codepointinversionlist_span_contains() {
944         let ex = vec![0x41, 0x44, 0x46, 0x4B]; // A - D, F - K
945         let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
946         assert_eq!(check.span("ABCDE", true), 3);
947         assert_eq!(check.span("E", true), 0);
948     }
949 
950     #[test]
test_codepointinversionlist_span_does_not_contain()951     fn test_codepointinversionlist_span_does_not_contain() {
952         let ex = vec![0x41, 0x44, 0x46, 0x4B]; // A - D, F - K
953         let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
954         assert_eq!(check.span("DEF", false), 2);
955         assert_eq!(check.span("KLMA", false), 3);
956     }
957 
958     #[test]
test_codepointinversionlist_span_back_contains()959     fn test_codepointinversionlist_span_back_contains() {
960         let ex = vec![0x41, 0x44, 0x46, 0x4B]; // A - D, F - K
961         let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
962         assert_eq!(check.span_back("XYZABFH", true), 3);
963         assert_eq!(check.span_back("ABCXYZ", true), 6);
964     }
965 
966     #[test]
test_codepointinversionlist_span_back_does_not_contain()967     fn test_codepointinversionlist_span_back_does_not_contain() {
968         let ex = vec![0x41, 0x44, 0x46, 0x4B]; // A - D, F - K
969         let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
970         assert_eq!(check.span_back("ABCXYZ", false), 3);
971         assert_eq!(check.span_back("XYZABC", false), 6);
972     }
973 
974     #[test]
test_uniset_to_inv_list()975     fn test_uniset_to_inv_list() {
976         let inv_list = [
977             0x9, 0xE, 0x20, 0x21, 0x85, 0x86, 0xA0, 0xA1, 0x1626, 0x1627, 0x2000, 0x2003, 0x2028,
978             0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,
979         ];
980         let s: CodePointInversionList =
981             CodePointInversionList::try_from_u32_inversion_list_slice(&inv_list).unwrap();
982         let round_trip_inv_list = s.get_inversion_list_vec();
983         assert_eq!(
984             round_trip_inv_list.into_iter().collect::<ZeroVec<u32>>(),
985             inv_list
986         );
987     }
988 
989     #[test]
test_serde_serialize()990     fn test_serde_serialize() {
991         let inv_list = [0x41, 0x46, 0x4B, 0x55];
992         let uniset = CodePointInversionList::try_from_u32_inversion_list_slice(&inv_list).unwrap();
993         let json_str = serde_json::to_string(&uniset).unwrap();
994         assert_eq!(json_str, r#"["A-E","K-T"]"#);
995     }
996 
997     #[test]
test_serde_serialize_surrogates()998     fn test_serde_serialize_surrogates() {
999         let inv_list = [0xDFAB, 0xDFFF];
1000         let uniset = CodePointInversionList::try_from_u32_inversion_list_slice(&inv_list).unwrap();
1001         let json_str = serde_json::to_string(&uniset).unwrap();
1002         assert_eq!(json_str, r#"["U+DFAB-U+DFFE"]"#);
1003     }
1004 
1005     #[test]
test_serde_deserialize()1006     fn test_serde_deserialize() {
1007         let inv_list_str = r#"["A-E","K-T"]"#;
1008         let exp_inv_list = [0x41, 0x46, 0x4B, 0x55];
1009         let exp_uniset =
1010             CodePointInversionList::try_from_u32_inversion_list_slice(&exp_inv_list).unwrap();
1011         let act_uniset: CodePointInversionList = serde_json::from_str(inv_list_str).unwrap();
1012         assert_eq!(act_uniset, exp_uniset);
1013     }
1014 
1015     #[test]
test_serde_deserialize_surrogates()1016     fn test_serde_deserialize_surrogates() {
1017         let inv_list_str = r#"["U+DFAB-U+DFFE"]"#;
1018         let exp_inv_list = [0xDFAB, 0xDFFF];
1019         let exp_uniset =
1020             CodePointInversionList::try_from_u32_inversion_list_slice(&exp_inv_list).unwrap();
1021         let act_uniset: CodePointInversionList = serde_json::from_str(inv_list_str).unwrap();
1022         assert_eq!(act_uniset, exp_uniset);
1023     }
1024 
1025     #[test]
test_serde_deserialize_invalid()1026     fn test_serde_deserialize_invalid() {
1027         assert!(serde_json::from_str::<CodePointInversionList>("[65,70,98775,85]").is_err());
1028         assert!(serde_json::from_str::<CodePointInversionList>("[65,70,U+FFFFFFFFFF,85]").is_err());
1029     }
1030 
1031     #[test]
test_serde_with_postcard_roundtrip() -> Result<(), postcard::Error>1032     fn test_serde_with_postcard_roundtrip() -> Result<(), postcard::Error> {
1033         let set = CodePointInversionList::bmp();
1034         let set_serialized: Vec<u8> = postcard::to_allocvec(&set).unwrap();
1035         let set_deserialized: CodePointInversionList =
1036             postcard::from_bytes::<CodePointInversionList>(&set_serialized)?;
1037 
1038         assert_eq!(&set, &set_deserialized);
1039         assert!(!set_deserialized.inv_list.is_owned());
1040 
1041         Ok(())
1042     }
1043 
1044     #[test]
databake()1045     fn databake() {
1046         databake::test_bake!(
1047             CodePointInversionList<'static>,
1048             const,
1049             unsafe {
1050                 #[allow(unused_unsafe)]
1051                 crate::codepointinvlist::CodePointInversionList::from_parts_unchecked(
1052                     unsafe {
1053                         zerovec::ZeroVec::from_bytes_unchecked(
1054                             b"0\0\0\0:\0\0\0A\0\0\0G\0\0\0a\0\0\0g\0\0\0",
1055                         )
1056                     },
1057                     22u32,
1058                 )
1059             },
1060             icu_collections,
1061             [zerovec],
1062         );
1063     }
1064 }
1065