• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4 
5 //! This module provides functionality for querying of sets of Unicode code points and strings.
6 //!
7 //! It depends on [`CodePointInversionList`] to efficiently represent Unicode code points, while
8 //! it also maintains a list of strings in the set.
9 //!
10 //! It is an implementation of the existing [ICU4C UnicodeSet API](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1UnicodeSet.html).
11 
12 #[cfg(feature = "alloc")]
13 use crate::codepointinvlist::CodePointInversionListBuilder;
14 use crate::codepointinvlist::{CodePointInversionList, CodePointInversionListULE};
15 #[cfg(feature = "alloc")]
16 use alloc::string::{String, ToString};
17 #[cfg(feature = "alloc")]
18 use alloc::vec::Vec;
19 use displaydoc::Display;
20 use yoke::Yokeable;
21 use zerofrom::ZeroFrom;
22 use zerovec::{VarZeroSlice, VarZeroVec};
23 
24 /// A data structure providing a concrete implementation of a set of code points and strings,
25 /// using an inversion list for the code points.
26 ///
27 /// This is what ICU4C calls a `UnicodeSet`.
28 #[zerovec::make_varule(CodePointInversionListAndStringListULE)]
29 #[zerovec::skip_derive(Ord)]
30 #[zerovec::derive(Debug)]
31 #[derive(Debug, Eq, PartialEq, Clone, Yokeable, ZeroFrom)]
32 #[cfg_attr(not(feature = "alloc"), zerovec::skip_derive(ZeroMapKV, ToOwned))]
33 // Valid to auto-derive Deserialize because the invariants are weakly held
34 #[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
35 #[cfg_attr(feature = "serde", zerovec::derive(Serialize, Deserialize, Debug))]
36 pub struct CodePointInversionListAndStringList<'data> {
37     #[cfg_attr(feature = "serde", serde(borrow))]
38     #[zerovec::varule(CodePointInversionListULE)]
39     cp_inv_list: CodePointInversionList<'data>,
40     // Invariants (weakly held):
41     //   - no input string is length 1 (a length 1 string should be a single code point)
42     //   - the string list is sorted
43     //   - the elements in the string list are unique
44     #[cfg_attr(feature = "serde", serde(borrow))]
45     str_list: VarZeroVec<'data, str>,
46 }
47 
48 #[cfg(feature = "databake")]
49 impl databake::Bake for CodePointInversionListAndStringList<'_> {
bake(&self, env: &databake::CrateEnv) -> databake::TokenStream50     fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
51         env.insert("icu_collections");
52         let cp_inv_list = self.cp_inv_list.bake(env);
53         let str_list = self.str_list.bake(env);
54         // Safe because our parts are safe.
55         databake::quote! {
56             icu_collections::codepointinvliststringlist::CodePointInversionListAndStringList::from_parts_unchecked(#cp_inv_list, #str_list)
57         }
58     }
59 }
60 
61 #[cfg(feature = "databake")]
62 impl databake::BakeSize for CodePointInversionListAndStringList<'_> {
borrows_size(&self) -> usize63     fn borrows_size(&self) -> usize {
64         self.cp_inv_list.borrows_size() + self.str_list.borrows_size()
65     }
66 }
67 
68 impl<'data> CodePointInversionListAndStringList<'data> {
69     /// Returns a new [`CodePointInversionListAndStringList`] from both a [`CodePointInversionList`] for the
70     /// code points and a [`VarZeroVec`]`<`[`str`]`>` of strings.
try_from( cp_inv_list: CodePointInversionList<'data>, str_list: VarZeroVec<'data, str>, ) -> Result<Self, InvalidStringList>71     pub fn try_from(
72         cp_inv_list: CodePointInversionList<'data>,
73         str_list: VarZeroVec<'data, str>,
74     ) -> Result<Self, InvalidStringList> {
75         // Verify invariants:
76         // Do so by using the equivalent of str_list.iter().windows(2) to get
77         // overlapping windows of size 2. The above putative code is not possible
78         // because `.windows()` exists on a slice, but VarZeroVec cannot return a slice
79         // because the non-fixed size elements necessitate at least some type
80         // of allocation.
81         {
82             let mut it = str_list.iter();
83             if let Some(mut x) = it.next() {
84                 if x.len() == 1 {
85                     return Err(InvalidStringList::InvalidStringLength(
86                         #[cfg(feature = "alloc")]
87                         x.to_string(),
88                     ));
89                 }
90                 for y in it {
91                     if x.len() == 1 {
92                         return Err(InvalidStringList::InvalidStringLength(
93                             #[cfg(feature = "alloc")]
94                             x.to_string(),
95                         ));
96                     } else if x == y {
97                         return Err(InvalidStringList::StringListNotUnique(
98                             #[cfg(feature = "alloc")]
99                             x.to_string(),
100                         ));
101                     } else if x > y {
102                         return Err(InvalidStringList::StringListNotSorted(
103                             #[cfg(feature = "alloc")]
104                             x.to_string(),
105                             #[cfg(feature = "alloc")]
106                             y.to_string(),
107                         ));
108                     }
109 
110                     // Next window begins. Update `x` here, `y` will be updated in next loop iteration.
111                     x = y;
112                 }
113             }
114         }
115 
116         Ok(CodePointInversionListAndStringList {
117             cp_inv_list,
118             str_list,
119         })
120     }
121 
122     #[doc(hidden)] // databake internal
from_parts_unchecked( cp_inv_list: CodePointInversionList<'data>, str_list: VarZeroVec<'data, str>, ) -> Self123     pub const fn from_parts_unchecked(
124         cp_inv_list: CodePointInversionList<'data>,
125         str_list: VarZeroVec<'data, str>,
126     ) -> Self {
127         CodePointInversionListAndStringList {
128             cp_inv_list,
129             str_list,
130         }
131     }
132 
133     /// Returns the number of elements in this set (its cardinality).
134     /// Note than the elements of a set may include both individual
135     /// codepoints and strings.
size(&self) -> usize136     pub fn size(&self) -> usize {
137         self.cp_inv_list.size() + self.str_list.len()
138     }
139 
140     /// Return true if this set contains multi-code point strings or the empty string.
has_strings(&self) -> bool141     pub fn has_strings(&self) -> bool {
142         !self.str_list.is_empty()
143     }
144 
145     ///
146     /// # Examples
147     /// ```
148     /// use icu::collections::codepointinvlist::CodePointInversionList;
149     /// use icu::collections::codepointinvliststringlist::CodePointInversionListAndStringList;
150     /// use zerovec::VarZeroVec;
151     ///
152     /// let cp_slice = &[0, 0x1_0000, 0x10_FFFF, 0x11_0000];
153     /// let cp_list =
154     ///    CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
155     /// let str_slice = &["", "bmp_max", "unicode_max", "zero"];
156     /// let str_list = VarZeroVec::<str>::from(str_slice);
157     ///
158     /// let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
159     ///
160     /// assert!(cpilsl.contains_str("bmp_max"));
161     /// assert!(cpilsl.contains_str(""));
162     /// assert!(cpilsl.contains_str("A"));
163     /// assert!(cpilsl.contains_str("ቔ"));  // U+1254 ETHIOPIC SYLLABLE QHEE
164     /// assert!(!cpilsl.contains_str("bazinga!"));
165     /// ```
contains_str(&self, s: &str) -> bool166     pub fn contains_str(&self, s: &str) -> bool {
167         let mut chars = s.chars();
168         if let Some(first_char) = chars.next() {
169             if chars.next().is_none() {
170                 return self.contains(first_char);
171             }
172         }
173         self.str_list.binary_search(s).is_ok()
174     }
175 
176     ///
177     /// # Examples
178     /// ```
179     /// use icu::collections::codepointinvlist::CodePointInversionList;
180     /// use icu::collections::codepointinvliststringlist::CodePointInversionListAndStringList;
181     /// use zerovec::VarZeroVec;
182     ///
183     /// let cp_slice = &[0, 0x80, 0xFFFF, 0x1_0000, 0x10_FFFF, 0x11_0000];
184     /// let cp_list =
185     ///     CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
186     /// let str_slice = &["", "ascii_max", "bmp_max", "unicode_max", "zero"];
187     /// let str_list = VarZeroVec::<str>::from(str_slice);
188     ///
189     /// let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
190     ///
191     /// assert!(cpilsl.contains32(0));
192     /// assert!(cpilsl.contains32(0x0042));
193     /// assert!(!cpilsl.contains32(0x0080));
194     /// ```
contains32(&self, cp: u32) -> bool195     pub fn contains32(&self, cp: u32) -> bool {
196         self.cp_inv_list.contains32(cp)
197     }
198 
199     ///
200     /// # Examples
201     /// ```
202     /// use icu::collections::codepointinvlist::CodePointInversionList;
203     /// use icu::collections::codepointinvliststringlist::CodePointInversionListAndStringList;
204     /// use zerovec::VarZeroVec;
205     ///
206     /// let cp_slice = &[0, 0x1_0000, 0x10_FFFF, 0x11_0000];
207     /// let cp_list =
208     ///    CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
209     /// let str_slice = &["", "bmp_max", "unicode_max", "zero"];
210     /// let str_list = VarZeroVec::<str>::from(str_slice);
211     ///
212     /// let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
213     ///
214     /// assert!(cpilsl.contains('A'));
215     /// assert!(cpilsl.contains('ቔ'));  // U+1254 ETHIOPIC SYLLABLE QHEE
216     /// assert!(!cpilsl.contains('\u{1_0000}'));
217     /// assert!(!cpilsl.contains('��'));  // U+1FA2B NEUTRAL CHESS TURNED QUEEN
contains(&self, ch: char) -> bool218     pub fn contains(&self, ch: char) -> bool {
219         self.contains32(ch as u32)
220     }
221 
222     /// Access the underlying [`CodePointInversionList`].
code_points(&self) -> &CodePointInversionList<'data>223     pub fn code_points(&self) -> &CodePointInversionList<'data> {
224         &self.cp_inv_list
225     }
226 
227     /// Access the contained strings.
strings(&self) -> &VarZeroSlice<str>228     pub fn strings(&self) -> &VarZeroSlice<str> {
229         &self.str_list
230     }
231 }
232 
233 #[cfg(feature = "alloc")]
234 impl<'a> FromIterator<&'a str> for CodePointInversionListAndStringList<'_> {
from_iter<I>(it: I) -> Self where I: IntoIterator<Item = &'a str>,235     fn from_iter<I>(it: I) -> Self
236     where
237         I: IntoIterator<Item = &'a str>,
238     {
239         let mut builder = CodePointInversionListBuilder::new();
240         let mut strings = Vec::<&str>::new();
241         for s in it {
242             let mut chars = s.chars();
243             if let Some(first_char) = chars.next() {
244                 if chars.next().is_none() {
245                     builder.add_char(first_char);
246                     continue;
247                 }
248             }
249             strings.push(s);
250         }
251 
252         // Ensure that the string list is sorted. If not, the binary search that
253         // is used for `.contains(&str)` will return garbage output.
254         strings.sort_unstable();
255         strings.dedup();
256 
257         let cp_inv_list = builder.build();
258         let str_list = VarZeroVec::<str>::from(&strings);
259 
260         CodePointInversionListAndStringList {
261             cp_inv_list,
262             str_list,
263         }
264     }
265 }
266 
267 /// Custom Errors for [`CodePointInversionListAndStringList`].
268 #[derive(Display, Debug)]
269 pub enum InvalidStringList {
270     /// A string in the string list had an invalid length
271     #[cfg_attr(feature = "alloc", displaydoc("Invalid string length for string: {0}"))]
272     InvalidStringLength(#[cfg(feature = "alloc")] String),
273     /// A string in the string list appears more than once
274     #[cfg_attr(feature = "alloc", displaydoc("String list has duplicate: {0}"))]
275     StringListNotUnique(#[cfg(feature = "alloc")] String),
276     /// Two strings in the string list compare to each other opposite of sorted order
277     #[cfg_attr(
278         feature = "alloc",
279         displaydoc("Strings in string list not in sorted order: ({0}, {1})")
280     )]
281     StringListNotSorted(
282         #[cfg(feature = "alloc")] String,
283         #[cfg(feature = "alloc")] String,
284     ),
285 }
286 
287 #[cfg(test)]
288 mod tests {
289     use super::*;
290 
291     #[test]
test_size_has_strings()292     fn test_size_has_strings() {
293         let cp_slice = &[0, 1, 0x7F, 0x80, 0xFFFF, 0x1_0000, 0x10_FFFF, 0x11_0000];
294         let cp_list = CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
295         let str_slice = &["ascii_max", "bmp_max", "unicode_max", "zero"];
296         let str_list = VarZeroVec::<str>::from(str_slice);
297 
298         let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
299 
300         assert!(cpilsl.has_strings());
301         assert_eq!(8, cpilsl.size());
302     }
303 
304     #[test]
test_empty_string_allowed()305     fn test_empty_string_allowed() {
306         let cp_slice = &[0, 1, 0x7F, 0x80, 0xFFFF, 0x1_0000, 0x10_FFFF, 0x11_0000];
307         let cp_list = CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
308         let str_slice = &["", "ascii_max", "bmp_max", "unicode_max", "zero"];
309         let str_list = VarZeroVec::<str>::from(str_slice);
310 
311         let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
312 
313         assert!(cpilsl.has_strings());
314         assert_eq!(9, cpilsl.size());
315     }
316 
317     #[test]
test_invalid_string()318     fn test_invalid_string() {
319         let cp_slice = &[0, 1];
320         let cp_list = CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
321         let str_slice = &["a"];
322         let str_list = VarZeroVec::<str>::from(str_slice);
323 
324         let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list);
325 
326         assert!(matches!(
327             cpilsl,
328             Err(InvalidStringList::InvalidStringLength(_))
329         ));
330     }
331 
332     #[test]
test_invalid_string_list_has_duplicate()333     fn test_invalid_string_list_has_duplicate() {
334         let cp_slice = &[0, 1];
335         let cp_list = CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
336         let str_slice = &["abc", "abc"];
337         let str_list = VarZeroVec::<str>::from(str_slice);
338 
339         let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list);
340 
341         assert!(matches!(
342             cpilsl,
343             Err(InvalidStringList::StringListNotUnique(_))
344         ));
345     }
346 
347     #[test]
test_invalid_string_list_not_sorted()348     fn test_invalid_string_list_not_sorted() {
349         let cp_slice = &[0, 1];
350         let cp_list = CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
351         let str_slice = &["xyz", "abc"];
352         let str_list = VarZeroVec::<str>::from(str_slice);
353 
354         let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list);
355 
356         assert!(matches!(
357             cpilsl,
358             Err(InvalidStringList::StringListNotSorted(_, _))
359         ));
360     }
361 
362     #[test]
test_from_iter_invariants()363     fn test_from_iter_invariants() {
364         let in_strs_1 = ["a", "abc", "xyz", "abc"];
365         let in_strs_2 = ["xyz", "abc", "a", "abc"];
366 
367         let cpilsl_1 = CodePointInversionListAndStringList::from_iter(in_strs_1);
368         let cpilsl_2 = CodePointInversionListAndStringList::from_iter(in_strs_2);
369 
370         assert_eq!(cpilsl_1, cpilsl_2);
371 
372         assert!(cpilsl_1.has_strings());
373         assert!(cpilsl_1.contains_str("abc"));
374         assert!(cpilsl_1.contains_str("xyz"));
375         assert!(!cpilsl_1.contains_str("def"));
376 
377         assert_eq!(1, cpilsl_1.cp_inv_list.size());
378         assert!(cpilsl_1.contains('a'));
379         assert!(!cpilsl_1.contains('0'));
380         assert!(!cpilsl_1.contains('q'));
381 
382         assert_eq!(3, cpilsl_1.size());
383     }
384 }
385