1 // This file is part of ICU4X. For terms of use, please see the file 2 // called LICENSE at the top level of the ICU4X source tree 3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 4 5 //! This module provides functionality for querying of sets of Unicode code points and strings. 6 //! 7 //! It depends on [`CodePointInversionList`] to efficiently represent Unicode code points, while 8 //! it also maintains a list of strings in the set. 9 //! 10 //! It is an implementation of the existing [ICU4C UnicodeSet API](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1UnicodeSet.html). 11 12 #[cfg(feature = "alloc")] 13 use crate::codepointinvlist::CodePointInversionListBuilder; 14 use crate::codepointinvlist::{CodePointInversionList, CodePointInversionListULE}; 15 #[cfg(feature = "alloc")] 16 use alloc::string::{String, ToString}; 17 #[cfg(feature = "alloc")] 18 use alloc::vec::Vec; 19 use displaydoc::Display; 20 use yoke::Yokeable; 21 use zerofrom::ZeroFrom; 22 use zerovec::{VarZeroSlice, VarZeroVec}; 23 24 /// A data structure providing a concrete implementation of a set of code points and strings, 25 /// using an inversion list for the code points. 26 /// 27 /// This is what ICU4C calls a `UnicodeSet`. 28 #[zerovec::make_varule(CodePointInversionListAndStringListULE)] 29 #[zerovec::skip_derive(Ord)] 30 #[zerovec::derive(Debug)] 31 #[derive(Debug, Eq, PartialEq, Clone, Yokeable, ZeroFrom)] 32 #[cfg_attr(not(feature = "alloc"), zerovec::skip_derive(ZeroMapKV, ToOwned))] 33 // Valid to auto-derive Deserialize because the invariants are weakly held 34 #[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] 35 #[cfg_attr(feature = "serde", zerovec::derive(Serialize, Deserialize, Debug))] 36 pub struct CodePointInversionListAndStringList<'data> { 37 #[cfg_attr(feature = "serde", serde(borrow))] 38 #[zerovec::varule(CodePointInversionListULE)] 39 cp_inv_list: CodePointInversionList<'data>, 40 // Invariants (weakly held): 41 // - no input string is length 1 (a length 1 string should be a single code point) 42 // - the string list is sorted 43 // - the elements in the string list are unique 44 #[cfg_attr(feature = "serde", serde(borrow))] 45 str_list: VarZeroVec<'data, str>, 46 } 47 48 #[cfg(feature = "databake")] 49 impl databake::Bake for CodePointInversionListAndStringList<'_> { bake(&self, env: &databake::CrateEnv) -> databake::TokenStream50 fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream { 51 env.insert("icu_collections"); 52 let cp_inv_list = self.cp_inv_list.bake(env); 53 let str_list = self.str_list.bake(env); 54 // Safe because our parts are safe. 55 databake::quote! { 56 icu_collections::codepointinvliststringlist::CodePointInversionListAndStringList::from_parts_unchecked(#cp_inv_list, #str_list) 57 } 58 } 59 } 60 61 #[cfg(feature = "databake")] 62 impl databake::BakeSize for CodePointInversionListAndStringList<'_> { borrows_size(&self) -> usize63 fn borrows_size(&self) -> usize { 64 self.cp_inv_list.borrows_size() + self.str_list.borrows_size() 65 } 66 } 67 68 impl<'data> CodePointInversionListAndStringList<'data> { 69 /// Returns a new [`CodePointInversionListAndStringList`] from both a [`CodePointInversionList`] for the 70 /// code points and a [`VarZeroVec`]`<`[`str`]`>` of strings. try_from( cp_inv_list: CodePointInversionList<'data>, str_list: VarZeroVec<'data, str>, ) -> Result<Self, InvalidStringList>71 pub fn try_from( 72 cp_inv_list: CodePointInversionList<'data>, 73 str_list: VarZeroVec<'data, str>, 74 ) -> Result<Self, InvalidStringList> { 75 // Verify invariants: 76 // Do so by using the equivalent of str_list.iter().windows(2) to get 77 // overlapping windows of size 2. The above putative code is not possible 78 // because `.windows()` exists on a slice, but VarZeroVec cannot return a slice 79 // because the non-fixed size elements necessitate at least some type 80 // of allocation. 81 { 82 let mut it = str_list.iter(); 83 if let Some(mut x) = it.next() { 84 if x.len() == 1 { 85 return Err(InvalidStringList::InvalidStringLength( 86 #[cfg(feature = "alloc")] 87 x.to_string(), 88 )); 89 } 90 for y in it { 91 if x.len() == 1 { 92 return Err(InvalidStringList::InvalidStringLength( 93 #[cfg(feature = "alloc")] 94 x.to_string(), 95 )); 96 } else if x == y { 97 return Err(InvalidStringList::StringListNotUnique( 98 #[cfg(feature = "alloc")] 99 x.to_string(), 100 )); 101 } else if x > y { 102 return Err(InvalidStringList::StringListNotSorted( 103 #[cfg(feature = "alloc")] 104 x.to_string(), 105 #[cfg(feature = "alloc")] 106 y.to_string(), 107 )); 108 } 109 110 // Next window begins. Update `x` here, `y` will be updated in next loop iteration. 111 x = y; 112 } 113 } 114 } 115 116 Ok(CodePointInversionListAndStringList { 117 cp_inv_list, 118 str_list, 119 }) 120 } 121 122 #[doc(hidden)] // databake internal from_parts_unchecked( cp_inv_list: CodePointInversionList<'data>, str_list: VarZeroVec<'data, str>, ) -> Self123 pub const fn from_parts_unchecked( 124 cp_inv_list: CodePointInversionList<'data>, 125 str_list: VarZeroVec<'data, str>, 126 ) -> Self { 127 CodePointInversionListAndStringList { 128 cp_inv_list, 129 str_list, 130 } 131 } 132 133 /// Returns the number of elements in this set (its cardinality). 134 /// Note than the elements of a set may include both individual 135 /// codepoints and strings. size(&self) -> usize136 pub fn size(&self) -> usize { 137 self.cp_inv_list.size() + self.str_list.len() 138 } 139 140 /// Return true if this set contains multi-code point strings or the empty string. has_strings(&self) -> bool141 pub fn has_strings(&self) -> bool { 142 !self.str_list.is_empty() 143 } 144 145 /// 146 /// # Examples 147 /// ``` 148 /// use icu::collections::codepointinvlist::CodePointInversionList; 149 /// use icu::collections::codepointinvliststringlist::CodePointInversionListAndStringList; 150 /// use zerovec::VarZeroVec; 151 /// 152 /// let cp_slice = &[0, 0x1_0000, 0x10_FFFF, 0x11_0000]; 153 /// let cp_list = 154 /// CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap(); 155 /// let str_slice = &["", "bmp_max", "unicode_max", "zero"]; 156 /// let str_list = VarZeroVec::<str>::from(str_slice); 157 /// 158 /// let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap(); 159 /// 160 /// assert!(cpilsl.contains_str("bmp_max")); 161 /// assert!(cpilsl.contains_str("")); 162 /// assert!(cpilsl.contains_str("A")); 163 /// assert!(cpilsl.contains_str("ቔ")); // U+1254 ETHIOPIC SYLLABLE QHEE 164 /// assert!(!cpilsl.contains_str("bazinga!")); 165 /// ``` contains_str(&self, s: &str) -> bool166 pub fn contains_str(&self, s: &str) -> bool { 167 let mut chars = s.chars(); 168 if let Some(first_char) = chars.next() { 169 if chars.next().is_none() { 170 return self.contains(first_char); 171 } 172 } 173 self.str_list.binary_search(s).is_ok() 174 } 175 176 /// 177 /// # Examples 178 /// ``` 179 /// use icu::collections::codepointinvlist::CodePointInversionList; 180 /// use icu::collections::codepointinvliststringlist::CodePointInversionListAndStringList; 181 /// use zerovec::VarZeroVec; 182 /// 183 /// let cp_slice = &[0, 0x80, 0xFFFF, 0x1_0000, 0x10_FFFF, 0x11_0000]; 184 /// let cp_list = 185 /// CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap(); 186 /// let str_slice = &["", "ascii_max", "bmp_max", "unicode_max", "zero"]; 187 /// let str_list = VarZeroVec::<str>::from(str_slice); 188 /// 189 /// let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap(); 190 /// 191 /// assert!(cpilsl.contains32(0)); 192 /// assert!(cpilsl.contains32(0x0042)); 193 /// assert!(!cpilsl.contains32(0x0080)); 194 /// ``` contains32(&self, cp: u32) -> bool195 pub fn contains32(&self, cp: u32) -> bool { 196 self.cp_inv_list.contains32(cp) 197 } 198 199 /// 200 /// # Examples 201 /// ``` 202 /// use icu::collections::codepointinvlist::CodePointInversionList; 203 /// use icu::collections::codepointinvliststringlist::CodePointInversionListAndStringList; 204 /// use zerovec::VarZeroVec; 205 /// 206 /// let cp_slice = &[0, 0x1_0000, 0x10_FFFF, 0x11_0000]; 207 /// let cp_list = 208 /// CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap(); 209 /// let str_slice = &["", "bmp_max", "unicode_max", "zero"]; 210 /// let str_list = VarZeroVec::<str>::from(str_slice); 211 /// 212 /// let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap(); 213 /// 214 /// assert!(cpilsl.contains('A')); 215 /// assert!(cpilsl.contains('ቔ')); // U+1254 ETHIOPIC SYLLABLE QHEE 216 /// assert!(!cpilsl.contains('\u{1_0000}')); 217 /// assert!(!cpilsl.contains('')); // U+1FA2B NEUTRAL CHESS TURNED QUEEN contains(&self, ch: char) -> bool218 pub fn contains(&self, ch: char) -> bool { 219 self.contains32(ch as u32) 220 } 221 222 /// Access the underlying [`CodePointInversionList`]. code_points(&self) -> &CodePointInversionList<'data>223 pub fn code_points(&self) -> &CodePointInversionList<'data> { 224 &self.cp_inv_list 225 } 226 227 /// Access the contained strings. strings(&self) -> &VarZeroSlice<str>228 pub fn strings(&self) -> &VarZeroSlice<str> { 229 &self.str_list 230 } 231 } 232 233 #[cfg(feature = "alloc")] 234 impl<'a> FromIterator<&'a str> for CodePointInversionListAndStringList<'_> { from_iter<I>(it: I) -> Self where I: IntoIterator<Item = &'a str>,235 fn from_iter<I>(it: I) -> Self 236 where 237 I: IntoIterator<Item = &'a str>, 238 { 239 let mut builder = CodePointInversionListBuilder::new(); 240 let mut strings = Vec::<&str>::new(); 241 for s in it { 242 let mut chars = s.chars(); 243 if let Some(first_char) = chars.next() { 244 if chars.next().is_none() { 245 builder.add_char(first_char); 246 continue; 247 } 248 } 249 strings.push(s); 250 } 251 252 // Ensure that the string list is sorted. If not, the binary search that 253 // is used for `.contains(&str)` will return garbage output. 254 strings.sort_unstable(); 255 strings.dedup(); 256 257 let cp_inv_list = builder.build(); 258 let str_list = VarZeroVec::<str>::from(&strings); 259 260 CodePointInversionListAndStringList { 261 cp_inv_list, 262 str_list, 263 } 264 } 265 } 266 267 /// Custom Errors for [`CodePointInversionListAndStringList`]. 268 #[derive(Display, Debug)] 269 pub enum InvalidStringList { 270 /// A string in the string list had an invalid length 271 #[cfg_attr(feature = "alloc", displaydoc("Invalid string length for string: {0}"))] 272 InvalidStringLength(#[cfg(feature = "alloc")] String), 273 /// A string in the string list appears more than once 274 #[cfg_attr(feature = "alloc", displaydoc("String list has duplicate: {0}"))] 275 StringListNotUnique(#[cfg(feature = "alloc")] String), 276 /// Two strings in the string list compare to each other opposite of sorted order 277 #[cfg_attr( 278 feature = "alloc", 279 displaydoc("Strings in string list not in sorted order: ({0}, {1})") 280 )] 281 StringListNotSorted( 282 #[cfg(feature = "alloc")] String, 283 #[cfg(feature = "alloc")] String, 284 ), 285 } 286 287 #[cfg(test)] 288 mod tests { 289 use super::*; 290 291 #[test] test_size_has_strings()292 fn test_size_has_strings() { 293 let cp_slice = &[0, 1, 0x7F, 0x80, 0xFFFF, 0x1_0000, 0x10_FFFF, 0x11_0000]; 294 let cp_list = CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap(); 295 let str_slice = &["ascii_max", "bmp_max", "unicode_max", "zero"]; 296 let str_list = VarZeroVec::<str>::from(str_slice); 297 298 let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap(); 299 300 assert!(cpilsl.has_strings()); 301 assert_eq!(8, cpilsl.size()); 302 } 303 304 #[test] test_empty_string_allowed()305 fn test_empty_string_allowed() { 306 let cp_slice = &[0, 1, 0x7F, 0x80, 0xFFFF, 0x1_0000, 0x10_FFFF, 0x11_0000]; 307 let cp_list = CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap(); 308 let str_slice = &["", "ascii_max", "bmp_max", "unicode_max", "zero"]; 309 let str_list = VarZeroVec::<str>::from(str_slice); 310 311 let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap(); 312 313 assert!(cpilsl.has_strings()); 314 assert_eq!(9, cpilsl.size()); 315 } 316 317 #[test] test_invalid_string()318 fn test_invalid_string() { 319 let cp_slice = &[0, 1]; 320 let cp_list = CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap(); 321 let str_slice = &["a"]; 322 let str_list = VarZeroVec::<str>::from(str_slice); 323 324 let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list); 325 326 assert!(matches!( 327 cpilsl, 328 Err(InvalidStringList::InvalidStringLength(_)) 329 )); 330 } 331 332 #[test] test_invalid_string_list_has_duplicate()333 fn test_invalid_string_list_has_duplicate() { 334 let cp_slice = &[0, 1]; 335 let cp_list = CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap(); 336 let str_slice = &["abc", "abc"]; 337 let str_list = VarZeroVec::<str>::from(str_slice); 338 339 let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list); 340 341 assert!(matches!( 342 cpilsl, 343 Err(InvalidStringList::StringListNotUnique(_)) 344 )); 345 } 346 347 #[test] test_invalid_string_list_not_sorted()348 fn test_invalid_string_list_not_sorted() { 349 let cp_slice = &[0, 1]; 350 let cp_list = CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap(); 351 let str_slice = &["xyz", "abc"]; 352 let str_list = VarZeroVec::<str>::from(str_slice); 353 354 let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list); 355 356 assert!(matches!( 357 cpilsl, 358 Err(InvalidStringList::StringListNotSorted(_, _)) 359 )); 360 } 361 362 #[test] test_from_iter_invariants()363 fn test_from_iter_invariants() { 364 let in_strs_1 = ["a", "abc", "xyz", "abc"]; 365 let in_strs_2 = ["xyz", "abc", "a", "abc"]; 366 367 let cpilsl_1 = CodePointInversionListAndStringList::from_iter(in_strs_1); 368 let cpilsl_2 = CodePointInversionListAndStringList::from_iter(in_strs_2); 369 370 assert_eq!(cpilsl_1, cpilsl_2); 371 372 assert!(cpilsl_1.has_strings()); 373 assert!(cpilsl_1.contains_str("abc")); 374 assert!(cpilsl_1.contains_str("xyz")); 375 assert!(!cpilsl_1.contains_str("def")); 376 377 assert_eq!(1, cpilsl_1.cp_inv_list.size()); 378 assert!(cpilsl_1.contains('a')); 379 assert!(!cpilsl_1.contains('0')); 380 assert!(!cpilsl_1.contains('q')); 381 382 assert_eq!(3, cpilsl_1.size()); 383 } 384 } 385