1 // This file is part of ICU4X. For terms of use, please see the file 2 // called LICENSE at the top level of the ICU4X source tree 3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 4 5 #[cfg(feature = "serde")] 6 use alloc::format; 7 #[cfg(feature = "serde")] 8 use alloc::string::String; 9 #[cfg(feature = "alloc")] 10 use alloc::vec::Vec; 11 use core::{char, ops::RangeBounds, ops::RangeInclusive}; 12 use potential_utf::PotentialCodePoint; 13 use yoke::Yokeable; 14 use zerofrom::ZeroFrom; 15 use zerovec::{ule::AsULE, zerovec, ZeroVec}; 16 17 use super::InvalidSetError; 18 use crate::codepointinvlist::utils::{deconstruct_range, is_valid_zv}; 19 20 /// Represents the end code point of the Basic Multilingual Plane range, starting from code point 0, inclusive 21 const BMP_MAX: u32 = 0xFFFF; 22 23 /// Represents the inversion list for a set of all code points in the Basic Multilingual Plane. 24 const BMP_INV_LIST_VEC: ZeroVec<PotentialCodePoint> = zerovec!(PotentialCodePoint; PotentialCodePoint::to_unaligned; [PotentialCodePoint::from_u24(0x0), PotentialCodePoint::from_u24(BMP_MAX + 1)]); 25 26 /// Represents the inversion list for all of the code points in the Unicode range. 27 const ALL_VEC: ZeroVec<PotentialCodePoint> = zerovec!(PotentialCodePoint; PotentialCodePoint::to_unaligned; [PotentialCodePoint::from_u24(0x0), PotentialCodePoint::from_u24((char::MAX as u32) + 1)]); 28 29 /// A membership wrapper for [`CodePointInversionList`]. 30 /// 31 /// Provides exposure to membership functions and constructors from serialized `CodePointSet`s (sets of code points) 32 /// and predefined ranges. 33 #[zerovec::make_varule(CodePointInversionListULE)] 34 #[zerovec::skip_derive(Ord)] 35 #[zerovec::derive(Debug)] 36 #[derive(Debug, Eq, PartialEq, Clone, Yokeable, ZeroFrom)] 37 #[cfg_attr(not(feature = "alloc"), zerovec::skip_derive(ZeroMapKV, ToOwned))] 38 pub struct CodePointInversionList<'data> { 39 // If we wanted to use an array to keep the memory on the stack, there is an unsafe nightly feature 40 // https://doc.rust-lang.org/nightly/core/array/trait.FixedSizeArray.html 41 // Allows for traits of fixed size arrays 42 43 // Implements an [inversion list.](https://en.wikipedia.org/wiki/Inversion_list) 44 inv_list: ZeroVec<'data, PotentialCodePoint>, 45 size: u32, 46 } 47 48 #[cfg(feature = "serde")] 49 impl<'de: 'a, 'a> serde::Deserialize<'de> for CodePointInversionList<'a> { deserialize<D>(deserializer: D) -> Result<Self, D::Error> where D: serde::Deserializer<'de>,50 fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> 51 where 52 D: serde::Deserializer<'de>, 53 { 54 use serde::de::Error; 55 56 let parsed_inv_list = if deserializer.is_human_readable() { 57 let parsed_strings = Vec::<alloc::borrow::Cow<'de, str>>::deserialize(deserializer)?; 58 let mut inv_list = ZeroVec::new_owned(Vec::with_capacity(parsed_strings.len() * 2)); 59 for range in parsed_strings { 60 fn internal(range: &str) -> Option<(u32, u32)> { 61 let (start, range) = UnicodeCodePoint::parse(range)?; 62 if range.is_empty() { 63 return Some((start.0, start.0)); 64 } 65 let (hyphen, range) = UnicodeCodePoint::parse(range)?; 66 if hyphen.0 != '-' as u32 { 67 return None; 68 } 69 let (end, range) = UnicodeCodePoint::parse(range)?; 70 range.is_empty().then_some((start.0, end.0)) 71 } 72 let (start, end) = internal(&range).ok_or_else(|| Error::custom(format!( 73 "Cannot deserialize invalid inversion list for CodePointInversionList: {range:?}" 74 )))?; 75 inv_list.with_mut(|v| { 76 v.push(PotentialCodePoint::from_u24(start).to_unaligned()); 77 v.push(PotentialCodePoint::from_u24(end + 1).to_unaligned()); 78 }); 79 } 80 inv_list 81 } else { 82 ZeroVec::<PotentialCodePoint>::deserialize(deserializer)? 83 }; 84 CodePointInversionList::try_from_inversion_list(parsed_inv_list).map_err(|e| { 85 Error::custom(format!( 86 "Cannot deserialize invalid inversion list for CodePointInversionList: {e:?}" 87 )) 88 }) 89 } 90 } 91 92 #[cfg(feature = "databake")] 93 impl databake::Bake for CodePointInversionList<'_> { bake(&self, env: &databake::CrateEnv) -> databake::TokenStream94 fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream { 95 env.insert("icu_collections"); 96 let inv_list = self.inv_list.bake(env); 97 let size = self.size.bake(env); 98 // Safe because our parts are safe. 99 databake::quote! { unsafe { 100 #[allow(unused_unsafe)] 101 icu_collections::codepointinvlist::CodePointInversionList::from_parts_unchecked(#inv_list, #size) 102 }} 103 } 104 } 105 106 #[cfg(feature = "databake")] 107 impl databake::BakeSize for CodePointInversionList<'_> { borrows_size(&self) -> usize108 fn borrows_size(&self) -> usize { 109 self.inv_list.borrows_size() 110 } 111 } 112 113 #[cfg(feature = "serde")] 114 #[derive(Debug, Copy, Clone)] 115 struct UnicodeCodePoint(u32); 116 117 #[cfg(feature = "serde")] 118 impl UnicodeCodePoint { from_u32(cp: u32) -> Result<Self, String>119 fn from_u32(cp: u32) -> Result<Self, String> { 120 if cp <= char::MAX as u32 { 121 Ok(Self(cp)) 122 } else { 123 Err(format!("Not a Unicode code point {}", cp)) 124 } 125 } 126 parse(value: &str) -> Option<(Self, &str)>127 fn parse(value: &str) -> Option<(Self, &str)> { 128 Some(if let Some(hex) = value.strip_prefix("U+") { 129 let (escape, remainder) = (hex.get(..4)?, hex.get(4..)?); 130 (Self(u32::from_str_radix(escape, 16).ok()?), remainder) 131 } else { 132 let c = value.chars().next()?; 133 (Self(c as u32), value.get(c.len_utf8()..)?) 134 }) 135 } 136 } 137 138 #[cfg(feature = "serde")] 139 impl core::fmt::Display for UnicodeCodePoint { fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result140 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { 141 match self.0 { 142 s @ 0xD800..=0xDFFF => write!(f, "U+{s:X}"), 143 // char should be in range by construction but this code is not so performance-sensitive 144 // so we just use the replacement character 145 c => write!( 146 f, 147 "{}", 148 char::from_u32(c).unwrap_or(char::REPLACEMENT_CHARACTER) 149 ), 150 } 151 } 152 } 153 154 #[cfg(feature = "serde")] 155 impl serde::Serialize for CodePointInversionList<'_> { serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> where S: serde::Serializer,156 fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> 157 where 158 S: serde::Serializer, 159 { 160 if serializer.is_human_readable() { 161 use serde::ser::Error; 162 use serde::ser::SerializeSeq; 163 let mut seq = serializer.serialize_seq(Some(self.inv_list.len() / 2))?; 164 for range in self.iter_ranges() { 165 let start = UnicodeCodePoint::from_u32(*range.start()).map_err(S::Error::custom)?; 166 if range.start() == range.end() { 167 seq.serialize_element(&format!("{start}"))?; 168 } else { 169 let end = UnicodeCodePoint::from_u32(*range.end()).map_err(S::Error::custom)?; 170 seq.serialize_element(&format!("{start}-{end}",))?; 171 } 172 } 173 seq.end() 174 } else { 175 // Note: serde(flatten) currently does not promote a struct field of type Vec 176 // to replace the struct when serializing. The error message from the default 177 // serialization is: "can only flatten structs and maps (got a sequence)". 178 self.inv_list.serialize(serializer) 179 } 180 } 181 } 182 183 impl<'data> CodePointInversionList<'data> { 184 /// Returns a new [`CodePointInversionList`] from an [inversion list](https://en.wikipedia.org/wiki/Inversion_list) 185 /// represented as a [`ZeroVec`]`<`[`PotentialCodePoint`]`>` of code points. 186 /// 187 /// The inversion list must be of even length, sorted ascending non-overlapping, 188 /// and within the bounds of `0x0 -> 0x10FFFF` inclusive, and end points being exclusive. 189 /// 190 /// # Examples 191 /// 192 /// ``` 193 /// use icu::collections::codepointinvlist::CodePointInversionList; 194 /// use icu::collections::codepointinvlist::InvalidSetError; 195 /// use potential_utf::PotentialCodePoint; 196 /// use zerovec::ZeroVec; 197 /// 198 /// let valid = [0x0, 0x10000]; 199 /// let inv_list: ZeroVec<PotentialCodePoint> = valid 200 /// .into_iter() 201 /// .map(PotentialCodePoint::from_u24) 202 /// .collect(); 203 /// let result = CodePointInversionList::try_from_inversion_list(inv_list); 204 /// assert!(matches!(result, CodePointInversionList)); 205 /// 206 /// let invalid = vec![0x0, 0x80, 0x3]; 207 /// let inv_list: ZeroVec<PotentialCodePoint> = invalid 208 /// .iter() 209 /// .copied() 210 /// .map(PotentialCodePoint::from_u24) 211 /// .collect(); 212 /// let result = CodePointInversionList::try_from_inversion_list(inv_list); 213 /// assert!(matches!(result, Err(InvalidSetError(_)))); 214 /// if let Err(InvalidSetError(actual)) = result { 215 /// assert_eq!( 216 /// &invalid, 217 /// &actual.into_iter().map(u32::from).collect::<Vec<_>>() 218 /// ); 219 /// } 220 /// ``` try_from_inversion_list( inv_list: ZeroVec<'data, PotentialCodePoint>, ) -> Result<Self, InvalidSetError>221 pub fn try_from_inversion_list( 222 inv_list: ZeroVec<'data, PotentialCodePoint>, 223 ) -> Result<Self, InvalidSetError> { 224 #[allow(clippy::indexing_slicing)] // chunks 225 if is_valid_zv(&inv_list) { 226 let size = inv_list 227 .as_ule_slice() 228 .chunks(2) 229 .map(|end_points| { 230 u32::from(<PotentialCodePoint as AsULE>::from_unaligned(end_points[1])) 231 - u32::from(<PotentialCodePoint as AsULE>::from_unaligned(end_points[0])) 232 }) 233 .sum::<u32>(); 234 Ok(Self { inv_list, size }) 235 } else { 236 Err(InvalidSetError( 237 #[cfg(feature = "alloc")] 238 inv_list.to_vec(), 239 )) 240 } 241 } 242 243 /// Safety: no actual safety invariants, however has correctness invariants 244 #[doc(hidden)] // databake internal from_parts_unchecked( inv_list: ZeroVec<'data, PotentialCodePoint>, size: u32, ) -> Self245 pub const unsafe fn from_parts_unchecked( 246 inv_list: ZeroVec<'data, PotentialCodePoint>, 247 size: u32, 248 ) -> Self { 249 Self { inv_list, size } 250 } 251 252 /// Returns a new, fully-owned [`CodePointInversionList`] by cloning an [inversion list](https://en.wikipedia.org/wiki/Inversion_list) 253 /// represented as a slice of [`PotentialCodePoint`] code points. 254 /// 255 /// The inversion list must be of even length, sorted ascending non-overlapping, 256 /// and within the bounds of `0x0 -> 0x10FFFF` inclusive, and end points being exclusive. 257 /// 258 /// # Examples 259 /// 260 /// ``` 261 /// use icu::collections::codepointinvlist::CodePointInversionList; 262 /// 263 /// let bmp_list = &[0x0, 0x10000]; 264 /// let smp_list = &[0x10000, 0x20000]; 265 /// let sip_list = &[0x20000, 0x30000]; 266 /// 267 /// let lists: Vec<CodePointInversionList> = 268 /// [&bmp_list[..], smp_list, sip_list] 269 /// .into_iter() 270 /// .map(|l| { 271 /// CodePointInversionList::try_from_u32_inversion_list_slice(l) 272 /// .unwrap() 273 /// }) 274 /// .collect(); 275 /// 276 /// let bmp = &lists[0]; 277 /// assert!(bmp.contains32(0xFFFF)); 278 /// assert!(!bmp.contains32(0x10000)); 279 /// 280 /// assert!(!lists.iter().any(|set| set.contains32(0x40000))); 281 /// ``` 282 #[cfg(feature = "alloc")] try_from_u32_inversion_list_slice(inv_list: &[u32]) -> Result<Self, InvalidSetError>283 pub fn try_from_u32_inversion_list_slice(inv_list: &[u32]) -> Result<Self, InvalidSetError> { 284 let inv_list_zv: ZeroVec<PotentialCodePoint> = inv_list 285 .iter() 286 .copied() 287 .map(PotentialCodePoint::from_u24) 288 .collect(); 289 CodePointInversionList::try_from_inversion_list(inv_list_zv) 290 } 291 292 /// Attempts to convert this list into a fully-owned one. No-op if already fully owned 293 #[cfg(feature = "alloc")] into_owned(self) -> CodePointInversionList<'static>294 pub fn into_owned(self) -> CodePointInversionList<'static> { 295 CodePointInversionList { 296 inv_list: self.inv_list.into_owned(), 297 size: self.size, 298 } 299 } 300 301 /// Returns an owned inversion list representing the current [`CodePointInversionList`] 302 #[cfg(feature = "alloc")] get_inversion_list_vec(&self) -> Vec<u32>303 pub fn get_inversion_list_vec(&self) -> Vec<u32> { 304 self.as_inversion_list().iter().map(u32::from).collect() 305 } 306 307 /// Returns [`CodePointInversionList`] spanning entire Unicode range 308 /// 309 /// The range spans from `0x0 -> 0x10FFFF` inclusive. 310 /// 311 /// # Examples 312 /// 313 /// ``` 314 /// use icu::collections::codepointinvlist::CodePointInversionList; 315 /// 316 /// let expected = [0x0, (char::MAX as u32) + 1]; 317 /// assert_eq!( 318 /// CodePointInversionList::all().get_inversion_list_vec(), 319 /// expected 320 /// ); 321 /// assert_eq!( 322 /// CodePointInversionList::all().size(), 323 /// (expected[1] - expected[0]) as usize 324 /// ); 325 /// ``` all() -> Self326 pub fn all() -> Self { 327 Self { 328 inv_list: ALL_VEC, 329 size: (char::MAX as u32) + 1, 330 } 331 } 332 333 /// Returns [`CodePointInversionList`] spanning BMP range 334 /// 335 /// The range spans from `0x0 -> 0xFFFF` inclusive. 336 /// 337 /// # Examples 338 /// 339 /// ``` 340 /// use icu::collections::codepointinvlist::CodePointInversionList; 341 /// 342 /// const BMP_MAX: u32 = 0xFFFF; 343 /// 344 /// let expected = [0x0, BMP_MAX + 1]; 345 /// assert_eq!( 346 /// CodePointInversionList::bmp().get_inversion_list_vec(), 347 /// expected 348 /// ); 349 /// assert_eq!( 350 /// CodePointInversionList::bmp().size(), 351 /// (expected[1] - expected[0]) as usize 352 /// ); 353 /// ``` bmp() -> Self354 pub fn bmp() -> Self { 355 Self { 356 inv_list: BMP_INV_LIST_VEC, 357 size: BMP_MAX + 1, 358 } 359 } 360 361 /// Returns the inversion list as a slice 362 /// 363 /// Public only to the crate, not exposed to public 364 #[cfg(feature = "alloc")] as_inversion_list(&self) -> &ZeroVec<PotentialCodePoint>365 pub(crate) fn as_inversion_list(&self) -> &ZeroVec<PotentialCodePoint> { 366 &self.inv_list 367 } 368 369 /// Yields an [`Iterator`] going through the character set in the [`CodePointInversionList`] 370 /// 371 /// # Examples 372 /// 373 /// ``` 374 /// use icu::collections::codepointinvlist::CodePointInversionList; 375 /// let example_list = [0x41, 0x44, 0x45, 0x46]; 376 /// let example = CodePointInversionList::try_from_u32_inversion_list_slice( 377 /// &example_list, 378 /// ) 379 /// .unwrap(); 380 /// let mut ex_iter_chars = example.iter_chars(); 381 /// assert_eq!(Some('A'), ex_iter_chars.next()); 382 /// assert_eq!(Some('B'), ex_iter_chars.next()); 383 /// assert_eq!(Some('C'), ex_iter_chars.next()); 384 /// assert_eq!(Some('E'), ex_iter_chars.next()); 385 /// assert_eq!(None, ex_iter_chars.next()); 386 /// ``` iter_chars(&self) -> impl Iterator<Item = char> + '_387 pub fn iter_chars(&self) -> impl Iterator<Item = char> + '_ { 388 #[allow(clippy::indexing_slicing)] // chunks 389 self.inv_list 390 .as_ule_slice() 391 .chunks(2) 392 .flat_map(|pair| { 393 u32::from(PotentialCodePoint::from_unaligned(pair[0])) 394 ..u32::from(PotentialCodePoint::from_unaligned(pair[1])) 395 }) 396 .filter_map(char::from_u32) 397 } 398 399 /// Yields an [`Iterator`] returning the ranges of the code points that are 400 /// included in the [`CodePointInversionList`] 401 /// 402 /// Ranges are returned as [`RangeInclusive`], which is inclusive of its 403 /// `end` bound value. An end-inclusive behavior matches the ICU4C/J 404 /// behavior of ranges, ex: `CodePointInversionList::contains(UChar32 start, UChar32 end)`. 405 /// 406 /// # Example 407 /// 408 /// ``` 409 /// use icu::collections::codepointinvlist::CodePointInversionList; 410 /// let example_list = [0x41, 0x44, 0x45, 0x46]; 411 /// let example = CodePointInversionList::try_from_u32_inversion_list_slice( 412 /// &example_list, 413 /// ) 414 /// .unwrap(); 415 /// let mut example_iter_ranges = example.iter_ranges(); 416 /// assert_eq!(Some(0x41..=0x43), example_iter_ranges.next()); 417 /// assert_eq!(Some(0x45..=0x45), example_iter_ranges.next()); 418 /// assert_eq!(None, example_iter_ranges.next()); 419 /// ``` iter_ranges(&self) -> impl ExactSizeIterator<Item = RangeInclusive<u32>> + '_420 pub fn iter_ranges(&self) -> impl ExactSizeIterator<Item = RangeInclusive<u32>> + '_ { 421 #[allow(clippy::indexing_slicing)] // chunks 422 self.inv_list.as_ule_slice().chunks(2).map(|pair| { 423 let range_start = u32::from(PotentialCodePoint::from_unaligned(pair[0])); 424 let range_limit = u32::from(PotentialCodePoint::from_unaligned(pair[1])); 425 range_start..=(range_limit - 1) 426 }) 427 } 428 429 /// Yields an [`Iterator`] returning the ranges of the code points that are 430 /// *not* included in the [`CodePointInversionList`] 431 /// 432 /// Ranges are returned as [`RangeInclusive`], which is inclusive of its 433 /// `end` bound value. An end-inclusive behavior matches the ICU4C/J 434 /// behavior of ranges, ex: `CodePointInversionList::contains(UChar32 start, UChar32 end)`. 435 /// 436 /// # Example 437 /// 438 /// ``` 439 /// use icu::collections::codepointinvlist::CodePointInversionList; 440 /// let example_list = [0x41, 0x44, 0x45, 0x46]; 441 /// let example = CodePointInversionList::try_from_u32_inversion_list_slice( 442 /// &example_list, 443 /// ) 444 /// .unwrap(); 445 /// let mut example_iter_ranges = example.iter_ranges_complemented(); 446 /// assert_eq!(Some(0..=0x40), example_iter_ranges.next()); 447 /// assert_eq!(Some(0x44..=0x44), example_iter_ranges.next()); 448 /// assert_eq!(Some(0x46..=char::MAX as u32), example_iter_ranges.next()); 449 /// assert_eq!(None, example_iter_ranges.next()); 450 /// ``` iter_ranges_complemented(&self) -> impl Iterator<Item = RangeInclusive<u32>> + '_451 pub fn iter_ranges_complemented(&self) -> impl Iterator<Item = RangeInclusive<u32>> + '_ { 452 let inv_ule = self.inv_list.as_ule_slice(); 453 let middle = inv_ule.get(1..inv_ule.len() - 1).unwrap_or(&[]); 454 let beginning = if let Some(first) = self.inv_list.first() { 455 let first = u32::from(first); 456 if first == 0 { 457 None 458 } else { 459 Some(0..=first - 1) 460 } 461 } else { 462 None 463 }; 464 let end = if let Some(last) = self.inv_list.last() { 465 let last = u32::from(last); 466 if last == char::MAX as u32 { 467 None 468 } else { 469 Some(last..=char::MAX as u32) 470 } 471 } else { 472 None 473 }; 474 #[allow(clippy::indexing_slicing)] // chunks 475 let chunks = middle.chunks(2).map(|pair| { 476 let range_start = u32::from(PotentialCodePoint::from_unaligned(pair[0])); 477 let range_limit = u32::from(PotentialCodePoint::from_unaligned(pair[1])); 478 range_start..=(range_limit - 1) 479 }); 480 beginning.into_iter().chain(chunks).chain(end) 481 } 482 483 /// Returns the number of ranges contained in this [`CodePointInversionList`] get_range_count(&self) -> usize484 pub fn get_range_count(&self) -> usize { 485 self.inv_list.len() / 2 486 } 487 488 /// Returns a specific range contained in this [`CodePointInversionList`] by index. 489 /// Intended for use in FFI. get_nth_range(&self, idx: usize) -> Option<RangeInclusive<u32>>490 pub fn get_nth_range(&self, idx: usize) -> Option<RangeInclusive<u32>> { 491 let start_idx = idx * 2; 492 let end_idx = start_idx + 1; 493 let start = u32::from(self.inv_list.get(start_idx)?); 494 let end = u32::from(self.inv_list.get(end_idx)?); 495 Some(start..=(end - 1)) 496 } 497 498 /// Returns the number of elements of the [`CodePointInversionList`] size(&self) -> usize499 pub fn size(&self) -> usize { 500 if self.is_empty() { 501 return 0; 502 } 503 self.size as usize 504 } 505 506 /// Returns whether or not the [`CodePointInversionList`] is empty is_empty(&self) -> bool507 pub fn is_empty(&self) -> bool { 508 self.inv_list.is_empty() 509 } 510 511 /// Wrapper for contains 512 /// 513 /// Returns an [`Option`] as to whether or not it is possible for the query to be contained. 514 /// The value in the [`Option`] is the start index of the range that contains the query. contains_query(&self, query: u32) -> Option<usize>515 fn contains_query(&self, query: u32) -> Option<usize> { 516 let query = PotentialCodePoint::try_from(query).ok()?; 517 match self.inv_list.binary_search(&query) { 518 Ok(pos) => { 519 if pos % 2 == 0 { 520 Some(pos) 521 } else { 522 None 523 } 524 } 525 Err(pos) => { 526 if pos % 2 != 0 && pos < self.inv_list.len() { 527 Some(pos - 1) 528 } else { 529 None 530 } 531 } 532 } 533 } 534 535 /// Checks to see the query is in the [`CodePointInversionList`] 536 /// 537 /// Runs a binary search in `O(log(n))` where `n` is the number of start and end points 538 /// in the set using [`core`] implementation 539 /// 540 /// # Examples 541 /// 542 /// ``` 543 /// use icu::collections::codepointinvlist::CodePointInversionList; 544 /// let example_list = [0x41, 0x43, 0x44, 0x45]; 545 /// let example = CodePointInversionList::try_from_u32_inversion_list_slice( 546 /// &example_list, 547 /// ) 548 /// .unwrap(); 549 /// assert!(example.contains('A')); 550 /// assert!(!example.contains('C')); 551 /// ``` contains(&self, query: char) -> bool552 pub fn contains(&self, query: char) -> bool { 553 self.contains_query(query as u32).is_some() 554 } 555 556 /// Checks to see the unsigned int is in the [`CodePointInversionList::all()`](CodePointInversionList::all()) 557 /// 558 /// Note: Even though [`u32`] and [`prim@char`] in Rust are non-negative 4-byte 559 /// values, there is an important difference. A [`u32`] can take values up to 560 /// a very large integer value, while a [`prim@char`] in Rust is defined to be in 561 /// the range from 0 to the maximum valid Unicode Scalar Value. 562 /// 563 /// Runs a binary search in `O(log(n))` where `n` is the number of start and end points 564 /// in the set using [`core`] implementation 565 /// 566 /// # Examples 567 /// 568 /// ``` 569 /// use icu::collections::codepointinvlist::CodePointInversionList; 570 /// let example_list = [0x41, 0x43, 0x44, 0x45]; 571 /// let example = CodePointInversionList::try_from_u32_inversion_list_slice( 572 /// &example_list, 573 /// ) 574 /// .unwrap(); 575 /// assert!(example.contains32(0x41)); 576 /// assert!(!example.contains32(0x43)); 577 /// ``` contains32(&self, query: u32) -> bool578 pub fn contains32(&self, query: u32) -> bool { 579 self.contains_query(query).is_some() 580 } 581 582 /// Checks to see if the range is in the [`CodePointInversionList`] 583 /// 584 /// Runs a binary search in `O(log(n))` where `n` is the number of start and end points 585 /// in the set using [`Vec`] implementation. Only runs the search once on the `start` 586 /// parameter, while the `end` parameter is checked in a single `O(1)` step. 587 /// 588 /// # Examples 589 /// 590 /// ``` 591 /// use icu::collections::codepointinvlist::CodePointInversionList; 592 /// let example_list = [0x41, 0x43, 0x44, 0x45]; 593 /// let example = CodePointInversionList::try_from_u32_inversion_list_slice( 594 /// &example_list, 595 /// ) 596 /// .unwrap(); 597 /// assert!(example.contains_range('A'..'C')); 598 /// assert!(example.contains_range('A'..='B')); 599 /// assert!(!example.contains_range('A'..='C')); 600 /// ``` 601 /// 602 /// Surrogate points (`0xD800 -> 0xDFFF`) will return [`false`] if the Range contains them but the 603 /// [`CodePointInversionList`] does not. 604 /// 605 /// Note: when comparing to ICU4C/J, keep in mind that `Range`s in Rust are 606 /// constructed inclusive of start boundary and exclusive of end boundary. 607 /// The ICU4C/J `CodePointInversionList::contains(UChar32 start, UChar32 end)` method 608 /// differs by including the end boundary. 609 /// 610 /// # Examples 611 /// 612 /// ``` 613 /// use icu::collections::codepointinvlist::CodePointInversionList; 614 /// use std::char; 615 /// let check = 616 /// char::from_u32(0xD7FE).unwrap()..char::from_u32(0xE001).unwrap(); 617 /// let example_list = [0xD7FE, 0xD7FF, 0xE000, 0xE001]; 618 /// let example = CodePointInversionList::try_from_u32_inversion_list_slice( 619 /// &example_list, 620 /// ) 621 /// .unwrap(); 622 /// assert!(!example.contains_range(check)); 623 /// ``` contains_range(&self, range: impl RangeBounds<char>) -> bool624 pub fn contains_range(&self, range: impl RangeBounds<char>) -> bool { 625 let (from, till) = deconstruct_range(range); 626 if from >= till { 627 return false; 628 } 629 match self.contains_query(from) { 630 Some(pos) => { 631 if let Some(x) = self.inv_list.get(pos + 1) { 632 (till) <= x.into() 633 } else { 634 debug_assert!( 635 false, 636 "Inversion list query should not return out of bounds index" 637 ); 638 false 639 } 640 } 641 None => false, 642 } 643 } 644 645 /// Check if the calling [`CodePointInversionList`] contains all the characters of the given [`CodePointInversionList`] 646 /// 647 /// # Examples 648 /// 649 /// ``` 650 /// use icu::collections::codepointinvlist::CodePointInversionList; 651 /// let example_list = [0x41, 0x46, 0x55, 0x5B]; // A - E, U - Z 652 /// let example = CodePointInversionList::try_from_u32_inversion_list_slice( 653 /// &example_list, 654 /// ) 655 /// .unwrap(); 656 /// let a_to_d = CodePointInversionList::try_from_u32_inversion_list_slice(&[ 657 /// 0x41, 0x45, 658 /// ]) 659 /// .unwrap(); 660 /// let f_to_t = CodePointInversionList::try_from_u32_inversion_list_slice(&[ 661 /// 0x46, 0x55, 662 /// ]) 663 /// .unwrap(); 664 /// let r_to_x = CodePointInversionList::try_from_u32_inversion_list_slice(&[ 665 /// 0x52, 0x58, 666 /// ]) 667 /// .unwrap(); 668 /// assert!(example.contains_set(&a_to_d)); // contains all 669 /// assert!(!example.contains_set(&f_to_t)); // contains none 670 /// assert!(!example.contains_set(&r_to_x)); // contains some 671 /// ``` contains_set(&self, set: &Self) -> bool672 pub fn contains_set(&self, set: &Self) -> bool { 673 if set.size() > self.size() { 674 return false; 675 } 676 677 let mut set_ranges = set.iter_ranges(); 678 let mut check_elem = set_ranges.next(); 679 680 let ranges = self.iter_ranges(); 681 for range in ranges { 682 match check_elem { 683 Some(ref check_range) => { 684 if check_range.start() >= range.start() 685 && check_range.end() <= &(range.end() + 1) 686 { 687 check_elem = set_ranges.next(); 688 } 689 } 690 _ => break, 691 } 692 } 693 check_elem.is_none() 694 } 695 696 /// Returns the end of the initial substring where the characters are either contained/not contained 697 /// in the set. 698 /// 699 /// # Examples 700 /// 701 /// ``` 702 /// use icu::collections::codepointinvlist::CodePointInversionList; 703 /// let example_list = [0x41, 0x44]; // {A, B, C} 704 /// let example = CodePointInversionList::try_from_u32_inversion_list_slice( 705 /// &example_list, 706 /// ) 707 /// .unwrap(); 708 /// assert_eq!(example.span("CABXYZ", true), 3); 709 /// assert_eq!(example.span("XYZC", false), 3); 710 /// assert_eq!(example.span("XYZ", true), 0); 711 /// assert_eq!(example.span("ABC", false), 0); 712 /// ``` span(&self, span_str: &str, contained: bool) -> usize713 pub fn span(&self, span_str: &str, contained: bool) -> usize { 714 span_str 715 .chars() 716 .take_while(|&x| self.contains(x) == contained) 717 .count() 718 } 719 720 /// Returns the start of the trailing substring (starting from end of string) where the characters are 721 /// either contained/not contained in the set. Returns the length of the string if no valid return. 722 /// 723 /// # Examples 724 /// 725 /// ``` 726 /// use icu::collections::codepointinvlist::CodePointInversionList; 727 /// let example_list = [0x41, 0x44]; // {A, B, C} 728 /// let example = CodePointInversionList::try_from_u32_inversion_list_slice( 729 /// &example_list, 730 /// ) 731 /// .unwrap(); 732 /// assert_eq!(example.span_back("XYZCAB", true), 3); 733 /// assert_eq!(example.span_back("ABCXYZ", true), 6); 734 /// assert_eq!(example.span_back("CABXYZ", false), 3); 735 /// ``` span_back(&self, span_str: &str, contained: bool) -> usize736 pub fn span_back(&self, span_str: &str, contained: bool) -> usize { 737 span_str.len() 738 - span_str 739 .chars() 740 .rev() 741 .take_while(|&x| self.contains(x) == contained) 742 .count() 743 } 744 } 745 746 #[cfg(test)] 747 mod tests { 748 use super::{CodePointInversionList, InvalidSetError}; 749 use std::{char, vec::Vec}; 750 use zerovec::ZeroVec; 751 752 #[test] test_codepointinversionlist_try_from_vec()753 fn test_codepointinversionlist_try_from_vec() { 754 let ex = vec![0x2, 0x3, 0x4, 0x5]; 755 let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap(); 756 assert_eq!(ex, check.get_inversion_list_vec()); 757 assert_eq!(2, check.size()); 758 } 759 760 #[test] test_codepointinversionlist_try_from_vec_error()761 fn test_codepointinversionlist_try_from_vec_error() { 762 let check = vec![0x1, 0x1, 0x2, 0x3, 0x4]; 763 let set = CodePointInversionList::try_from_u32_inversion_list_slice(&check); 764 assert!(matches!(set, Err(InvalidSetError(_)))); 765 if let Err(InvalidSetError(actual)) = set { 766 assert_eq!( 767 &check, 768 &actual.into_iter().map(u32::from).collect::<Vec<_>>() 769 ); 770 } 771 } 772 773 // CodePointInversionList membership functions 774 #[test] test_codepointinversionlist_contains_query()775 fn test_codepointinversionlist_contains_query() { 776 let ex = vec![0x41, 0x46, 0x4B, 0x55]; 777 let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap(); 778 assert!(check.contains_query(0x40).is_none()); 779 assert_eq!(check.contains_query(0x41).unwrap(), 0); 780 assert_eq!(check.contains_query(0x44).unwrap(), 0); 781 assert!(check.contains_query(0x46).is_none()); 782 assert_eq!(check.contains_query(0x4C).unwrap(), 2); 783 assert!(check.contains_query(0x56).is_none()); 784 } 785 786 #[test] test_codepointinversionlist_contains()787 fn test_codepointinversionlist_contains() { 788 let ex = vec![0x2, 0x5, 0xA, 0xF]; 789 let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap(); 790 assert!(check.contains(0x2 as char)); 791 assert!(check.contains(0x4 as char)); 792 assert!(check.contains(0xA as char)); 793 assert!(check.contains(0xE as char)); 794 } 795 796 #[test] test_codepointinversionlist_contains_false()797 fn test_codepointinversionlist_contains_false() { 798 let ex = vec![0x2, 0x5, 0xA, 0xF]; 799 let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap(); 800 assert!(!check.contains(0x1 as char)); 801 assert!(!check.contains(0x5 as char)); 802 assert!(!check.contains(0x9 as char)); 803 assert!(!check.contains(0xF as char)); 804 assert!(!check.contains(0x10 as char)); 805 } 806 807 #[test] test_codepointinversionlist_contains_range()808 fn test_codepointinversionlist_contains_range() { 809 let ex = vec![0x41, 0x46, 0x4B, 0x55]; 810 let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap(); 811 assert!(check.contains_range('A'..='E')); // 65 - 69 812 assert!(check.contains_range('C'..'D')); // 67 - 67 813 assert!(check.contains_range('L'..'P')); // 76 - 80 814 assert!(!check.contains_range('L'..='U')); // 76 - 85 815 } 816 817 #[test] test_codepointinversionlist_contains_range_false()818 fn test_codepointinversionlist_contains_range_false() { 819 let ex = vec![0x41, 0x46, 0x4B, 0x55]; 820 let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap(); 821 assert!(!check.contains_range('!'..'A')); // 33 - 65 822 assert!(!check.contains_range('F'..'K')); // 70 - 74 823 assert!(!check.contains_range('U'..)); // 85 - .. 824 } 825 826 #[test] test_codepointinversionlist_contains_range_invalid()827 fn test_codepointinversionlist_contains_range_invalid() { 828 let check = CodePointInversionList::all(); 829 assert!(!check.contains_range('A'..'!')); // 65 - 33 830 assert!(!check.contains_range('A'..'A')); // 65 - 65 831 } 832 833 #[test] test_codepointinversionlist_contains_set_u()834 fn test_codepointinversionlist_contains_set_u() { 835 let ex = vec![0xA, 0x14, 0x28, 0x32, 0x46, 0x50, 0x64, 0x6E]; 836 let u = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap(); 837 let inside = vec![0xF, 0x14, 0x2C, 0x31, 0x46, 0x50, 0x64, 0x6D]; 838 let s = CodePointInversionList::try_from_u32_inversion_list_slice(&inside).unwrap(); 839 assert!(u.contains_set(&s)); 840 } 841 842 #[test] test_codepointinversionlist_contains_set_u_false()843 fn test_codepointinversionlist_contains_set_u_false() { 844 let ex = vec![0xA, 0x14, 0x28, 0x32, 0x46, 0x50, 0x64, 0x78]; 845 let u = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap(); 846 let outside = vec![0x0, 0xA, 0x16, 0x2C, 0x32, 0x46, 0x4F, 0x51, 0x6D, 0x6F]; 847 let s = CodePointInversionList::try_from_u32_inversion_list_slice(&outside).unwrap(); 848 assert!(!u.contains_set(&s)); 849 } 850 851 #[test] test_codepointinversionlist_size()852 fn test_codepointinversionlist_size() { 853 let ex = vec![0x2, 0x5, 0xA, 0xF]; 854 let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap(); 855 assert_eq!(8, check.size()); 856 let check = CodePointInversionList::all(); 857 let expected = (char::MAX as u32) + 1; 858 assert_eq!(expected as usize, check.size()); 859 let inv_list_vec = vec![]; 860 let check = CodePointInversionList { 861 inv_list: ZeroVec::from_slice_or_alloc(&inv_list_vec), 862 size: 0, 863 }; 864 assert_eq!(check.size(), 0); 865 } 866 867 #[test] test_codepointinversionlist_is_empty()868 fn test_codepointinversionlist_is_empty() { 869 let inv_list_vec = vec![]; 870 let check = CodePointInversionList { 871 inv_list: ZeroVec::from_slice_or_alloc(&inv_list_vec), 872 size: 0, 873 }; 874 assert!(check.is_empty()); 875 } 876 877 #[test] test_codepointinversionlist_is_not_empty()878 fn test_codepointinversionlist_is_not_empty() { 879 let check = CodePointInversionList::all(); 880 assert!(!check.is_empty()); 881 } 882 883 #[test] test_codepointinversionlist_iter_chars()884 fn test_codepointinversionlist_iter_chars() { 885 let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801]; 886 let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap(); 887 let mut iter = check.iter_chars(); 888 assert_eq!(Some('A'), iter.next()); 889 assert_eq!(Some('B'), iter.next()); 890 assert_eq!(Some('C'), iter.next()); 891 assert_eq!(Some('E'), iter.next()); 892 assert_eq!(None, iter.next()); 893 } 894 895 #[test] test_codepointinversionlist_iter_ranges()896 fn test_codepointinversionlist_iter_ranges() { 897 let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801]; 898 let set = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap(); 899 let mut ranges = set.iter_ranges(); 900 assert_eq!(Some(0x41..=0x43), ranges.next()); 901 assert_eq!(Some(0x45..=0x45), ranges.next()); 902 assert_eq!(Some(0xD800..=0xD800), ranges.next()); 903 assert_eq!(None, ranges.next()); 904 } 905 906 #[test] test_codepointinversionlist_iter_ranges_exactsizeiter_trait()907 fn test_codepointinversionlist_iter_ranges_exactsizeiter_trait() { 908 let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801]; 909 let set = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap(); 910 let ranges = set.iter_ranges(); 911 assert_eq!(3, ranges.len()); 912 } 913 914 #[test] test_codepointinversionlist_range_count()915 fn test_codepointinversionlist_range_count() { 916 let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801]; 917 let set = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap(); 918 assert_eq!(3, set.get_range_count()); 919 } 920 921 #[test] test_codepointinversionlist_get_nth_range()922 fn test_codepointinversionlist_get_nth_range() { 923 let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801]; 924 let set = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap(); 925 assert_eq!(Some(0x41..=0x43), set.get_nth_range(0)); 926 assert_eq!(Some(0x45..=0x45), set.get_nth_range(1)); 927 assert_eq!(Some(0xD800..=0xD800), set.get_nth_range(2)); 928 assert_eq!(None, set.get_nth_range(3)); 929 } 930 931 // Range<char> cannot represent the upper bound (non-inclusive) for 932 // char::MAX, whereas Range<u32> can. 933 #[test] test_codepointinversionlist_iter_ranges_with_max_code_point()934 fn test_codepointinversionlist_iter_ranges_with_max_code_point() { 935 let ex = vec![0x80, (char::MAX as u32) + 1]; 936 let set = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap(); 937 let mut ranges = set.iter_ranges(); 938 assert_eq!(Some(0x80..=(char::MAX as u32)), ranges.next()); 939 assert_eq!(None, ranges.next()); 940 } 941 942 #[test] test_codepointinversionlist_span_contains()943 fn test_codepointinversionlist_span_contains() { 944 let ex = vec![0x41, 0x44, 0x46, 0x4B]; // A - D, F - K 945 let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap(); 946 assert_eq!(check.span("ABCDE", true), 3); 947 assert_eq!(check.span("E", true), 0); 948 } 949 950 #[test] test_codepointinversionlist_span_does_not_contain()951 fn test_codepointinversionlist_span_does_not_contain() { 952 let ex = vec![0x41, 0x44, 0x46, 0x4B]; // A - D, F - K 953 let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap(); 954 assert_eq!(check.span("DEF", false), 2); 955 assert_eq!(check.span("KLMA", false), 3); 956 } 957 958 #[test] test_codepointinversionlist_span_back_contains()959 fn test_codepointinversionlist_span_back_contains() { 960 let ex = vec![0x41, 0x44, 0x46, 0x4B]; // A - D, F - K 961 let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap(); 962 assert_eq!(check.span_back("XYZABFH", true), 3); 963 assert_eq!(check.span_back("ABCXYZ", true), 6); 964 } 965 966 #[test] test_codepointinversionlist_span_back_does_not_contain()967 fn test_codepointinversionlist_span_back_does_not_contain() { 968 let ex = vec![0x41, 0x44, 0x46, 0x4B]; // A - D, F - K 969 let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap(); 970 assert_eq!(check.span_back("ABCXYZ", false), 3); 971 assert_eq!(check.span_back("XYZABC", false), 6); 972 } 973 974 #[test] test_uniset_to_inv_list()975 fn test_uniset_to_inv_list() { 976 let inv_list = [ 977 0x9, 0xE, 0x20, 0x21, 0x85, 0x86, 0xA0, 0xA1, 0x1626, 0x1627, 0x2000, 0x2003, 0x2028, 978 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, 979 ]; 980 let s: CodePointInversionList = 981 CodePointInversionList::try_from_u32_inversion_list_slice(&inv_list).unwrap(); 982 let round_trip_inv_list = s.get_inversion_list_vec(); 983 assert_eq!( 984 round_trip_inv_list.into_iter().collect::<ZeroVec<u32>>(), 985 inv_list 986 ); 987 } 988 989 #[test] test_serde_serialize()990 fn test_serde_serialize() { 991 let inv_list = [0x41, 0x46, 0x4B, 0x55]; 992 let uniset = CodePointInversionList::try_from_u32_inversion_list_slice(&inv_list).unwrap(); 993 let json_str = serde_json::to_string(&uniset).unwrap(); 994 assert_eq!(json_str, r#"["A-E","K-T"]"#); 995 } 996 997 #[test] test_serde_serialize_surrogates()998 fn test_serde_serialize_surrogates() { 999 let inv_list = [0xDFAB, 0xDFFF]; 1000 let uniset = CodePointInversionList::try_from_u32_inversion_list_slice(&inv_list).unwrap(); 1001 let json_str = serde_json::to_string(&uniset).unwrap(); 1002 assert_eq!(json_str, r#"["U+DFAB-U+DFFE"]"#); 1003 } 1004 1005 #[test] test_serde_deserialize()1006 fn test_serde_deserialize() { 1007 let inv_list_str = r#"["A-E","K-T"]"#; 1008 let exp_inv_list = [0x41, 0x46, 0x4B, 0x55]; 1009 let exp_uniset = 1010 CodePointInversionList::try_from_u32_inversion_list_slice(&exp_inv_list).unwrap(); 1011 let act_uniset: CodePointInversionList = serde_json::from_str(inv_list_str).unwrap(); 1012 assert_eq!(act_uniset, exp_uniset); 1013 } 1014 1015 #[test] test_serde_deserialize_surrogates()1016 fn test_serde_deserialize_surrogates() { 1017 let inv_list_str = r#"["U+DFAB-U+DFFE"]"#; 1018 let exp_inv_list = [0xDFAB, 0xDFFF]; 1019 let exp_uniset = 1020 CodePointInversionList::try_from_u32_inversion_list_slice(&exp_inv_list).unwrap(); 1021 let act_uniset: CodePointInversionList = serde_json::from_str(inv_list_str).unwrap(); 1022 assert_eq!(act_uniset, exp_uniset); 1023 } 1024 1025 #[test] test_serde_deserialize_invalid()1026 fn test_serde_deserialize_invalid() { 1027 assert!(serde_json::from_str::<CodePointInversionList>("[65,70,98775,85]").is_err()); 1028 assert!(serde_json::from_str::<CodePointInversionList>("[65,70,U+FFFFFFFFFF,85]").is_err()); 1029 } 1030 1031 #[test] test_serde_with_postcard_roundtrip() -> Result<(), postcard::Error>1032 fn test_serde_with_postcard_roundtrip() -> Result<(), postcard::Error> { 1033 let set = CodePointInversionList::bmp(); 1034 let set_serialized: Vec<u8> = postcard::to_allocvec(&set).unwrap(); 1035 let set_deserialized: CodePointInversionList = 1036 postcard::from_bytes::<CodePointInversionList>(&set_serialized)?; 1037 1038 assert_eq!(&set, &set_deserialized); 1039 assert!(!set_deserialized.inv_list.is_owned()); 1040 1041 Ok(()) 1042 } 1043 1044 #[test] databake()1045 fn databake() { 1046 databake::test_bake!( 1047 CodePointInversionList<'static>, 1048 const, 1049 unsafe { 1050 #[allow(unused_unsafe)] 1051 crate::codepointinvlist::CodePointInversionList::from_parts_unchecked( 1052 unsafe { 1053 zerovec::ZeroVec::from_bytes_unchecked( 1054 b"0\0\0\0:\0\0\0A\0\0\0G\0\0\0a\0\0\0g\0\0\0", 1055 ) 1056 }, 1057 22u32, 1058 ) 1059 }, 1060 icu_collections, 1061 [zerovec], 1062 ); 1063 } 1064 } 1065