1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5 use crate::codepointtrie::error::Error;
6 use crate::codepointtrie::impl_const::*;
7
8 #[cfg(feature = "alloc")]
9 use crate::codepointinvlist::CodePointInversionList;
10 use core::char::CharTryFromError;
11 use core::convert::Infallible;
12 use core::convert::TryFrom;
13 use core::fmt::Display;
14 #[cfg(feature = "alloc")]
15 use core::iter::FromIterator;
16 use core::num::TryFromIntError;
17 use core::ops::RangeInclusive;
18 use yoke::Yokeable;
19 use zerofrom::ZeroFrom;
20 #[cfg(feature = "alloc")]
21 use zerovec::ule::UleError;
22 use zerovec::ZeroVec;
23
24 /// The type of trie represents whether the trie has an optimization that
25 /// would make it smaller or faster.
26 ///
27 /// Regarding performance, a trie being a small or fast type affects the number of array lookups
28 /// needed for code points in the range `[0x1000, 0x10000)`. In this range, `Small` tries use 4 array lookups,
29 /// while `Fast` tries use 2 array lookups.
30 /// Code points before the interval (in `[0, 0x1000)`) will always use 2 array lookups.
31 /// Code points after the interval (in `[0x10000, 0x10FFFF]`) will always use 4 array lookups.
32 ///
33 /// Regarding size, `Fast` type tries are larger than `Small` type tries because the minimum size of
34 /// the index array is larger. The minimum size is the "fast max" limit, which is the limit of the range
35 /// of code points with 2 array lookups.
36 ///
37 /// See the document [Unicode Properties and Code Point Tries in ICU4X](https://github.com/unicode-org/icu4x/blob/main/documents/design/properties_code_point_trie.md).
38 ///
39 /// Also see [`UCPTrieType`](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/ucptrie_8h.html) in ICU4C.
40 #[derive(Clone, Copy, PartialEq, Debug, Eq)]
41 #[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
42 #[cfg_attr(feature = "databake", derive(databake::Bake))]
43 #[cfg_attr(feature = "databake", databake(path = icu_collections::codepointtrie))]
44 pub enum TrieType {
45 /// Represents the "fast" type code point tries for the
46 /// [`TrieType`] trait. The "fast max" limit is set to `0xffff`.
47 Fast = 0,
48 /// Represents the "small" type code point tries for the
49 /// [`TrieType`] trait. The "fast max" limit is set to `0x0fff`.
50 Small = 1,
51 }
52
53 // TrieValue trait
54
55 // AsULE is AsUnalignedLittleEndian, i.e. "allowed in a zerovec"
56
57 /// A trait representing the values stored in the data array of a [`CodePointTrie`].
58 /// This trait is used as a type parameter in constructing a `CodePointTrie`.
59 ///
60 /// This trait can be implemented on anything that can be represented as a u32s worth of data.
61 pub trait TrieValue: Copy + Eq + PartialEq + zerovec::ule::AsULE + 'static {
62 /// Last-resort fallback value to return if we cannot read data from the trie.
63 ///
64 /// In most cases, the error value is read from the last element of the `data` array,
65 /// this value is used for empty codepointtrie arrays
66 /// Error type when converting from a u32 to this `TrieValue`.
67 type TryFromU32Error: Display;
68 /// A parsing function that is primarily motivated by deserialization contexts.
69 /// When the serialization type width is smaller than 32 bits, then it is expected
70 /// that the call site will widen the value to a `u32` first.
try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error>71 fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error>;
72
73 /// A method for converting back to a `u32` that can roundtrip through
74 /// [`Self::try_from_u32()`]. The default implementation of this trait
75 /// method panics in debug mode and returns 0 in release mode.
76 ///
77 /// This method is allowed to have GIGO behavior when fed a value that has
78 /// no corresponding `u32` (since such values cannot be stored in the trie)
to_u32(self) -> u3279 fn to_u32(self) -> u32;
80 }
81
82 macro_rules! impl_primitive_trie_value {
83 ($primitive:ty, $error:ty) => {
84 impl TrieValue for $primitive {
85 type TryFromU32Error = $error;
86 fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
87 Self::try_from(i)
88 }
89
90 fn to_u32(self) -> u32 {
91 // bitcast when the same size, zero-extend/sign-extend
92 // when not the same size
93 self as u32
94 }
95 }
96 };
97 }
98
99 impl_primitive_trie_value!(u8, TryFromIntError);
100 impl_primitive_trie_value!(u16, TryFromIntError);
101 impl_primitive_trie_value!(u32, Infallible);
102 impl_primitive_trie_value!(i8, TryFromIntError);
103 impl_primitive_trie_value!(i16, TryFromIntError);
104 impl_primitive_trie_value!(i32, TryFromIntError);
105 impl_primitive_trie_value!(char, CharTryFromError);
106
107 /// Helper function used by [`get_range`]. Converts occurrences of trie's null
108 /// value into the provided `null_value`.
109 ///
110 /// Note: the ICU version of this helper function uses a `ValueFilter` function
111 /// to apply a transform on a non-null value. But currently, this implementation
112 /// stops short of that functionality, and instead leaves the non-null trie value
113 /// untouched. This is equivalent to having a `ValueFilter` function that is the
114 /// identity function.
maybe_filter_value<T: TrieValue>(value: T, trie_null_value: T, null_value: T) -> T115 fn maybe_filter_value<T: TrieValue>(value: T, trie_null_value: T, null_value: T) -> T {
116 if value == trie_null_value {
117 null_value
118 } else {
119 value
120 }
121 }
122
123 /// This struct represents a de-serialized [`CodePointTrie`] that was exported from
124 /// ICU binary data.
125 ///
126 /// For more information:
127 /// - [ICU Site design doc](http://site.icu-project.org/design/struct/utrie)
128 /// - [ICU User Guide section on Properties lookup](https://unicode-org.github.io/icu/userguide/strings/properties.html#lookup)
129 // serde impls in crate::serde
130 #[derive(Debug, Eq, PartialEq, Yokeable, ZeroFrom)]
131 pub struct CodePointTrie<'trie, T: TrieValue> {
132 pub(crate) header: CodePointTrieHeader,
133 pub(crate) index: ZeroVec<'trie, u16>,
134 pub(crate) data: ZeroVec<'trie, T>,
135 // serde impl skips this field
136 #[zerofrom(clone)] // TrieValue is Copy, this allows us to avoid
137 // a T: ZeroFrom bound
138 pub(crate) error_value: T,
139 }
140
141 /// This struct contains the fixed-length header fields of a [`CodePointTrie`].
142 #[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
143 #[cfg_attr(feature = "databake", derive(databake::Bake))]
144 #[cfg_attr(feature = "databake", databake(path = icu_collections::codepointtrie))]
145 #[derive(Copy, Clone, Debug, Eq, PartialEq, Yokeable, ZeroFrom)]
146 pub struct CodePointTrieHeader {
147 /// The code point of the start of the last range of the trie. A
148 /// range is defined as a partition of the code point space such that the
149 /// value in this trie associated with all code points of the same range is
150 /// the same.
151 ///
152 /// For the property value data for many Unicode properties,
153 /// often times, `high_start` is `U+10000` or lower. In such cases, not
154 /// reserving space in the `index` array for duplicate values is a large
155 /// savings. The "highValue" associated with the `high_start` range is
156 /// stored at the second-to-last position of the `data` array.
157 /// (See `impl_const::HIGH_VALUE_NEG_DATA_OFFSET`.)
158 pub high_start: u32,
159 /// A version of the `high_start` value that is right-shifted 12 spaces,
160 /// but is rounded up to a multiple `0x1000` for easy testing from UTF-8
161 /// lead bytes.
162 pub shifted12_high_start: u16,
163 /// Offset for the null block in the "index-3" table of the `index` array.
164 /// Set to an impossibly high value (e.g., `0xffff`) if there is no
165 /// dedicated index-3 null block.
166 pub index3_null_offset: u16,
167 /// Internal data null block offset, not shifted.
168 /// Set to an impossibly high value (e.g., `0xfffff`) if there is no
169 /// dedicated data null block.
170 pub data_null_offset: u32,
171 /// The value stored in the trie that represents a null value being
172 /// associated to a code point.
173 pub null_value: u32,
174 /// The enum value representing the type of trie, where trie type is as it
175 /// is defined in ICU (ex: Fast, Small).
176 pub trie_type: TrieType,
177 }
178
179 impl TryFrom<u8> for TrieType {
180 type Error = crate::codepointtrie::error::Error;
181
try_from(trie_type_int: u8) -> Result<TrieType, crate::codepointtrie::error::Error>182 fn try_from(trie_type_int: u8) -> Result<TrieType, crate::codepointtrie::error::Error> {
183 match trie_type_int {
184 0 => Ok(TrieType::Fast),
185 1 => Ok(TrieType::Small),
186 _ => Err(crate::codepointtrie::error::Error::FromDeserialized {
187 reason: "Cannot parse value for trie_type",
188 }),
189 }
190 }
191 }
192
193 // Helper macro that turns arithmetic into wrapping-in-release, checked-in-debug arithmetic
194 //
195 // This is rustc's default behavior anyway, however some projects like Android deliberately
196 // enable overflow checks. CodePointTrie::get() is intended to be used in Android bionic which
197 // cares about codesize and we don't want the pile of panicking infrastructure brought in by overflow
198 // checks, so we force wrapping in release.
199 // See #6052
200 macro_rules! w(
201 // Note: these matchers are not perfect since you cannot have an operator after an expr matcher
202 // Use variables if you need complex first operands.
203 ($a:tt + $b:expr) => {
204 {
205 #[allow(unused_parens)]
206 let a = $a;
207 let b = $b;
208 debug_assert!(a.checked_add(b).is_some());
209 $a.wrapping_add($b)
210 }
211 };
212 ($a:tt - $b:expr) => {
213
214 {
215 #[allow(unused_parens)]
216 let a = $a;
217 let b = $b;
218 debug_assert!(a.checked_sub(b).is_some());
219 $a.wrapping_sub($b)
220 }
221 };
222 ($a:tt * $b:expr) => {
223 {
224 #[allow(unused_parens)]
225 let a = $a;
226 let b = $b;
227 debug_assert!(a.checked_mul(b).is_some());
228 $a.wrapping_mul($b)
229 }
230 };
231 );
232
233 impl<'trie, T: TrieValue> CodePointTrie<'trie, T> {
234 #[doc(hidden)] // databake internal
from_parts( header: CodePointTrieHeader, index: ZeroVec<'trie, u16>, data: ZeroVec<'trie, T>, error_value: T, ) -> Self235 pub const fn from_parts(
236 header: CodePointTrieHeader,
237 index: ZeroVec<'trie, u16>,
238 data: ZeroVec<'trie, T>,
239 error_value: T,
240 ) -> Self {
241 Self {
242 header,
243 index,
244 data,
245 error_value,
246 }
247 }
248
249 /// Returns a new [`CodePointTrie`] backed by borrowed data for the `index`
250 /// array and `data` array, whose data values have width `W`.
try_new( header: CodePointTrieHeader, index: ZeroVec<'trie, u16>, data: ZeroVec<'trie, T>, ) -> Result<CodePointTrie<'trie, T>, Error>251 pub fn try_new(
252 header: CodePointTrieHeader,
253 index: ZeroVec<'trie, u16>,
254 data: ZeroVec<'trie, T>,
255 ) -> Result<CodePointTrie<'trie, T>, Error> {
256 // Validation invariants are not needed here when constructing a new
257 // `CodePointTrie` because:
258 //
259 // - Rust includes the size of a slice (or Vec or similar), which allows it
260 // to prevent lookups at out-of-bounds indices, whereas in C++, it is the
261 // programmer's responsibility to keep track of length info.
262 // - For lookups into collections, Rust guarantees that a fallback value will
263 // be returned in the case of `.get()` encountering a lookup error, via
264 // the `Option` type.
265 // - The `ZeroVec` serializer stores the length of the array along with the
266 // ZeroVec data, meaning that a deserializer would also see that length info.
267
268 let error_value = data.last().ok_or(Error::EmptyDataVector)?;
269 let trie: CodePointTrie<'trie, T> = CodePointTrie {
270 header,
271 index,
272 data,
273 error_value,
274 };
275 Ok(trie)
276 }
277
278 /// Returns the position in the data array containing the trie's stored
279 /// error value.
280 #[inline(always)] // `always` based on normalizer benchmarking
trie_error_val_index(&self) -> u32281 fn trie_error_val_index(&self) -> u32 {
282 // We use wrapping_sub here to avoid panicky overflow checks.
283 // len should always be > 1, but if it isn't this will just cause GIGO behavior of producing
284 // None on `.get()`
285 debug_assert!(self.data.len() as u32 >= ERROR_VALUE_NEG_DATA_OFFSET);
286 w!((self.data.len() as u32) - ERROR_VALUE_NEG_DATA_OFFSET)
287 }
288
internal_small_index(&self, code_point: u32) -> u32289 fn internal_small_index(&self, code_point: u32) -> u32 {
290 // We use wrapping arithmetic here to avoid overflow checks making their way into binaries
291 // with overflow checks enabled. Ultimately this code ends up as a checked index, so any
292 // bugs here will cause GIGO
293 let mut index1_pos: u32 = code_point >> SHIFT_1;
294 if self.header.trie_type == TrieType::Fast {
295 debug_assert!(
296 FAST_TYPE_FAST_INDEXING_MAX < code_point && code_point < self.header.high_start
297 );
298 index1_pos = w!(index1_pos + BMP_INDEX_LENGTH - OMITTED_BMP_INDEX_1_LENGTH);
299 } else {
300 assert!(code_point < self.header.high_start && self.header.high_start > SMALL_LIMIT);
301 index1_pos = w!(index1_pos + SMALL_INDEX_LENGTH);
302 }
303 let index1_val = if let Some(index1_val) = self.index.get(index1_pos as usize) {
304 index1_val
305 } else {
306 return self.trie_error_val_index();
307 };
308 let index3_block_idx: u32 =
309 w!((index1_val as u32) + (code_point >> SHIFT_2) & INDEX_2_MASK);
310 let mut index3_block: u32 =
311 if let Some(index3_block) = self.index.get(index3_block_idx as usize) {
312 index3_block as u32
313 } else {
314 return self.trie_error_val_index();
315 };
316 let mut index3_pos: u32 = (code_point >> SHIFT_3) & INDEX_3_MASK;
317 let mut data_block: u32;
318 if index3_block & 0x8000 == 0 {
319 // 16-bit indexes
320 data_block =
321 if let Some(data_block) = self.index.get(w!(index3_block + index3_pos) as usize) {
322 data_block as u32
323 } else {
324 return self.trie_error_val_index();
325 };
326 } else {
327 // 18-bit indexes stored in groups of 9 entries per 8 indexes.
328 index3_block = w!((index3_block & 0x7fff) + w!((index3_pos & !7) + index3_pos >> 3));
329 index3_pos &= 7;
330 data_block = if let Some(data_block) = self.index.get(index3_block as usize) {
331 data_block as u32
332 } else {
333 return self.trie_error_val_index();
334 };
335 data_block = (data_block << w!(2u32 + w!(2u32 * index3_pos))) & 0x30000;
336 index3_block += 1;
337 data_block =
338 if let Some(index3_val) = self.index.get(w!(index3_block + index3_pos) as usize) {
339 data_block | (index3_val as u32)
340 } else {
341 return self.trie_error_val_index();
342 };
343 }
344 // Returns data_pos == data_block (offset) +
345 // portion of code_point bit field for last (4th) lookup
346 w!(data_block + code_point & SMALL_DATA_MASK)
347 }
348
349 /// Returns the position in the `data` array for the given code point,
350 /// where this code point is at or above the fast limit associated for the
351 /// `trie_type`. We will refer to that limit as "`fastMax`" here.
352 ///
353 /// A lookup of the value in the code point trie for a code point in the
354 /// code point space range [`fastMax`, `high_start`) will be a 4-step
355 /// lookup: 3 lookups in the `index` array and one lookup in the `data`
356 /// array. Lookups for code points in the range [`high_start`,
357 /// `CODE_POINT_MAX`] are short-circuited to be a single lookup, see
358 /// [`CodePointTrieHeader::high_start`].
small_index(&self, code_point: u32) -> u32359 fn small_index(&self, code_point: u32) -> u32 {
360 if code_point >= self.header.high_start {
361 w!((self.data.len() as u32) - HIGH_VALUE_NEG_DATA_OFFSET)
362 } else {
363 self.internal_small_index(code_point) // helper fn
364 }
365 }
366
367 /// Returns the position in the `data` array for the given code point,
368 /// where this code point is below the fast limit associated for the
369 /// `trie type`. We will refer to that limit as "`fastMax`" here.
370 ///
371 /// A lookup of the value in the code point trie for a code point in the
372 /// code point space range [0, `fastMax`) will be a 2-step lookup: 1
373 /// lookup in the `index` array and one lookup in the `data` array. By
374 /// design, for trie type `T`, there is an element allocated in the `index`
375 /// array for each block of code points in [0, `fastMax`), which in
376 /// turn guarantees that those code points are represented and only need 1
377 /// lookup.
378 #[inline(always)] // `always` based on normalizer benchmarking
fast_index(&self, code_point: u32) -> u32379 fn fast_index(&self, code_point: u32) -> u32 {
380 let index_array_pos: u32 = code_point >> FAST_TYPE_SHIFT;
381 let index_array_val: u16 =
382 if let Some(index_array_val) = self.index.get(index_array_pos as usize) {
383 index_array_val
384 } else {
385 return self.trie_error_val_index();
386 };
387 let fast_index_val: u32 = w!((index_array_val as u32) + code_point & FAST_TYPE_DATA_MASK);
388 fast_index_val
389 }
390
391 /// Returns the value that is associated with `code_point` in this [`CodePointTrie`].
392 ///
393 /// # Examples
394 ///
395 /// ```
396 /// use icu::collections::codepointtrie::planes;
397 /// let trie = planes::get_planes_trie();
398 ///
399 /// assert_eq!(0, trie.get32(0x41)); // 'A' as u32
400 /// assert_eq!(0, trie.get32(0x13E0)); // 'Ꮰ' as u32
401 /// assert_eq!(1, trie.get32(0x10044)); // '' as u32
402 /// ```
403 #[inline(always)] // `always` based on normalizer benchmarking
get32(&self, code_point: u32) -> T404 pub fn get32(&self, code_point: u32) -> T {
405 // If we cannot read from the data array, then return the sentinel value
406 // self.error_value() for the instance type for T: TrieValue.
407 self.get32_ule(code_point)
408 .map(|t| T::from_unaligned(*t))
409 .unwrap_or(self.error_value)
410 }
411
412 /// Returns the value that is associated with `char` in this [`CodePointTrie`].
413 ///
414 /// # Examples
415 ///
416 /// ```
417 /// use icu::collections::codepointtrie::planes;
418 /// let trie = planes::get_planes_trie();
419 ///
420 /// assert_eq!(0, trie.get('A')); // 'A' as u32
421 /// assert_eq!(0, trie.get('Ꮰ')); // 'Ꮰ' as u32
422 /// assert_eq!(1, trie.get('')); // '' as u32
423 /// ```
424 #[inline(always)]
get(&self, c: char) -> T425 pub fn get(&self, c: char) -> T {
426 self.get32(u32::from(c))
427 }
428
429 /// Returns a reference to the ULE of the value that is associated with `code_point` in this [`CodePointTrie`].
430 ///
431 /// # Examples
432 ///
433 /// ```
434 /// use icu::collections::codepointtrie::planes;
435 /// let trie = planes::get_planes_trie();
436 ///
437 /// assert_eq!(Some(&0), trie.get32_ule(0x41)); // 'A' as u32
438 /// assert_eq!(Some(&0), trie.get32_ule(0x13E0)); // 'Ꮰ' as u32
439 /// assert_eq!(Some(&1), trie.get32_ule(0x10044)); // '' as u32
440 /// ```
441 #[inline(always)] // `always` based on normalizer benchmarking
get32_ule(&self, code_point: u32) -> Option<&T::ULE>442 pub fn get32_ule(&self, code_point: u32) -> Option<&T::ULE> {
443 // All code points up to the fast max limit are represented
444 // individually in the `index` array to hold their `data` array position, and
445 // thus only need 2 lookups for a [CodePointTrie::get()](`crate::codepointtrie::CodePointTrie::get`).
446 // Code points above the "fast max" limit require 4 lookups.
447 let fast_max = match self.header.trie_type {
448 TrieType::Fast => FAST_TYPE_FAST_INDEXING_MAX,
449 TrieType::Small => SMALL_TYPE_FAST_INDEXING_MAX,
450 };
451 let data_pos: u32 = if code_point <= fast_max {
452 Self::fast_index(self, code_point)
453 } else if code_point <= CODE_POINT_MAX {
454 Self::small_index(self, code_point)
455 } else {
456 self.trie_error_val_index()
457 };
458 // Returns the trie value (or trie's error value).
459 self.data.as_ule_slice().get(data_pos as usize)
460 }
461
462 /// Converts the [`CodePointTrie`] into one that returns another type of the same size.
463 ///
464 /// Borrowed data remains borrowed, and owned data remains owned.
465 ///
466 /// If the old and new types are not the same size, use
467 /// [`CodePointTrie::try_alloc_map_value`].
468 ///
469 /// # Panics
470 ///
471 /// Panics if `T` and `P` are different sizes.
472 ///
473 /// More specifically, panics if [`ZeroVec::try_into_converted()`] panics when converting
474 /// `ZeroVec<T>` into `ZeroVec<P>`, which happens if `T::ULE` and `P::ULE` differ in size.
475 ///
476 /// # Examples
477 ///
478 /// ```no_run
479 /// use icu::collections::codepointtrie::planes;
480 /// use icu::collections::codepointtrie::CodePointTrie;
481 ///
482 /// let planes_trie_u8: CodePointTrie<u8> = planes::get_planes_trie();
483 /// let planes_trie_i8: CodePointTrie<i8> =
484 /// planes_trie_u8.try_into_converted().expect("infallible");
485 ///
486 /// assert_eq!(planes_trie_i8.get32(0x30000), 3);
487 /// ```
488 #[cfg(feature = "alloc")]
try_into_converted<P>(self) -> Result<CodePointTrie<'trie, P>, UleError> where P: TrieValue,489 pub fn try_into_converted<P>(self) -> Result<CodePointTrie<'trie, P>, UleError>
490 where
491 P: TrieValue,
492 {
493 let converted_data = self.data.try_into_converted()?;
494 let error_ule = self.error_value.to_unaligned();
495 let slice = &[error_ule];
496 let error_vec = ZeroVec::<T>::new_borrowed(slice);
497 let error_converted = error_vec.try_into_converted::<P>()?;
498 #[allow(clippy::expect_used)] // we know this cannot fail
499 Ok(CodePointTrie {
500 header: self.header,
501 index: self.index,
502 data: converted_data,
503 error_value: error_converted
504 .get(0)
505 .expect("vector known to have one element"),
506 })
507 }
508
509 /// Maps the [`CodePointTrie`] into one that returns a different type.
510 ///
511 /// This function returns owned data.
512 ///
513 /// If the old and new types are the same size, use the more efficient
514 /// [`CodePointTrie::try_into_converted`].
515 ///
516 /// # Examples
517 ///
518 /// ```
519 /// use icu::collections::codepointtrie::planes;
520 /// use icu::collections::codepointtrie::CodePointTrie;
521 ///
522 /// let planes_trie_u8: CodePointTrie<u8> = planes::get_planes_trie();
523 /// let planes_trie_u16: CodePointTrie<u16> = planes_trie_u8
524 /// .try_alloc_map_value(TryFrom::try_from)
525 /// .expect("infallible");
526 ///
527 /// assert_eq!(planes_trie_u16.get32(0x30000), 3);
528 /// ```
529 #[cfg(feature = "alloc")]
try_alloc_map_value<P, E>( &self, mut f: impl FnMut(T) -> Result<P, E>, ) -> Result<CodePointTrie<'trie, P>, E> where P: TrieValue,530 pub fn try_alloc_map_value<P, E>(
531 &self,
532 mut f: impl FnMut(T) -> Result<P, E>,
533 ) -> Result<CodePointTrie<'trie, P>, E>
534 where
535 P: TrieValue,
536 {
537 let error_converted = f(self.error_value)?;
538 let converted_data = self.data.iter().map(f).collect::<Result<ZeroVec<P>, E>>()?;
539 Ok(CodePointTrie {
540 header: self.header,
541 index: self.index.clone(),
542 data: converted_data,
543 error_value: error_converted,
544 })
545 }
546
547 /// Returns a [`CodePointMapRange`] struct which represents a range of code
548 /// points associated with the same trie value. The returned range will be
549 /// the longest stretch of consecutive code points starting at `start` that
550 /// share this value.
551 ///
552 /// This method is designed to use the internal details of
553 /// the structure of [`CodePointTrie`] to be optimally efficient. This will
554 /// outperform a naive approach that just uses [`CodePointTrie::get()`].
555 ///
556 /// This method provides lower-level functionality that can be used in the
557 /// implementation of other methods that are more convenient to the user.
558 /// To obtain an optimal partition of the code point space for
559 /// this trie resulting in the fewest number of ranges, see
560 /// [`CodePointTrie::iter_ranges()`].
561 ///
562 /// # Examples
563 ///
564 /// ```
565 /// use icu::collections::codepointtrie::planes;
566 ///
567 /// let trie = planes::get_planes_trie();
568 ///
569 /// const CODE_POINT_MAX: u32 = 0x10ffff;
570 /// let start = 0x1_0000;
571 /// let exp_end = 0x1_ffff;
572 ///
573 /// let start_val = trie.get32(start);
574 /// assert_eq!(trie.get32(exp_end), start_val);
575 /// assert_ne!(trie.get32(exp_end + 1), start_val);
576 ///
577 /// use icu::collections::codepointtrie::CodePointMapRange;
578 ///
579 /// let cpm_range: CodePointMapRange<u8> = trie.get_range(start).unwrap();
580 ///
581 /// assert_eq!(cpm_range.range.start(), &start);
582 /// assert_eq!(cpm_range.range.end(), &exp_end);
583 /// assert_eq!(cpm_range.value, start_val);
584 ///
585 /// // `start` can be any code point, whether or not it lies on the boundary
586 /// // of a maximally large range that still contains `start`
587 ///
588 /// let submaximal_1_start = start + 0x1234;
589 /// let submaximal_1 = trie.get_range(submaximal_1_start).unwrap();
590 /// assert_eq!(submaximal_1.range.start(), &0x1_1234);
591 /// assert_eq!(submaximal_1.range.end(), &0x1_ffff);
592 /// assert_eq!(submaximal_1.value, start_val);
593 ///
594 /// let submaximal_2_start = start + 0xffff;
595 /// let submaximal_2 = trie.get_range(submaximal_2_start).unwrap();
596 /// assert_eq!(submaximal_2.range.start(), &0x1_ffff);
597 /// assert_eq!(submaximal_2.range.end(), &0x1_ffff);
598 /// assert_eq!(submaximal_2.value, start_val);
599 /// ```
get_range(&self, start: u32) -> Option<CodePointMapRange<T>>600 pub fn get_range(&self, start: u32) -> Option<CodePointMapRange<T>> {
601 // Exit early if the start code point is out of range, or if it is
602 // in the last range of code points in high_start..=CODE_POINT_MAX
603 // (start- and end-inclusive) that all share the same trie value.
604 if CODE_POINT_MAX < start {
605 return None;
606 }
607 if start >= self.header.high_start {
608 let di: usize = self.data.len() - (HIGH_VALUE_NEG_DATA_OFFSET as usize);
609 let value: T = self.data.get(di)?;
610 return Some(CodePointMapRange {
611 range: start..=CODE_POINT_MAX,
612 value,
613 });
614 }
615
616 let null_value: T = T::try_from_u32(self.header.null_value).ok()?;
617
618 let mut prev_i3_block: u32 = u32::MAX; // using u32::MAX (instead of -1 as an i32 in ICU)
619 let mut prev_block: u32 = u32::MAX; // using u32::MAX (instead of -1 as an i32 in ICU)
620 let mut c: u32 = start;
621 let mut trie_value: T = self.error_value();
622 let mut value: T = self.error_value();
623 let mut have_value: bool = false;
624
625 loop {
626 let i3_block: u32;
627 let mut i3: u32;
628 let i3_block_length: u32;
629 let data_block_length: u32;
630
631 // Initialize values before beginning the iteration in the subsequent
632 // `loop` block. In particular, use the "i3*" local variables
633 // (representing the `index` array position's offset + increment
634 // for a 3rd-level trie lookup) to help initialize the data block
635 // variable `block` in the loop for the `data` array.
636 //
637 // When a lookup code point is <= the trie's *_FAST_INDEXING_MAX that
638 // corresponds to its `trie_type`, the lookup only takes 2 steps
639 // (once into the `index`, once into the `data` array); otherwise,
640 // takes 4 steps (3 iterative lookups into the `index`, once more
641 // into the `data` array). So for convenience's sake, when we have the
642 // 2-stage lookup, reuse the "i3*" variable names for the first lookup.
643 if c <= 0xffff
644 && (self.header.trie_type == TrieType::Fast || c <= SMALL_TYPE_FAST_INDEXING_MAX)
645 {
646 i3_block = 0;
647 i3 = c >> FAST_TYPE_SHIFT;
648 i3_block_length = if self.header.trie_type == TrieType::Fast {
649 BMP_INDEX_LENGTH
650 } else {
651 SMALL_INDEX_LENGTH
652 };
653 data_block_length = FAST_TYPE_DATA_BLOCK_LENGTH;
654 } else {
655 // Use the multi-stage index.
656 let mut i1: u32 = c >> SHIFT_1;
657 if self.header.trie_type == TrieType::Fast {
658 debug_assert!(0xffff < c && c < self.header.high_start);
659 i1 = i1 + BMP_INDEX_LENGTH - OMITTED_BMP_INDEX_1_LENGTH;
660 } else {
661 debug_assert!(
662 c < self.header.high_start && self.header.high_start > SMALL_LIMIT
663 );
664 i1 += SMALL_INDEX_LENGTH;
665 }
666 let i2: u16 = self.index.get(i1 as usize)?;
667 let i3_block_idx: u32 = (i2 as u32) + ((c >> SHIFT_2) & INDEX_2_MASK);
668 i3_block = if let Some(i3b) = self.index.get(i3_block_idx as usize) {
669 i3b as u32
670 } else {
671 return None;
672 };
673 if i3_block == prev_i3_block && (c - start) >= CP_PER_INDEX_2_ENTRY {
674 // The index-3 block is the same as the previous one, and filled with value.
675 debug_assert!((c & (CP_PER_INDEX_2_ENTRY - 1)) == 0);
676 c += CP_PER_INDEX_2_ENTRY;
677
678 if c >= self.header.high_start {
679 break;
680 } else {
681 continue;
682 }
683 }
684 prev_i3_block = i3_block;
685 if i3_block == self.header.index3_null_offset as u32 {
686 // This is the index-3 null block.
687 // All of the `data` array blocks pointed to by the values
688 // in this block of the `index` 3rd-stage subarray will
689 // contain this trie's null_value. So if we are in the middle
690 // of a range, end it and return early, otherwise start a new
691 // range of null values.
692 if have_value {
693 if null_value != value {
694 return Some(CodePointMapRange {
695 range: start..=(c - 1),
696 value,
697 });
698 }
699 } else {
700 trie_value = T::try_from_u32(self.header.null_value).ok()?;
701 value = null_value;
702 have_value = true;
703 }
704 prev_block = self.header.data_null_offset;
705 c = (c + CP_PER_INDEX_2_ENTRY) & !(CP_PER_INDEX_2_ENTRY - 1);
706
707 if c >= self.header.high_start {
708 break;
709 } else {
710 continue;
711 }
712 }
713 i3 = (c >> SHIFT_3) & INDEX_3_MASK;
714 i3_block_length = INDEX_3_BLOCK_LENGTH;
715 data_block_length = SMALL_DATA_BLOCK_LENGTH;
716 }
717
718 // Enumerate data blocks for one index-3 block.
719 loop {
720 let mut block: u32;
721 if (i3_block & 0x8000) == 0 {
722 block = if let Some(b) = self.index.get((i3_block + i3) as usize) {
723 b as u32
724 } else {
725 return None;
726 };
727 } else {
728 // 18-bit indexes stored in groups of 9 entries per 8 indexes.
729 let mut group: u32 = (i3_block & 0x7fff) + (i3 & !7) + (i3 >> 3);
730 let gi: u32 = i3 & 7;
731 let gi_val: u32 = if let Some(giv) = self.index.get(group as usize) {
732 giv.into()
733 } else {
734 return None;
735 };
736 block = (gi_val << (2 + (2 * gi))) & 0x30000;
737 group += 1;
738 let ggi_val: u32 = if let Some(ggiv) = self.index.get((group + gi) as usize) {
739 ggiv as u32
740 } else {
741 return None;
742 };
743 block |= ggi_val;
744 }
745
746 // If our previous and current return values of the 3rd-stage `index`
747 // lookup yield the same `data` block offset, and if we already know that
748 // the entire `data` block / subarray starting at that offset stores
749 // `value` and nothing else, then we can extend our range by the length
750 // of a data block and continue.
751 // Otherwise, we have to iterate over the values stored in the
752 // new data block to see if they differ from `value`.
753 if block == prev_block && (c - start) >= data_block_length {
754 // The block is the same as the previous one, and filled with value.
755 debug_assert!((c & (data_block_length - 1)) == 0);
756 c += data_block_length;
757 } else {
758 let data_mask: u32 = data_block_length - 1;
759 prev_block = block;
760 if block == self.header.data_null_offset {
761 // This is the data null block.
762 // If we are in the middle of a range, end it and
763 // return early, otherwise start a new range of null
764 // values.
765 if have_value {
766 if null_value != value {
767 return Some(CodePointMapRange {
768 range: start..=(c - 1),
769 value,
770 });
771 }
772 } else {
773 trie_value = T::try_from_u32(self.header.null_value).ok()?;
774 value = null_value;
775 have_value = true;
776 }
777 c = (c + data_block_length) & !data_mask;
778 } else {
779 let mut di: u32 = block + (c & data_mask);
780 let mut trie_value_2: T = self.data.get(di as usize)?;
781 if have_value {
782 if trie_value_2 != trie_value {
783 if maybe_filter_value(
784 trie_value_2,
785 T::try_from_u32(self.header.null_value).ok()?,
786 null_value,
787 ) != value
788 {
789 return Some(CodePointMapRange {
790 range: start..=(c - 1),
791 value,
792 });
793 }
794 // `trie_value` stores the previous value that was retrieved
795 // from the trie.
796 // `value` stores the value associated for the range (return
797 // value) that we are currently building, which is computed
798 // as a transformation by applying maybe_filter_value()
799 // to the trie value.
800 // The current trie value `trie_value_2` within this data block
801 // differs here from the previous value in `trie_value`.
802 // But both map to `value` after applying `maybe_filter_value`.
803 // It is not clear whether the previous or the current trie value
804 // (or neither) is more likely to match potential subsequent trie
805 // values that would extend the range by mapping to `value`.
806 // On the assumption of locality -- often times consecutive
807 // characters map to the same trie values -- remembering the new
808 // one might make it faster to extend this range further
809 // (by increasing the chance that the next `trie_value_2 !=
810 // trie_value` test will be false).
811 trie_value = trie_value_2; // may or may not help
812 }
813 } else {
814 trie_value = trie_value_2;
815 value = maybe_filter_value(
816 trie_value_2,
817 T::try_from_u32(self.header.null_value).ok()?,
818 null_value,
819 );
820 have_value = true;
821 }
822
823 c += 1;
824 while (c & data_mask) != 0 {
825 di += 1;
826 trie_value_2 = self.data.get(di as usize)?;
827 if trie_value_2 != trie_value {
828 if maybe_filter_value(
829 trie_value_2,
830 T::try_from_u32(self.header.null_value).ok()?,
831 null_value,
832 ) != value
833 {
834 return Some(CodePointMapRange {
835 range: start..=(c - 1),
836 value,
837 });
838 }
839 // `trie_value` stores the previous value that was retrieved
840 // from the trie.
841 // `value` stores the value associated for the range (return
842 // value) that we are currently building, which is computed
843 // as a transformation by applying maybe_filter_value()
844 // to the trie value.
845 // The current trie value `trie_value_2` within this data block
846 // differs here from the previous value in `trie_value`.
847 // But both map to `value` after applying `maybe_filter_value`.
848 // It is not clear whether the previous or the current trie value
849 // (or neither) is more likely to match potential subsequent trie
850 // values that would extend the range by mapping to `value`.
851 // On the assumption of locality -- often times consecutive
852 // characters map to the same trie values -- remembering the new
853 // one might make it faster to extend this range further
854 // (by increasing the chance that the next `trie_value_2 !=
855 // trie_value` test will be false).
856 trie_value = trie_value_2; // may or may not help
857 }
858
859 c += 1;
860 }
861 }
862 }
863
864 i3 += 1;
865 if i3 >= i3_block_length {
866 break;
867 }
868 }
869
870 if c >= self.header.high_start {
871 break;
872 }
873 }
874
875 debug_assert!(have_value);
876
877 // Now that c >= high_start, compare `value` to `high_value` to see
878 // if we can merge our current range with the high_value range
879 // high_start..=CODE_POINT_MAX (start- and end-inclusive), otherwise
880 // stop at high_start - 1.
881 let di: u32 = self.data.len() as u32 - HIGH_VALUE_NEG_DATA_OFFSET;
882 let high_value: T = self.data.get(di as usize)?;
883 if maybe_filter_value(
884 high_value,
885 T::try_from_u32(self.header.null_value).ok()?,
886 null_value,
887 ) != value
888 {
889 c -= 1;
890 } else {
891 c = CODE_POINT_MAX;
892 }
893 Some(CodePointMapRange {
894 range: start..=c,
895 value,
896 })
897 }
898
899 /// Yields an [`Iterator`] returning ranges of consecutive code points that
900 /// share the same value in the [`CodePointTrie`], as given by
901 /// [`CodePointTrie::get_range()`].
902 ///
903 /// # Examples
904 ///
905 /// ```
906 /// use core::ops::RangeInclusive;
907 /// use icu::collections::codepointtrie::planes;
908 /// use icu::collections::codepointtrie::CodePointMapRange;
909 ///
910 /// let planes_trie = planes::get_planes_trie();
911 ///
912 /// let mut ranges = planes_trie.iter_ranges();
913 ///
914 /// for plane in 0..=16 {
915 /// let exp_start = plane * 0x1_0000;
916 /// let exp_end = exp_start + 0xffff;
917 /// assert_eq!(
918 /// ranges.next(),
919 /// Some(CodePointMapRange {
920 /// range: exp_start..=exp_end,
921 /// value: plane as u8
922 /// })
923 /// );
924 /// }
925 ///
926 /// // Hitting the end of the iterator returns `None`, as will subsequent
927 /// // calls to .next().
928 /// assert_eq!(ranges.next(), None);
929 /// assert_eq!(ranges.next(), None);
930 /// ```
iter_ranges(&self) -> CodePointMapRangeIterator<T>931 pub fn iter_ranges(&self) -> CodePointMapRangeIterator<T> {
932 let init_range = Some(CodePointMapRange {
933 range: u32::MAX..=u32::MAX,
934 value: self.error_value(),
935 });
936 CodePointMapRangeIterator::<T> {
937 cpt: self,
938 cpm_range: init_range,
939 }
940 }
941
942 /// Yields an [`Iterator`] returning the ranges of the code points whose values
943 /// match `value` in the [`CodePointTrie`].
944 ///
945 /// # Examples
946 ///
947 /// ```
948 /// use icu::collections::codepointtrie::planes;
949 ///
950 /// let trie = planes::get_planes_trie();
951 ///
952 /// let plane_val = 2;
953 /// let mut sip_range_iter = trie.iter_ranges_for_value(plane_val as u8);
954 ///
955 /// let start = plane_val * 0x1_0000;
956 /// let end = start + 0xffff;
957 ///
958 /// let sip_range = sip_range_iter.next()
959 /// .expect("Plane 2 (SIP) should exist in planes data");
960 /// assert_eq!(start..=end, sip_range);
961 ///
962 /// assert!(sip_range_iter.next().is_none());
iter_ranges_for_value( &self, value: T, ) -> impl Iterator<Item = RangeInclusive<u32>> + '_963 pub fn iter_ranges_for_value(
964 &self,
965 value: T,
966 ) -> impl Iterator<Item = RangeInclusive<u32>> + '_ {
967 self.iter_ranges()
968 .filter(move |cpm_range| cpm_range.value == value)
969 .map(|cpm_range| cpm_range.range)
970 }
971
972 /// Yields an [`Iterator`] returning the ranges of the code points after passing
973 /// the value through a mapping function.
974 ///
975 /// This is preferable to calling `.get_ranges().map()` since it will coalesce
976 /// adjacent ranges into one.
977 ///
978 /// # Examples
979 ///
980 /// ```
981 /// use icu::collections::codepointtrie::planes;
982 ///
983 /// let trie = planes::get_planes_trie();
984 ///
985 /// let plane_val = 2;
986 /// let mut sip_range_iter = trie.iter_ranges_mapped(|value| value != plane_val as u8).filter(|range| range.value);
987 ///
988 /// let end = plane_val * 0x1_0000 - 1;
989 ///
990 /// let sip_range = sip_range_iter.next()
991 /// .expect("Complemented planes data should have at least one entry");
992 /// assert_eq!(0..=end, sip_range.range);
iter_ranges_mapped<'a, U: Eq + 'a>( &'a self, mut map: impl FnMut(T) -> U + Copy + 'a, ) -> impl Iterator<Item = CodePointMapRange<U>> + 'a993 pub fn iter_ranges_mapped<'a, U: Eq + 'a>(
994 &'a self,
995 mut map: impl FnMut(T) -> U + Copy + 'a,
996 ) -> impl Iterator<Item = CodePointMapRange<U>> + 'a {
997 crate::iterator_utils::RangeListIteratorCoalescer::new(self.iter_ranges().map(
998 move |range| CodePointMapRange {
999 range: range.range,
1000 value: map(range.value),
1001 },
1002 ))
1003 }
1004
1005 /// Returns a [`CodePointInversionList`] for the code points that have the given
1006 /// [`TrieValue`] in the trie.
1007 ///
1008 /// # Examples
1009 ///
1010 /// ```
1011 /// use icu::collections::codepointtrie::planes;
1012 ///
1013 /// let trie = planes::get_planes_trie();
1014 ///
1015 /// let plane_val = 2;
1016 /// let sip = trie.get_set_for_value(plane_val as u8);
1017 ///
1018 /// let start = plane_val * 0x1_0000;
1019 /// let end = start + 0xffff;
1020 ///
1021 /// assert!(!sip.contains32(start - 1));
1022 /// assert!(sip.contains32(start));
1023 /// assert!(sip.contains32(end));
1024 /// assert!(!sip.contains32(end + 1));
1025 /// ```
1026 #[cfg(feature = "alloc")]
get_set_for_value(&self, value: T) -> CodePointInversionList<'static>1027 pub fn get_set_for_value(&self, value: T) -> CodePointInversionList<'static> {
1028 let value_ranges = self.iter_ranges_for_value(value);
1029 CodePointInversionList::from_iter(value_ranges)
1030 }
1031
1032 /// Returns the value used as an error value for this trie
1033 #[inline]
error_value(&self) -> T1034 pub fn error_value(&self) -> T {
1035 self.error_value
1036 }
1037 }
1038
1039 #[cfg(feature = "databake")]
1040 impl<T: TrieValue + databake::Bake> databake::Bake for CodePointTrie<'_, T> {
bake(&self, env: &databake::CrateEnv) -> databake::TokenStream1041 fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
1042 let header = self.header.bake(env);
1043 let index = self.index.bake(env);
1044 let data = self.data.bake(env);
1045 let error_value = self.error_value.bake(env);
1046 databake::quote! { icu_collections::codepointtrie::CodePointTrie::from_parts(#header, #index, #data, #error_value) }
1047 }
1048 }
1049
1050 #[cfg(feature = "databake")]
1051 impl<T: TrieValue + databake::Bake> databake::BakeSize for CodePointTrie<'_, T> {
borrows_size(&self) -> usize1052 fn borrows_size(&self) -> usize {
1053 self.header.borrows_size() + self.index.borrows_size() + self.data.borrows_size()
1054 }
1055 }
1056
1057 impl<T: TrieValue + Into<u32>> CodePointTrie<'_, T> {
1058 /// Returns the value that is associated with `code_point` for this [`CodePointTrie`]
1059 /// as a `u32`.
1060 ///
1061 /// # Examples
1062 ///
1063 /// ```
1064 /// use icu::collections::codepointtrie::planes;
1065 /// let trie = planes::get_planes_trie();
1066 ///
1067 /// let cp = '' as u32;
1068 /// assert_eq!(cp, 0x1158E);
1069 ///
1070 /// let plane_num: u8 = trie.get32(cp);
1071 /// assert_eq!(trie.get32_u32(cp), plane_num as u32);
1072 /// ```
1073 // Note: This API method maintains consistency with the corresponding
1074 // original ICU APIs.
get32_u32(&self, code_point: u32) -> u321075 pub fn get32_u32(&self, code_point: u32) -> u32 {
1076 self.get32(code_point).into()
1077 }
1078 }
1079
1080 impl<T: TrieValue> Clone for CodePointTrie<'_, T>
1081 where
1082 <T as zerovec::ule::AsULE>::ULE: Clone,
1083 {
clone(&self) -> Self1084 fn clone(&self) -> Self {
1085 CodePointTrie {
1086 header: self.header,
1087 index: self.index.clone(),
1088 data: self.data.clone(),
1089 error_value: self.error_value,
1090 }
1091 }
1092 }
1093
1094 /// Represents a range of consecutive code points sharing the same value in a
1095 /// code point map.
1096 ///
1097 /// The start and end of the interval is represented as a
1098 /// `RangeInclusive<u32>`, and the value is represented as `T`.
1099 #[derive(PartialEq, Eq, Debug, Clone)]
1100 pub struct CodePointMapRange<T> {
1101 /// Range of code points from start to end (inclusive).
1102 pub range: RangeInclusive<u32>,
1103 /// Trie value associated with this range.
1104 pub value: T,
1105 }
1106
1107 /// A custom [`Iterator`] type specifically for a code point trie that returns
1108 /// [`CodePointMapRange`]s.
1109 pub struct CodePointMapRangeIterator<'a, T: TrieValue> {
1110 cpt: &'a CodePointTrie<'a, T>,
1111 // Initialize `range` to Some(CodePointMapRange{ start: u32::MAX, end: u32::MAX, value: 0}).
1112 // When `range` is Some(...) and has a start value different from u32::MAX, then we have
1113 // returned at least one code point range due to a call to `next()`.
1114 // When `range` == `None`, it means that we have hit the end of iteration. It would occur
1115 // after a call to `next()` returns a None <=> we attempted to call `get_range()`
1116 // with a start code point that is > CODE_POINT_MAX.
1117 cpm_range: Option<CodePointMapRange<T>>,
1118 }
1119
1120 impl<T: TrieValue> Iterator for CodePointMapRangeIterator<'_, T> {
1121 type Item = CodePointMapRange<T>;
1122
next(&mut self) -> Option<Self::Item>1123 fn next(&mut self) -> Option<Self::Item> {
1124 self.cpm_range = match &self.cpm_range {
1125 Some(cpmr) => {
1126 if *cpmr.range.start() == u32::MAX {
1127 self.cpt.get_range(0)
1128 } else {
1129 self.cpt.get_range(cpmr.range.end() + 1)
1130 }
1131 }
1132 None => None,
1133 };
1134 // Note: Clone is cheap. We can't Copy because RangeInclusive does not impl Copy.
1135 self.cpm_range.clone()
1136 }
1137 }
1138
1139 #[cfg(test)]
1140 mod tests {
1141 use super::*;
1142 use crate::codepointtrie::planes;
1143 use alloc::vec::Vec;
1144
1145 #[test]
1146 #[cfg(feature = "serde")]
test_serde_with_postcard_roundtrip() -> Result<(), postcard::Error>1147 fn test_serde_with_postcard_roundtrip() -> Result<(), postcard::Error> {
1148 let trie = crate::codepointtrie::planes::get_planes_trie();
1149 let trie_serialized: Vec<u8> = postcard::to_allocvec(&trie).unwrap();
1150
1151 // Assert an expected (golden data) version of the serialized trie.
1152 const EXP_TRIE_SERIALIZED: &[u8] = &[
1153 128, 128, 64, 128, 2, 2, 0, 0, 1, 160, 18, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1154 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1155 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1156 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1157 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 136,
1158 2, 144, 2, 144, 2, 144, 2, 176, 2, 176, 2, 176, 2, 176, 2, 208, 2, 208, 2, 208, 2, 208,
1159 2, 240, 2, 240, 2, 240, 2, 240, 2, 16, 3, 16, 3, 16, 3, 16, 3, 48, 3, 48, 3, 48, 3, 48,
1160 3, 80, 3, 80, 3, 80, 3, 80, 3, 112, 3, 112, 3, 112, 3, 112, 3, 144, 3, 144, 3, 144, 3,
1161 144, 3, 176, 3, 176, 3, 176, 3, 176, 3, 208, 3, 208, 3, 208, 3, 208, 3, 240, 3, 240, 3,
1162 240, 3, 240, 3, 16, 4, 16, 4, 16, 4, 16, 4, 48, 4, 48, 4, 48, 4, 48, 4, 80, 4, 80, 4,
1163 80, 4, 80, 4, 112, 4, 112, 4, 112, 4, 112, 4, 0, 0, 16, 0, 32, 0, 48, 0, 64, 0, 80, 0,
1164 96, 0, 112, 0, 0, 0, 16, 0, 32, 0, 48, 0, 0, 0, 16, 0, 32, 0, 48, 0, 0, 0, 16, 0, 32,
1165 0, 48, 0, 0, 0, 16, 0, 32, 0, 48, 0, 0, 0, 16, 0, 32, 0, 48, 0, 0, 0, 16, 0, 32, 0, 48,
1166 0, 0, 0, 16, 0, 32, 0, 48, 0, 0, 0, 16, 0, 32, 0, 48, 0, 128, 0, 128, 0, 128, 0, 128,
1167 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128,
1168 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128,
1169 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144,
1170 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144,
1171 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144,
1172 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160,
1173 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160,
1174 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160,
1175 0, 160, 0, 160, 0, 160, 0, 160, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176,
1176 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176,
1177 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176,
1178 0, 176, 0, 176, 0, 176, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192,
1179 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192,
1180 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192,
1181 0, 192, 0, 192, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208,
1182 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208,
1183 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208,
1184 0, 208, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224,
1185 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224,
1186 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224,
1187 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240,
1188 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240,
1189 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 0,
1190 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
1191 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1192 1, 0, 1, 0, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1,
1193 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16,
1194 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 32, 1, 32, 1, 32, 1,
1195 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32,
1196 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1,
1197 32, 1, 32, 1, 32, 1, 32, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48,
1198 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1,
1199 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 64, 1, 64,
1200 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1,
1201 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64,
1202 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1,
1203 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80,
1204 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1,
1205 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96,
1206 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1,
1207 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 128, 0, 136, 0, 136, 0, 136, 0, 136,
1208 0, 136, 0, 136, 0, 136, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0,
1209 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
1210 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0,
1211 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0,
1212 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0,
1213 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0,
1214 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0,
1215 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0,
1216 200, 0, 200, 0, 200, 0, 200, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0,
1217 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0,
1218 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0,
1219 232, 0, 232, 0, 232, 0, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8,
1220 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1,
1221 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40,
1222 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1,
1223 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40,
1224 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1,
1225 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72,
1226 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 104, 1, 104, 1, 104, 1, 104, 1,
1227 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1,
1228 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1,
1229 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1,
1230 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1,
1231 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1,
1232 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1,
1233 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1,
1234 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1,
1235 168, 1, 168, 1, 168, 1, 168, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1,
1236 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1,
1237 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1,
1238 200, 1, 200, 1, 200, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1,
1239 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1,
1240 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1,
1241 232, 1, 232, 1, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2,
1242 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8,
1243 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40,
1244 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2,
1245 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 72,
1246 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2,
1247 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72,
1248 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2,
1249 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2,
1250 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2,
1251 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 244, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1252 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1253 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1254 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1255 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1256 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1257 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
1258 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6,
1259 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8,
1260 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10,
1261 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11,
1262 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
1263 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14,
1264 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1265 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 0,
1266 ];
1267 assert_eq!(trie_serialized, EXP_TRIE_SERIALIZED);
1268
1269 let trie_deserialized = postcard::from_bytes::<CodePointTrie<u8>>(&trie_serialized)?;
1270
1271 assert_eq!(&trie.index, &trie_deserialized.index);
1272 assert_eq!(&trie.data, &trie_deserialized.data);
1273
1274 assert!(!trie_deserialized.index.is_owned());
1275 assert!(!trie_deserialized.data.is_owned());
1276
1277 Ok(())
1278 }
1279
1280 #[test]
test_get_range()1281 fn test_get_range() {
1282 let planes_trie = planes::get_planes_trie();
1283
1284 let first_range: Option<CodePointMapRange<u8>> = planes_trie.get_range(0x0);
1285 assert_eq!(
1286 first_range,
1287 Some(CodePointMapRange {
1288 range: 0x0..=0xffff,
1289 value: 0
1290 })
1291 );
1292
1293 let second_range: Option<CodePointMapRange<u8>> = planes_trie.get_range(0x1_0000);
1294 assert_eq!(
1295 second_range,
1296 Some(CodePointMapRange {
1297 range: 0x10000..=0x1ffff,
1298 value: 1
1299 })
1300 );
1301
1302 let penultimate_range: Option<CodePointMapRange<u8>> = planes_trie.get_range(0xf_0000);
1303 assert_eq!(
1304 penultimate_range,
1305 Some(CodePointMapRange {
1306 range: 0xf_0000..=0xf_ffff,
1307 value: 15
1308 })
1309 );
1310
1311 let last_range: Option<CodePointMapRange<u8>> = planes_trie.get_range(0x10_0000);
1312 assert_eq!(
1313 last_range,
1314 Some(CodePointMapRange {
1315 range: 0x10_0000..=0x10_ffff,
1316 value: 16
1317 })
1318 );
1319 }
1320
1321 #[test]
databake()1322 fn databake() {
1323 databake::test_bake!(
1324 CodePointTrie<'static, u32>,
1325 const,
1326 crate::codepointtrie::CodePointTrie::from_parts(
1327 crate::codepointtrie::CodePointTrieHeader {
1328 high_start: 1u32,
1329 shifted12_high_start: 2u16,
1330 index3_null_offset: 3u16,
1331 data_null_offset: 4u32,
1332 null_value: 5u32,
1333 trie_type: crate::codepointtrie::TrieType::Small,
1334 },
1335 zerovec::ZeroVec::new(),
1336 zerovec::ZeroVec::new(),
1337 0u32,
1338 ),
1339 icu_collections,
1340 [zerovec],
1341 );
1342 }
1343 }
1344