• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4 
5 use zerofrom::ZeroFrom;
6 use zerovec::{ZeroSlice, ZeroVec};
7 
8 // Match-node lead unit values, after masking off intermediate-value bits:
9 
10 // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise
11 // the length is one more than the next byte.
12 
13 // For a branch sub-node with at most this many entries, we drop down
14 // to a linear search.
15 const MAX_BRANCH_LINEAR_SUB_NODE_LENGTH: usize = 5;
16 
17 // 0030..003f: Linear-match node, match 1..16 units and continue reading the next node.
18 const MIN_LINEAR_MATCH: u16 = 0x30;
19 const MAX_LINEAR_MATCH_LENGTH: u16 = 0x10;
20 
21 // Match-node lead unit bits 14..6 for the optional intermediate value.
22 // If these bits are 0, then there is no intermediate value.
23 // Otherwise, see the *NodeValue* constants below.
24 const MIN_VALUE_LEAD: u16 = MIN_LINEAR_MATCH + MAX_LINEAR_MATCH_LENGTH; // 0x40
25 const NODE_TYPE_MASK: u16 = MIN_VALUE_LEAD - 1; // 0x003f
26 
27 // A final-value node has bit 15 set.
28 const VALUE_IS_FINAL: u16 = 0x8000;
29 
30 // Compact value: After testing bit 0, shift right by 15 and then use the following thresholds.
31 const MAX_ONE_UNIT_VALUE: u16 = 0x3fff;
32 
33 const MIN_TWO_UNIT_VALUE_LEAD: u16 = MAX_ONE_UNIT_VALUE + 1; // 0x4000
34 
35 const MAX_ONE_UNIT_NODE_VALUE: u16 = 0xff;
36 
37 const MIN_TWO_UNIT_NODE_VALUE_LEAD: u16 = MIN_VALUE_LEAD + ((MAX_ONE_UNIT_NODE_VALUE + 1) << 6); // 0x4040
38 
39 const THREE_UNIT_NODE_VALUE_LEAD: u16 = 0x7fc0;
40 
41 const THREE_UNIT_VALUE_LEAD: u16 = 0x7fff;
42 
43 // Compact delta integers.
44 const MAX_ONE_UNIT_DELTA: u16 = 0xfbff;
45 const MIN_TWO_UNIT_DELTA_LEAD: u16 = MAX_ONE_UNIT_DELTA + 1; // 0xfc00
46 const THREE_UNIT_DELTA_LEAD: u16 = 0xffff;
47 
skip_value(pos: usize, lead: u16) -> usize48 fn skip_value(pos: usize, lead: u16) -> usize {
49     if lead < MIN_TWO_UNIT_VALUE_LEAD {
50         pos
51     } else if lead < THREE_UNIT_VALUE_LEAD {
52         pos + 1
53     } else {
54         pos + 2
55     }
56 }
57 
skip_node_value(pos: usize, lead: u16) -> usize58 fn skip_node_value(pos: usize, lead: u16) -> usize {
59     if lead < MIN_TWO_UNIT_NODE_VALUE_LEAD {
60         pos
61     } else if lead < THREE_UNIT_NODE_VALUE_LEAD {
62         pos + 1
63     } else {
64         pos + 2
65     }
66 }
67 
68 /// This struct represents a de-serialized `Char16Trie` that was exported from
69 /// ICU binary data.
70 ///
71 /// Light-weight, non-const reader class for a `CharsTrie`. Traverses a
72 /// char-serialized data structure with minimal state, for mapping 16-bit-unit
73 /// sequences to non-negative integer values.
74 ///
75 /// For more information:
76 /// - [ICU4C UCharsTrie](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1UCharsTrie.html)
77 /// - [ICU4J CharsTrie](https://unicode-org.github.io/icu-docs/apidoc/released/icu4j/com/ibm/icu/util/CharsTrie.html) API.
78 #[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
79 #[cfg_attr(feature = "databake", derive(databake::Bake))]
80 #[cfg_attr(feature = "databake", databake(path = icu_collections::char16trie))]
81 #[derive(Clone, Debug, PartialEq, Eq, ZeroFrom)]
82 pub struct Char16Trie<'data> {
83     /// An array of u16 containing the trie data.
84     #[cfg_attr(feature = "serde", serde(borrow))]
85     #[doc(hidden)] // #2417
86     pub data: ZeroVec<'data, u16>,
87 }
88 
89 impl<'data> Char16Trie<'data> {
90     /// Returns a new [`Char16Trie`] with ownership of the provided data.
new(data: ZeroVec<'data, u16>) -> Self91     pub fn new(data: ZeroVec<'data, u16>) -> Self {
92         Self { data }
93     }
94 
95     /// Returns a new [`Char16TrieIterator`] backed by borrowed data from the `trie` data
iter(&self) -> Char16TrieIterator96     pub fn iter(&self) -> Char16TrieIterator {
97         Char16TrieIterator::new(&self.data)
98     }
99 }
100 
101 /// This struct represents an iterator over a [`Char16Trie`].
102 #[derive(Clone)]
103 pub struct Char16TrieIterator<'a> {
104     /// A reference to the Char16Trie data to iterate over.
105     trie: &'a ZeroSlice<u16>,
106     /// Index of next trie unit to read, or `None` if there are no more matches.
107     pos: Option<usize>,
108     /// Remaining length of a linear-match node, minus 1, or `None` if not in
109     /// such a node.
110     remaining_match_length: Option<usize>,
111 }
112 
113 /// An enum representing the return value from a lookup in [`Char16Trie`].
114 #[derive(Clone, Copy, Debug, PartialEq)]
115 pub enum TrieResult {
116     /// The input unit(s) did not continue a matching string.
117     /// Once `next()` returns `TrieResult::NoMatch`, all further calls to `next()`
118     /// will also return `TrieResult::NoMatch`.
119     NoMatch,
120     /// The input unit(s) matched a string but there is no value for the string
121     /// so far.  (It is a prefix of a longer string.)
122     NoValue,
123     /// The input unit(s) continued a matching string and there is a value for
124     /// the string so far. No further input byte/unit can continue a matching
125     /// string.
126     FinalValue(i32),
127     /// The input unit(s) continued a matching string and there is a value for
128     /// the string so far.  Another input byte/unit can continue a matching
129     /// string.
130     Intermediate(i32),
131 }
132 
133 // Get the lead surrogate (0xd800..0xdbff) for a
134 // supplementary code point (0x10000..0x10ffff).
135 // @param supplementary 32-bit code point (U+10000..U+10ffff)
136 // @return lead surrogate (U+d800..U+dbff) for supplementary
u16_lead(supplementary: i32) -> u16137 fn u16_lead(supplementary: i32) -> u16 {
138     (((supplementary) >> 10) + 0xd7c0) as u16
139 }
140 
141 // Get the trail surrogate (0xdc00..0xdfff) for a
142 // supplementary code point (0x10000..0x10ffff).
143 // @param supplementary 32-bit code point (U+10000..U+10ffff)
144 // @return trail surrogate (U+dc00..U+dfff) for supplementary
u16_tail(supplementary: i32) -> u16145 fn u16_tail(supplementary: i32) -> u16 {
146     (((supplementary) & 0x3ff) | 0xdc00) as u16
147 }
148 
149 /// A macro that takes an `Option` argument and either unwraps it if it has a value or
150 /// causes the function to return `TrieResult::NoMatch` if there is no value.
151 /// This could perhaps be done with `std::ops::Try` once stabilized.
152 macro_rules! trie_unwrap {
153     ($option:expr) => {
154         match $option {
155             Some(x) => x,
156             None => {
157                 // Unexpected
158                 debug_assert!(false);
159                 return TrieResult::NoMatch;
160             }
161         }
162     };
163 }
164 
165 impl<'a> Char16TrieIterator<'a> {
166     /// Returns a new [`Char16TrieIterator`] backed by borrowed data for the `trie` array
new(trie: &'a ZeroSlice<u16>) -> Self167     pub fn new(trie: &'a ZeroSlice<u16>) -> Self {
168         Self {
169             trie,
170             pos: Some(0),
171             remaining_match_length: None,
172         }
173     }
174 
175     /// Traverses the trie from the current state for this input char.
176     ///
177     /// # Examples
178     ///
179     /// ```
180     /// use icu::collections::char16trie::{Char16Trie, TrieResult};
181     /// use zerovec::ZeroVec;
182     ///
183     /// // A Char16Trie containing the ASCII characters 'a' and 'b'.
184     /// let trie_data = [48, 97, 176, 98, 32868];
185     /// let trie = Char16Trie::new(ZeroVec::from_slice_or_alloc(&trie_data));
186     ///
187     /// let mut iter = trie.iter();
188     /// let res = iter.next('a');
189     /// assert_eq!(res, TrieResult::Intermediate(1));
190     /// let res = iter.next('b');
191     /// assert_eq!(res, TrieResult::FinalValue(100));
192     /// let res = iter.next('c');
193     /// assert_eq!(res, TrieResult::NoMatch);
194     /// ```
next(&mut self, c: char) -> TrieResult195     pub fn next(&mut self, c: char) -> TrieResult {
196         if (c as u32) <= 0xffff {
197             self.next16(c as u16)
198         } else {
199             match self.next16(u16_lead(c as i32)) {
200                 TrieResult::NoValue | TrieResult::Intermediate(_) => {
201                     self.next16(u16_tail(c as i32))
202                 }
203                 _ => TrieResult::NoMatch,
204             }
205         }
206     }
207 
208     /// Traverses the trie from the current state for this input char.
209     ///
210     /// # Examples
211     ///
212     /// ```
213     /// use icu::collections::char16trie::{Char16Trie, TrieResult};
214     /// use zerovec::ZeroVec;
215     ///
216     /// // A Char16Trie containing the ASCII characters 'a' and 'b'.
217     /// let trie_data = [48, 97, 176, 98, 32868];
218     /// let trie = Char16Trie::new(ZeroVec::from_slice_or_alloc(&trie_data));
219     ///
220     /// let mut iter = trie.iter();
221     /// let res = iter.next('a');
222     /// assert_eq!(res, TrieResult::Intermediate(1));
223     /// let res = iter.next('b');
224     /// assert_eq!(res, TrieResult::FinalValue(100));
225     /// let res = iter.next('c');
226     /// assert_eq!(res, TrieResult::NoMatch);
227     /// ```
next32(&mut self, c: u32) -> TrieResult228     pub fn next32(&mut self, c: u32) -> TrieResult {
229         if c <= 0xffff {
230             self.next16(c as u16)
231         } else {
232             match self.next16(u16_lead(c as i32)) {
233                 TrieResult::NoValue | TrieResult::Intermediate(_) => {
234                     self.next16(u16_tail(c as i32))
235                 }
236                 _ => TrieResult::NoMatch,
237             }
238         }
239     }
240 
241     /// Traverses the trie from the current state for this input char.
242     ///
243     /// # Examples
244     ///
245     /// ```
246     /// use icu::collections::char16trie::{Char16Trie, TrieResult};
247     /// use zerovec::ZeroVec;
248     ///
249     /// // A Char16Trie containing the ASCII characters 'a' and 'b'.
250     /// let trie_data = [48, 97, 176, 98, 32868];
251     /// let trie = Char16Trie::new(ZeroVec::from_slice_or_alloc(&trie_data));
252     ///
253     /// let mut iter = trie.iter();
254     /// let res = iter.next16('a' as u16);
255     /// assert_eq!(res, TrieResult::Intermediate(1));
256     /// let res = iter.next16('b' as u16);
257     /// assert_eq!(res, TrieResult::FinalValue(100));
258     /// let res = iter.next16('c' as u16);
259     /// assert_eq!(res, TrieResult::NoMatch);
260     /// ```
next16(&mut self, c: u16) -> TrieResult261     pub fn next16(&mut self, c: u16) -> TrieResult {
262         let mut pos = match self.pos {
263             Some(p) => p,
264             None => return TrieResult::NoMatch,
265         };
266         if let Some(length) = self.remaining_match_length {
267             // Remaining part of a linear-match node
268             if c == trie_unwrap!(self.trie.get(pos)) {
269                 pos += 1;
270                 self.pos = Some(pos);
271                 if length == 0 {
272                     self.remaining_match_length = None;
273                     let node = trie_unwrap!(self.trie.get(pos));
274                     if node >= MIN_VALUE_LEAD {
275                         return self.value_result(pos);
276                     }
277                 } else {
278                     self.remaining_match_length = Some(length - 1);
279                 }
280                 return TrieResult::NoValue;
281             }
282             self.stop();
283             TrieResult::NoMatch
284         } else {
285             self.next_impl(pos, c)
286         }
287     }
288 
branch_next(&mut self, pos: usize, length: usize, in_unit: u16) -> TrieResult289     fn branch_next(&mut self, pos: usize, length: usize, in_unit: u16) -> TrieResult {
290         let mut pos = pos;
291         let mut length = length;
292         if length == 0 {
293             length = trie_unwrap!(self.trie.get(pos)) as usize;
294             pos += 1;
295         }
296         length += 1;
297 
298         // The length of the branch is the number of units to select from.
299         // The data structure encodes a binary search.
300         while length > MAX_BRANCH_LINEAR_SUB_NODE_LENGTH {
301             if in_unit < trie_unwrap!(self.trie.get(pos)) {
302                 length >>= 1;
303                 pos = trie_unwrap!(self.jump_by_delta(pos + 1));
304             } else {
305                 length = length - (length >> 1);
306                 pos = trie_unwrap!(self.skip_delta(pos + 1));
307             }
308         }
309         // Drop down to linear search for the last few bytes.
310         // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
311         // and divides length by 2.
312         loop {
313             if in_unit == trie_unwrap!(self.trie.get(pos)) {
314                 pos += 1;
315                 let mut node = trie_unwrap!(self.trie.get(pos));
316                 if node & VALUE_IS_FINAL != 0 {
317                     self.pos = Some(pos);
318                     return self.value_result(pos);
319                 }
320                 // Use the non-final value as the jump delta.
321                 pos += 1;
322 
323                 if node < MIN_TWO_UNIT_VALUE_LEAD {
324                     pos += node as usize;
325                 } else if node < THREE_UNIT_VALUE_LEAD {
326                     pos += (((node - MIN_TWO_UNIT_VALUE_LEAD) as u32) << 16) as usize
327                         | trie_unwrap!(self.trie.get(pos)) as usize;
328                     pos += 1;
329                 } else {
330                     pos += ((trie_unwrap!(self.trie.get(pos)) as usize) << 16)
331                         | trie_unwrap!(self.trie.get(pos + 1)) as usize;
332                     pos += 2;
333                 }
334                 node = trie_unwrap!(self.trie.get(pos));
335                 self.pos = Some(pos);
336 
337                 if node >= MIN_VALUE_LEAD {
338                     return self.value_result(pos);
339                 }
340                 return TrieResult::NoValue;
341             }
342             length -= 1;
343             pos = trie_unwrap!(self.skip_value(pos + 1));
344             if length <= 1 {
345                 break;
346             }
347         }
348 
349         if in_unit == trie_unwrap!(self.trie.get(pos)) {
350             pos += 1;
351             self.pos = Some(pos);
352             let node = trie_unwrap!(self.trie.get(pos));
353             if node >= MIN_VALUE_LEAD {
354                 return self.value_result(pos);
355             }
356             TrieResult::NoValue
357         } else {
358             self.stop();
359             TrieResult::NoMatch
360         }
361     }
362 
next_impl(&mut self, pos: usize, in_unit: u16) -> TrieResult363     fn next_impl(&mut self, pos: usize, in_unit: u16) -> TrieResult {
364         let mut node = trie_unwrap!(self.trie.get(pos));
365         let mut pos = pos + 1;
366         loop {
367             if node < MIN_LINEAR_MATCH {
368                 return self.branch_next(pos, node as usize, in_unit);
369             } else if node < MIN_VALUE_LEAD {
370                 // Match the first of length+1 units.
371                 let length = node - MIN_LINEAR_MATCH;
372                 if in_unit == trie_unwrap!(self.trie.get(pos)) {
373                     pos += 1;
374                     if length == 0 {
375                         self.remaining_match_length = None;
376                         self.pos = Some(pos);
377                         node = trie_unwrap!(self.trie.get(pos));
378                         if node >= MIN_VALUE_LEAD {
379                             return self.value_result(pos);
380                         }
381                         return TrieResult::NoValue;
382                     }
383                     self.remaining_match_length = Some(length as usize - 1);
384                     self.pos = Some(pos);
385                     return TrieResult::NoValue;
386                 }
387                 // No match
388                 break;
389             } else if (node & VALUE_IS_FINAL) != 0 {
390                 // No further matching units.
391                 break;
392             } else {
393                 // Skip intermediate value.
394                 pos = skip_node_value(pos, node);
395                 node &= NODE_TYPE_MASK;
396             }
397         }
398         self.stop();
399         TrieResult::NoMatch
400     }
401 
stop(&mut self)402     fn stop(&mut self) {
403         self.pos = None;
404     }
405 
406     #[inline(always)] // 1 call site and we want the Option to go away
jump_by_delta(&self, pos: usize) -> Option<usize>407     fn jump_by_delta(&self, pos: usize) -> Option<usize> {
408         let delta = self.trie.get(pos)?;
409         let v = if delta < MIN_TWO_UNIT_DELTA_LEAD {
410             // nothing to do
411             pos + 1 + delta as usize
412         } else if delta == THREE_UNIT_DELTA_LEAD {
413             let delta =
414                 ((self.trie.get(pos + 1)? as usize) << 16) | (self.trie.get(pos + 2)? as usize);
415             pos + delta + 3
416         } else {
417             let delta = (((delta - MIN_TWO_UNIT_DELTA_LEAD) as usize) << 16)
418                 | (self.trie.get(pos + 1)? as usize);
419             pos + delta + 2
420         };
421         Some(v)
422     }
423 
424     #[inline(always)] // 1 call site and we want the Option to go away
skip_value(&self, pos: usize) -> Option<usize>425     fn skip_value(&self, pos: usize) -> Option<usize> {
426         let lead_unit = self.trie.get(pos)?;
427         Some(skip_value(pos + 1, lead_unit & 0x7fff))
428     }
429 
430     #[inline(always)] // 1 call site and we want the Option to go away
skip_delta(&self, pos: usize) -> Option<usize>431     fn skip_delta(&self, pos: usize) -> Option<usize> {
432         let delta = self.trie.get(pos)?;
433         let v = if delta < MIN_TWO_UNIT_DELTA_LEAD {
434             pos + 1
435         } else if delta == THREE_UNIT_DELTA_LEAD {
436             pos + 3
437         } else {
438             pos + 2
439         };
440         Some(v)
441     }
442 
value_result(&self, pos: usize) -> TrieResult443     fn value_result(&self, pos: usize) -> TrieResult {
444         match self.get_value(pos) {
445             Some(result) => result,
446             None => {
447                 // Unexpected
448                 debug_assert!(false);
449                 TrieResult::NoMatch
450             }
451         }
452     }
453 
454     #[inline(always)] // 1 call site and we want the Option to go away
get_value(&self, pos: usize) -> Option<TrieResult>455     fn get_value(&self, pos: usize) -> Option<TrieResult> {
456         let lead_unit = self.trie.get(pos)?;
457         if lead_unit & VALUE_IS_FINAL == VALUE_IS_FINAL {
458             self.read_value(pos + 1, lead_unit & 0x7fff)
459                 .map(TrieResult::FinalValue)
460         } else {
461             self.read_node_value(pos + 1, lead_unit)
462                 .map(TrieResult::Intermediate)
463         }
464     }
465 
466     #[inline(always)] // 1 call site and we want the Option to go away
read_value(&self, pos: usize, lead_unit: u16) -> Option<i32>467     fn read_value(&self, pos: usize, lead_unit: u16) -> Option<i32> {
468         let v = if lead_unit < MIN_TWO_UNIT_VALUE_LEAD {
469             lead_unit.into()
470         } else if lead_unit < THREE_UNIT_VALUE_LEAD {
471             (((lead_unit - MIN_TWO_UNIT_VALUE_LEAD) as i32) << 16) | self.trie.get(pos)? as i32
472         } else {
473             ((self.trie.get(pos)? as i32) << 16) | self.trie.get(pos + 1)? as i32
474         };
475         Some(v)
476     }
477 
478     #[inline(always)] // 1 call site and we want the Option to go away
read_node_value(&self, pos: usize, lead_unit: u16) -> Option<i32>479     fn read_node_value(&self, pos: usize, lead_unit: u16) -> Option<i32> {
480         let v = if lead_unit < (MIN_TWO_UNIT_NODE_VALUE_LEAD) {
481             ((lead_unit >> 6) - 1).into()
482         } else if lead_unit < THREE_UNIT_NODE_VALUE_LEAD {
483             ((((lead_unit & 0x7fc0) - MIN_TWO_UNIT_NODE_VALUE_LEAD) as i32) << 10)
484                 | self.trie.get(pos)? as i32
485         } else {
486             ((self.trie.get(pos)? as i32) << 16) | self.trie.get(pos + 1)? as i32
487         };
488         Some(v)
489     }
490 }
491