• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4 
5 //! Types for walking stepwise through a trie.
6 //!
7 //! For examples, see the `.cursor()` functions
8 //! and the `Cursor` types in this module.
9 
10 use crate::reader;
11 use crate::ZeroAsciiIgnoreCaseTrie;
12 use crate::ZeroTrieSimpleAscii;
13 
14 use core::fmt;
15 
16 impl<Store> ZeroTrieSimpleAscii<Store>
17 where
18     Store: AsRef<[u8]> + ?Sized,
19 {
20     /// Gets a cursor into the current trie.
21     ///
22     /// Useful to query a trie with data that is not a slice.
23     ///
24     /// This is currently supported only on [`ZeroTrieSimpleAscii`]
25     /// and [`ZeroAsciiIgnoreCaseTrie`].
26     ///
27     /// # Examples
28     ///
29     /// Get a value out of a trie by [writing](fmt::Write) it to the cursor:
30     ///
31     /// ```
32     /// use core::fmt::Write;
33     /// use zerotrie::ZeroTrieSimpleAscii;
34     ///
35     /// // A trie with two values: "abc" and "abcdef"
36     /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
37     ///
38     /// // Get out the value for "abc"
39     /// let mut cursor = trie.cursor();
40     /// write!(&mut cursor, "abc");
41     /// assert_eq!(cursor.take_value(), Some(0));
42     /// ```
43     ///
44     /// Find the longest prefix match:
45     ///
46     /// ```
47     /// use zerotrie::ZeroTrieSimpleAscii;
48     ///
49     /// // A trie with two values: "abc" and "abcdef"
50     /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
51     ///
52     /// // Find the longest prefix of the string "abcdxy":
53     /// let query = b"abcdxy";
54     /// let mut longest_prefix = 0;
55     /// let mut cursor = trie.cursor();
56     /// for (i, b) in query.iter().enumerate() {
57     ///     // Checking is_empty() is not required, but it is
58     ///     // good for efficiency
59     ///     if cursor.is_empty() {
60     ///         break;
61     ///     }
62     ///     if cursor.take_value().is_some() {
63     ///         longest_prefix = i;
64     ///     }
65     ///     cursor.step(*b);
66     /// }
67     ///
68     /// // The longest prefix is "abc" which is length 3:
69     /// assert_eq!(longest_prefix, 3);
70     /// ```
71     #[inline]
cursor(&self) -> ZeroTrieSimpleAsciiCursor72     pub fn cursor(&self) -> ZeroTrieSimpleAsciiCursor {
73         ZeroTrieSimpleAsciiCursor {
74             trie: self.as_borrowed_slice(),
75         }
76     }
77 }
78 
79 impl<Store> ZeroAsciiIgnoreCaseTrie<Store>
80 where
81     Store: AsRef<[u8]> + ?Sized,
82 {
83     /// Gets a cursor into the current trie.
84     ///
85     /// Useful to query a trie with data that is not a slice.
86     ///
87     /// This is currently supported only on [`ZeroTrieSimpleAscii`]
88     /// and [`ZeroAsciiIgnoreCaseTrie`].
89     ///
90     /// # Examples
91     ///
92     /// Get a value out of a trie by [writing](fmt::Write) it to the cursor:
93     ///
94     /// ```
95     /// use core::fmt::Write;
96     /// use zerotrie::ZeroAsciiIgnoreCaseTrie;
97     ///
98     /// // A trie with two values: "aBc" and "aBcdEf"
99     /// let trie = ZeroAsciiIgnoreCaseTrie::from_bytes(b"aBc\x80dEf\x81");
100     ///
101     /// // Get out the value for "abc" (case-insensitive!)
102     /// let mut cursor = trie.cursor();
103     /// write!(&mut cursor, "abc");
104     /// assert_eq!(cursor.take_value(), Some(0));
105     /// ```
106     ///
107     /// For more examples, see [`ZeroTrieSimpleAscii::cursor`].
108     #[inline]
cursor(&self) -> ZeroAsciiIgnoreCaseTrieCursor109     pub fn cursor(&self) -> ZeroAsciiIgnoreCaseTrieCursor {
110         ZeroAsciiIgnoreCaseTrieCursor {
111             trie: self.as_borrowed_slice(),
112         }
113     }
114 }
115 
116 impl<'a> ZeroTrieSimpleAscii<&'a [u8]> {
117     /// Same as [`ZeroTrieSimpleAscii::cursor()`] but moves self to avoid
118     /// having to doubly anchor the trie to the stack.
119     #[inline]
into_cursor(self) -> ZeroTrieSimpleAsciiCursor<'a>120     pub fn into_cursor(self) -> ZeroTrieSimpleAsciiCursor<'a> {
121         ZeroTrieSimpleAsciiCursor { trie: self }
122     }
123 }
124 
125 impl<'a> ZeroAsciiIgnoreCaseTrie<&'a [u8]> {
126     /// Same as [`ZeroAsciiIgnoreCaseTrie::cursor()`] but moves self to avoid
127     /// having to doubly anchor the trie to the stack.
128     #[inline]
into_cursor(self) -> ZeroAsciiIgnoreCaseTrieCursor<'a>129     pub fn into_cursor(self) -> ZeroAsciiIgnoreCaseTrieCursor<'a> {
130         ZeroAsciiIgnoreCaseTrieCursor { trie: self }
131     }
132 }
133 
134 /// A cursor into a [`ZeroTrieSimpleAscii`], useful for stepwise lookup.
135 ///
136 /// For examples, see [`ZeroTrieSimpleAscii::cursor()`].
137 // Clone but not Copy: <https://stackoverflow.com/q/32324251/1407170>
138 #[derive(Debug, Clone)]
139 pub struct ZeroTrieSimpleAsciiCursor<'a> {
140     trie: ZeroTrieSimpleAscii<&'a [u8]>,
141 }
142 
143 /// A cursor into a [`ZeroAsciiIgnoreCaseTrie`], useful for stepwise lookup.
144 ///
145 /// For examples, see [`ZeroAsciiIgnoreCaseTrie::cursor()`].
146 // Clone but not Copy: <https://stackoverflow.com/q/32324251/1407170>
147 #[derive(Debug, Clone)]
148 pub struct ZeroAsciiIgnoreCaseTrieCursor<'a> {
149     trie: ZeroAsciiIgnoreCaseTrie<&'a [u8]>,
150 }
151 
152 /// Information about a probed edge.
153 #[derive(Debug, Copy, Clone, PartialEq, Eq)]
154 #[non_exhaustive] // no need to destructure or construct this in userland
155 pub struct AsciiProbeResult {
156     /// The character's byte value between this node and its parent.
157     pub byte: u8,
158     /// The number of siblings of this node, _including itself_.
159     pub total_siblings: u8,
160 }
161 
162 impl ZeroTrieSimpleAsciiCursor<'_> {
163     /// Steps the cursor one character into the trie based on the character's byte value.
164     ///
165     /// # Examples
166     ///
167     /// Unrolled loop checking for string presence at every step:
168     ///
169     /// ```
170     /// use zerotrie::ZeroTrieSimpleAscii;
171     ///
172     /// // A trie with two values: "abc" and "abcdef"
173     /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
174     ///
175     /// // Search the trie for the string "abcdxy"
176     /// let mut cursor = trie.cursor();
177     /// assert_eq!(cursor.take_value(), None); // ""
178     /// cursor.step(b'a');
179     /// assert_eq!(cursor.take_value(), None); // "a"
180     /// cursor.step(b'b');
181     /// assert_eq!(cursor.take_value(), None); // "ab"
182     /// cursor.step(b'c');
183     /// assert_eq!(cursor.take_value(), Some(0)); // "abc"
184     /// cursor.step(b'd');
185     /// assert_eq!(cursor.take_value(), None); // "abcd"
186     /// assert!(!cursor.is_empty());
187     /// cursor.step(b'x'); // no strings have the prefix "abcdx"
188     /// assert!(cursor.is_empty());
189     /// assert_eq!(cursor.take_value(), None); // "abcdx"
190     /// cursor.step(b'y');
191     /// assert_eq!(cursor.take_value(), None); // "abcdxy"
192     /// ```
193     ///
194     /// If the byte is not ASCII, the cursor will become empty:
195     ///
196     /// ```
197     /// use zerotrie::ZeroTrieSimpleAscii;
198     ///
199     /// // A trie with two values: "abc" and "abcdef"
200     /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
201     ///
202     /// let mut cursor = trie.cursor();
203     /// assert_eq!(cursor.take_value(), None); // ""
204     /// cursor.step(b'a');
205     /// assert_eq!(cursor.take_value(), None); // "a"
206     /// cursor.step(b'b');
207     /// assert_eq!(cursor.take_value(), None); // "ab"
208     /// cursor.step(b'\xFF');
209     /// assert!(cursor.is_empty());
210     /// assert_eq!(cursor.take_value(), None);
211     /// ```
212     #[inline]
step(&mut self, byte: u8)213     pub fn step(&mut self, byte: u8) {
214         reader::step_parameterized::<ZeroTrieSimpleAscii<[u8]>>(&mut self.trie.store, byte);
215     }
216 
217     /// Takes the value at the current position.
218     ///
219     /// Calling this function on a new cursor is equivalent to calling `.get()`
220     /// with the empty string (except that it can only be called once).
221     ///
222     /// # Examples
223     ///
224     /// ```
225     /// use zerotrie::ZeroTrieSimpleAscii;
226     ///
227     /// // A trie with two values: "" and "abc"
228     /// let trie = ZeroTrieSimpleAscii::from_bytes(b"\x80abc\x81");
229     ///
230     /// assert_eq!(Some(0), trie.get(""));
231     /// let mut cursor = trie.cursor();
232     /// assert_eq!(Some(0), cursor.take_value());
233     /// assert_eq!(None, cursor.take_value());
234     /// ```
235     #[inline]
take_value(&mut self) -> Option<usize>236     pub fn take_value(&mut self) -> Option<usize> {
237         reader::take_value(&mut self.trie.store)
238     }
239 
240     /// Steps the cursor one character into the trie based on an edge index,
241     /// returning the corresponding character as a byte.
242     ///
243     /// This function is similar to [`Self::step()`], but it takes an index instead of a char.
244     /// This enables stepwise iteration over the contents of the trie.
245     ///
246     /// If there are multiple possibilities for the next byte, the `index` argument allows
247     /// visiting them in order. Since this function steps the cursor, the cursor must be
248     /// cloned (a cheap operation) in order to visit multiple children.
249     ///
250     /// # Examples
251     ///
252     /// Continually query index 0 to extract the first item from a trie:
253     ///
254     /// ```
255     /// use zerotrie::ZeroTrieSimpleAscii;
256     ///
257     /// let data: &[(String, usize)] = &[
258     ///     ("ab".to_string(), 111),
259     ///     ("abcxyz".to_string(), 22),
260     ///     ("abde".to_string(), 333),
261     ///     ("afg".to_string(), 44),
262     /// ];
263     ///
264     /// let trie: ZeroTrieSimpleAscii<Vec<u8>> =
265     ///     data.iter().map(|(s, v)| (s.as_str(), *v)).collect();
266     ///
267     /// let mut cursor = trie.cursor();
268     /// let mut key = String::new();
269     /// let value = loop {
270     ///     if let Some(value) = cursor.take_value() {
271     ///         break value;
272     ///     }
273     ///     let probe_result = cursor.probe(0).unwrap();
274     ///     key.push(char::from(probe_result.byte));
275     /// };
276     ///
277     /// assert_eq!(key, "ab");
278     /// assert_eq!(value, 111);
279     /// ```
280     ///
281     /// Stepwise iterate over all entries in the trie:
282     ///
283     /// ```
284     /// # use zerotrie::ZeroTrieSimpleAscii;
285     /// # let data: &[(String, usize)] = &[
286     /// #     ("ab".to_string(), 111),
287     /// #     ("abcxyz".to_string(), 22),
288     /// #     ("abde".to_string(), 333),
289     /// #     ("afg".to_string(), 44)
290     /// # ];
291     /// # let trie: ZeroTrieSimpleAscii<Vec<u8>> = data
292     /// #     .iter()
293     /// #     .map(|(s, v)| (s.as_str(), *v))
294     /// #     .collect();
295     /// // (trie built as in previous example)
296     ///
297     /// // Initialize the iteration at the first child of the trie.
298     /// let mut stack = Vec::from([(trie.cursor(), 0, 0)]);
299     /// let mut key = Vec::new();
300     /// let mut results = Vec::new();
301     /// loop {
302     ///     let Some((mut cursor, index, suffix_len)) = stack.pop() else {
303     ///         // Nothing left in the trie.
304     ///         break;
305     ///     };
306     ///     // Check to see if there is a value at the current node.
307     ///     if let Some(value) = cursor.take_value() {
308     ///         results.push((String::from_utf8(key.clone()).unwrap(), value));
309     ///     }
310     ///     // Now check for children of the current node.
311     ///     let mut sub_cursor = cursor.clone();
312     ///     if let Some(probe_result) = sub_cursor.probe(index) {
313     ///         // Found a child. Add the current byte edge to the key.
314     ///         key.push(probe_result.byte);
315     ///         // Add the child to the stack, and also add back the current
316     ///         // node if there are more siblings to visit.
317     ///         if index + 1 < probe_result.total_siblings as usize {
318     ///             stack.push((cursor, index + 1, suffix_len));
319     ///             stack.push((sub_cursor, 0, 1));
320     ///         } else {
321     ///             stack.push((sub_cursor, 0, suffix_len + 1));
322     ///         }
323     ///     } else {
324     ///         // No more children. Pop this node's bytes from the key.
325     ///         for _ in 0..suffix_len {
326     ///             key.pop();
327     ///         }
328     ///     }
329     /// }
330     ///
331     /// assert_eq!(&results, data);
332     /// ```
probe(&mut self, index: usize) -> Option<AsciiProbeResult>333     pub fn probe(&mut self, index: usize) -> Option<AsciiProbeResult> {
334         reader::probe_parameterized::<ZeroTrieSimpleAscii<[u8]>>(&mut self.trie.store, index)
335     }
336 
337     /// Checks whether the cursor points to an empty trie.
338     ///
339     /// Use this to determine when to stop iterating.
340     #[inline]
is_empty(&self) -> bool341     pub fn is_empty(&self) -> bool {
342         self.trie.is_empty()
343     }
344 }
345 
346 impl ZeroAsciiIgnoreCaseTrieCursor<'_> {
347     /// Steps the cursor one byte into the trie.
348     ///
349     /// Returns the byte if matched, which may be a different case than the input byte.
350     /// If this function returns `None`, any lookup loops can be terminated.
351     ///
352     /// # Examples
353     ///
354     /// Normalize the case of a value by stepping through an ignore-case trie:
355     ///
356     /// ```
357     /// use std::borrow::Cow;
358     /// use zerotrie::ZeroAsciiIgnoreCaseTrie;
359     ///
360     /// // A trie with two values: "aBc" and "aBcdEf"
361     /// let trie = ZeroAsciiIgnoreCaseTrie::from_bytes(b"aBc\x80dEf\x81");
362     ///
363     /// // Get out the value for "abc" and normalize the key string
364     /// let mut cursor = trie.cursor();
365     /// let mut key_str = Cow::Borrowed("abc".as_bytes());
366     /// let mut i = 0;
367     /// let value = loop {
368     ///     let Some(&input_byte) = key_str.get(i) else {
369     ///         break cursor.take_value();
370     ///     };
371     ///     let Some(matched_byte) = cursor.step(input_byte) else {
372     ///         break None;
373     ///     };
374     ///     if matched_byte != input_byte {
375     ///         key_str.to_mut()[i] = matched_byte;
376     ///     }
377     ///     i += 1;
378     /// };
379     ///
380     /// assert_eq!(value, Some(0));
381     /// assert_eq!(&*key_str, "aBc".as_bytes());
382     /// ```
383     ///
384     /// For more examples, see [`ZeroTrieSimpleAsciiCursor::step`].
385     #[inline]
step(&mut self, byte: u8) -> Option<u8>386     pub fn step(&mut self, byte: u8) -> Option<u8> {
387         reader::step_parameterized::<ZeroAsciiIgnoreCaseTrie<[u8]>>(&mut self.trie.store, byte)
388     }
389 
390     /// Takes the value at the current position.
391     ///
392     /// For more details, see [`ZeroTrieSimpleAsciiCursor::take_value`].
393     #[inline]
take_value(&mut self) -> Option<usize>394     pub fn take_value(&mut self) -> Option<usize> {
395         reader::take_value(&mut self.trie.store)
396     }
397 
398     /// Probes the next byte in the cursor.
399     ///
400     /// For more details, see [`ZeroTrieSimpleAsciiCursor::probe`].
probe(&mut self, index: usize) -> Option<AsciiProbeResult>401     pub fn probe(&mut self, index: usize) -> Option<AsciiProbeResult> {
402         reader::probe_parameterized::<ZeroAsciiIgnoreCaseTrie<[u8]>>(&mut self.trie.store, index)
403     }
404 
405     /// Checks whether the cursor points to an empty trie.
406     ///
407     /// For more details, see [`ZeroTrieSimpleAsciiCursor::is_empty`].
408     #[inline]
is_empty(&self) -> bool409     pub fn is_empty(&self) -> bool {
410         self.trie.is_empty()
411     }
412 }
413 
414 impl fmt::Write for ZeroTrieSimpleAsciiCursor<'_> {
415     /// Steps the cursor through each ASCII byte of the string.
416     ///
417     /// If the string contains non-ASCII chars, an error is returned.
418     ///
419     /// # Examples
420     ///
421     /// ```
422     /// use core::fmt::Write;
423     /// use zerotrie::ZeroTrieSimpleAscii;
424     ///
425     /// // A trie with two values: "abc" and "abcdef"
426     /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
427     ///
428     /// let mut cursor = trie.cursor();
429     /// cursor.write_str("abcdxy").expect("all ASCII");
430     /// cursor.write_str("��").expect_err("non-ASCII");
431     /// ```
write_str(&mut self, s: &str) -> fmt::Result432     fn write_str(&mut self, s: &str) -> fmt::Result {
433         for b in s.bytes() {
434             if !b.is_ascii() {
435                 return Err(fmt::Error);
436             }
437             self.step(b);
438         }
439         Ok(())
440     }
441 
442     /// Equivalent to [`ZeroTrieSimpleAsciiCursor::step()`], except returns
443     /// an error if the char is non-ASCII.
444     ///
445     /// # Examples
446     ///
447     /// ```
448     /// use core::fmt::Write;
449     /// use zerotrie::ZeroTrieSimpleAscii;
450     ///
451     /// // A trie with two values: "abc" and "abcdef"
452     /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
453     ///
454     /// let mut cursor = trie.cursor();
455     /// cursor.write_char('a').expect("ASCII");
456     /// cursor.write_char('x').expect("ASCII");
457     /// cursor.write_char('��').expect_err("non-ASCII");
458     /// ```
write_char(&mut self, c: char) -> fmt::Result459     fn write_char(&mut self, c: char) -> fmt::Result {
460         if !c.is_ascii() {
461             return Err(fmt::Error);
462         }
463         self.step(c as u8);
464         Ok(())
465     }
466 }
467 
468 impl fmt::Write for ZeroAsciiIgnoreCaseTrieCursor<'_> {
469     /// Steps the cursor through each ASCII byte of the string.
470     ///
471     /// If the string contains non-ASCII chars, an error is returned.
write_str(&mut self, s: &str) -> fmt::Result472     fn write_str(&mut self, s: &str) -> fmt::Result {
473         for b in s.bytes() {
474             if !b.is_ascii() {
475                 return Err(fmt::Error);
476             }
477             self.step(b);
478         }
479         Ok(())
480     }
481 
482     /// Equivalent to [`ZeroAsciiIgnoreCaseTrieCursor::step()`], except returns
483     /// an error if the char is non-ASCII.
write_char(&mut self, c: char) -> fmt::Result484     fn write_char(&mut self, c: char) -> fmt::Result {
485         if !c.is_ascii() {
486             return Err(fmt::Error);
487         }
488         self.step(c as u8);
489         Ok(())
490     }
491 }
492