1 // This file is part of ICU4X. For terms of use, please see the file 2 // called LICENSE at the top level of the ICU4X source tree 3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 4 5 //! Types for walking stepwise through a trie. 6 //! 7 //! For examples, see the `.cursor()` functions 8 //! and the `Cursor` types in this module. 9 10 use crate::reader; 11 use crate::ZeroAsciiIgnoreCaseTrie; 12 use crate::ZeroTrieSimpleAscii; 13 14 use core::fmt; 15 16 impl<Store> ZeroTrieSimpleAscii<Store> 17 where 18 Store: AsRef<[u8]> + ?Sized, 19 { 20 /// Gets a cursor into the current trie. 21 /// 22 /// Useful to query a trie with data that is not a slice. 23 /// 24 /// This is currently supported only on [`ZeroTrieSimpleAscii`] 25 /// and [`ZeroAsciiIgnoreCaseTrie`]. 26 /// 27 /// # Examples 28 /// 29 /// Get a value out of a trie by [writing](fmt::Write) it to the cursor: 30 /// 31 /// ``` 32 /// use core::fmt::Write; 33 /// use zerotrie::ZeroTrieSimpleAscii; 34 /// 35 /// // A trie with two values: "abc" and "abcdef" 36 /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); 37 /// 38 /// // Get out the value for "abc" 39 /// let mut cursor = trie.cursor(); 40 /// write!(&mut cursor, "abc"); 41 /// assert_eq!(cursor.take_value(), Some(0)); 42 /// ``` 43 /// 44 /// Find the longest prefix match: 45 /// 46 /// ``` 47 /// use zerotrie::ZeroTrieSimpleAscii; 48 /// 49 /// // A trie with two values: "abc" and "abcdef" 50 /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); 51 /// 52 /// // Find the longest prefix of the string "abcdxy": 53 /// let query = b"abcdxy"; 54 /// let mut longest_prefix = 0; 55 /// let mut cursor = trie.cursor(); 56 /// for (i, b) in query.iter().enumerate() { 57 /// // Checking is_empty() is not required, but it is 58 /// // good for efficiency 59 /// if cursor.is_empty() { 60 /// break; 61 /// } 62 /// if cursor.take_value().is_some() { 63 /// longest_prefix = i; 64 /// } 65 /// cursor.step(*b); 66 /// } 67 /// 68 /// // The longest prefix is "abc" which is length 3: 69 /// assert_eq!(longest_prefix, 3); 70 /// ``` 71 #[inline] cursor(&self) -> ZeroTrieSimpleAsciiCursor72 pub fn cursor(&self) -> ZeroTrieSimpleAsciiCursor { 73 ZeroTrieSimpleAsciiCursor { 74 trie: self.as_borrowed_slice(), 75 } 76 } 77 } 78 79 impl<Store> ZeroAsciiIgnoreCaseTrie<Store> 80 where 81 Store: AsRef<[u8]> + ?Sized, 82 { 83 /// Gets a cursor into the current trie. 84 /// 85 /// Useful to query a trie with data that is not a slice. 86 /// 87 /// This is currently supported only on [`ZeroTrieSimpleAscii`] 88 /// and [`ZeroAsciiIgnoreCaseTrie`]. 89 /// 90 /// # Examples 91 /// 92 /// Get a value out of a trie by [writing](fmt::Write) it to the cursor: 93 /// 94 /// ``` 95 /// use core::fmt::Write; 96 /// use zerotrie::ZeroAsciiIgnoreCaseTrie; 97 /// 98 /// // A trie with two values: "aBc" and "aBcdEf" 99 /// let trie = ZeroAsciiIgnoreCaseTrie::from_bytes(b"aBc\x80dEf\x81"); 100 /// 101 /// // Get out the value for "abc" (case-insensitive!) 102 /// let mut cursor = trie.cursor(); 103 /// write!(&mut cursor, "abc"); 104 /// assert_eq!(cursor.take_value(), Some(0)); 105 /// ``` 106 /// 107 /// For more examples, see [`ZeroTrieSimpleAscii::cursor`]. 108 #[inline] cursor(&self) -> ZeroAsciiIgnoreCaseTrieCursor109 pub fn cursor(&self) -> ZeroAsciiIgnoreCaseTrieCursor { 110 ZeroAsciiIgnoreCaseTrieCursor { 111 trie: self.as_borrowed_slice(), 112 } 113 } 114 } 115 116 impl<'a> ZeroTrieSimpleAscii<&'a [u8]> { 117 /// Same as [`ZeroTrieSimpleAscii::cursor()`] but moves self to avoid 118 /// having to doubly anchor the trie to the stack. 119 #[inline] into_cursor(self) -> ZeroTrieSimpleAsciiCursor<'a>120 pub fn into_cursor(self) -> ZeroTrieSimpleAsciiCursor<'a> { 121 ZeroTrieSimpleAsciiCursor { trie: self } 122 } 123 } 124 125 impl<'a> ZeroAsciiIgnoreCaseTrie<&'a [u8]> { 126 /// Same as [`ZeroAsciiIgnoreCaseTrie::cursor()`] but moves self to avoid 127 /// having to doubly anchor the trie to the stack. 128 #[inline] into_cursor(self) -> ZeroAsciiIgnoreCaseTrieCursor<'a>129 pub fn into_cursor(self) -> ZeroAsciiIgnoreCaseTrieCursor<'a> { 130 ZeroAsciiIgnoreCaseTrieCursor { trie: self } 131 } 132 } 133 134 /// A cursor into a [`ZeroTrieSimpleAscii`], useful for stepwise lookup. 135 /// 136 /// For examples, see [`ZeroTrieSimpleAscii::cursor()`]. 137 // Clone but not Copy: <https://stackoverflow.com/q/32324251/1407170> 138 #[derive(Debug, Clone)] 139 pub struct ZeroTrieSimpleAsciiCursor<'a> { 140 trie: ZeroTrieSimpleAscii<&'a [u8]>, 141 } 142 143 /// A cursor into a [`ZeroAsciiIgnoreCaseTrie`], useful for stepwise lookup. 144 /// 145 /// For examples, see [`ZeroAsciiIgnoreCaseTrie::cursor()`]. 146 // Clone but not Copy: <https://stackoverflow.com/q/32324251/1407170> 147 #[derive(Debug, Clone)] 148 pub struct ZeroAsciiIgnoreCaseTrieCursor<'a> { 149 trie: ZeroAsciiIgnoreCaseTrie<&'a [u8]>, 150 } 151 152 /// Information about a probed edge. 153 #[derive(Debug, Copy, Clone, PartialEq, Eq)] 154 #[non_exhaustive] // no need to destructure or construct this in userland 155 pub struct AsciiProbeResult { 156 /// The character's byte value between this node and its parent. 157 pub byte: u8, 158 /// The number of siblings of this node, _including itself_. 159 pub total_siblings: u8, 160 } 161 162 impl ZeroTrieSimpleAsciiCursor<'_> { 163 /// Steps the cursor one character into the trie based on the character's byte value. 164 /// 165 /// # Examples 166 /// 167 /// Unrolled loop checking for string presence at every step: 168 /// 169 /// ``` 170 /// use zerotrie::ZeroTrieSimpleAscii; 171 /// 172 /// // A trie with two values: "abc" and "abcdef" 173 /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); 174 /// 175 /// // Search the trie for the string "abcdxy" 176 /// let mut cursor = trie.cursor(); 177 /// assert_eq!(cursor.take_value(), None); // "" 178 /// cursor.step(b'a'); 179 /// assert_eq!(cursor.take_value(), None); // "a" 180 /// cursor.step(b'b'); 181 /// assert_eq!(cursor.take_value(), None); // "ab" 182 /// cursor.step(b'c'); 183 /// assert_eq!(cursor.take_value(), Some(0)); // "abc" 184 /// cursor.step(b'd'); 185 /// assert_eq!(cursor.take_value(), None); // "abcd" 186 /// assert!(!cursor.is_empty()); 187 /// cursor.step(b'x'); // no strings have the prefix "abcdx" 188 /// assert!(cursor.is_empty()); 189 /// assert_eq!(cursor.take_value(), None); // "abcdx" 190 /// cursor.step(b'y'); 191 /// assert_eq!(cursor.take_value(), None); // "abcdxy" 192 /// ``` 193 /// 194 /// If the byte is not ASCII, the cursor will become empty: 195 /// 196 /// ``` 197 /// use zerotrie::ZeroTrieSimpleAscii; 198 /// 199 /// // A trie with two values: "abc" and "abcdef" 200 /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); 201 /// 202 /// let mut cursor = trie.cursor(); 203 /// assert_eq!(cursor.take_value(), None); // "" 204 /// cursor.step(b'a'); 205 /// assert_eq!(cursor.take_value(), None); // "a" 206 /// cursor.step(b'b'); 207 /// assert_eq!(cursor.take_value(), None); // "ab" 208 /// cursor.step(b'\xFF'); 209 /// assert!(cursor.is_empty()); 210 /// assert_eq!(cursor.take_value(), None); 211 /// ``` 212 #[inline] step(&mut self, byte: u8)213 pub fn step(&mut self, byte: u8) { 214 reader::step_parameterized::<ZeroTrieSimpleAscii<[u8]>>(&mut self.trie.store, byte); 215 } 216 217 /// Takes the value at the current position. 218 /// 219 /// Calling this function on a new cursor is equivalent to calling `.get()` 220 /// with the empty string (except that it can only be called once). 221 /// 222 /// # Examples 223 /// 224 /// ``` 225 /// use zerotrie::ZeroTrieSimpleAscii; 226 /// 227 /// // A trie with two values: "" and "abc" 228 /// let trie = ZeroTrieSimpleAscii::from_bytes(b"\x80abc\x81"); 229 /// 230 /// assert_eq!(Some(0), trie.get("")); 231 /// let mut cursor = trie.cursor(); 232 /// assert_eq!(Some(0), cursor.take_value()); 233 /// assert_eq!(None, cursor.take_value()); 234 /// ``` 235 #[inline] take_value(&mut self) -> Option<usize>236 pub fn take_value(&mut self) -> Option<usize> { 237 reader::take_value(&mut self.trie.store) 238 } 239 240 /// Steps the cursor one character into the trie based on an edge index, 241 /// returning the corresponding character as a byte. 242 /// 243 /// This function is similar to [`Self::step()`], but it takes an index instead of a char. 244 /// This enables stepwise iteration over the contents of the trie. 245 /// 246 /// If there are multiple possibilities for the next byte, the `index` argument allows 247 /// visiting them in order. Since this function steps the cursor, the cursor must be 248 /// cloned (a cheap operation) in order to visit multiple children. 249 /// 250 /// # Examples 251 /// 252 /// Continually query index 0 to extract the first item from a trie: 253 /// 254 /// ``` 255 /// use zerotrie::ZeroTrieSimpleAscii; 256 /// 257 /// let data: &[(String, usize)] = &[ 258 /// ("ab".to_string(), 111), 259 /// ("abcxyz".to_string(), 22), 260 /// ("abde".to_string(), 333), 261 /// ("afg".to_string(), 44), 262 /// ]; 263 /// 264 /// let trie: ZeroTrieSimpleAscii<Vec<u8>> = 265 /// data.iter().map(|(s, v)| (s.as_str(), *v)).collect(); 266 /// 267 /// let mut cursor = trie.cursor(); 268 /// let mut key = String::new(); 269 /// let value = loop { 270 /// if let Some(value) = cursor.take_value() { 271 /// break value; 272 /// } 273 /// let probe_result = cursor.probe(0).unwrap(); 274 /// key.push(char::from(probe_result.byte)); 275 /// }; 276 /// 277 /// assert_eq!(key, "ab"); 278 /// assert_eq!(value, 111); 279 /// ``` 280 /// 281 /// Stepwise iterate over all entries in the trie: 282 /// 283 /// ``` 284 /// # use zerotrie::ZeroTrieSimpleAscii; 285 /// # let data: &[(String, usize)] = &[ 286 /// # ("ab".to_string(), 111), 287 /// # ("abcxyz".to_string(), 22), 288 /// # ("abde".to_string(), 333), 289 /// # ("afg".to_string(), 44) 290 /// # ]; 291 /// # let trie: ZeroTrieSimpleAscii<Vec<u8>> = data 292 /// # .iter() 293 /// # .map(|(s, v)| (s.as_str(), *v)) 294 /// # .collect(); 295 /// // (trie built as in previous example) 296 /// 297 /// // Initialize the iteration at the first child of the trie. 298 /// let mut stack = Vec::from([(trie.cursor(), 0, 0)]); 299 /// let mut key = Vec::new(); 300 /// let mut results = Vec::new(); 301 /// loop { 302 /// let Some((mut cursor, index, suffix_len)) = stack.pop() else { 303 /// // Nothing left in the trie. 304 /// break; 305 /// }; 306 /// // Check to see if there is a value at the current node. 307 /// if let Some(value) = cursor.take_value() { 308 /// results.push((String::from_utf8(key.clone()).unwrap(), value)); 309 /// } 310 /// // Now check for children of the current node. 311 /// let mut sub_cursor = cursor.clone(); 312 /// if let Some(probe_result) = sub_cursor.probe(index) { 313 /// // Found a child. Add the current byte edge to the key. 314 /// key.push(probe_result.byte); 315 /// // Add the child to the stack, and also add back the current 316 /// // node if there are more siblings to visit. 317 /// if index + 1 < probe_result.total_siblings as usize { 318 /// stack.push((cursor, index + 1, suffix_len)); 319 /// stack.push((sub_cursor, 0, 1)); 320 /// } else { 321 /// stack.push((sub_cursor, 0, suffix_len + 1)); 322 /// } 323 /// } else { 324 /// // No more children. Pop this node's bytes from the key. 325 /// for _ in 0..suffix_len { 326 /// key.pop(); 327 /// } 328 /// } 329 /// } 330 /// 331 /// assert_eq!(&results, data); 332 /// ``` probe(&mut self, index: usize) -> Option<AsciiProbeResult>333 pub fn probe(&mut self, index: usize) -> Option<AsciiProbeResult> { 334 reader::probe_parameterized::<ZeroTrieSimpleAscii<[u8]>>(&mut self.trie.store, index) 335 } 336 337 /// Checks whether the cursor points to an empty trie. 338 /// 339 /// Use this to determine when to stop iterating. 340 #[inline] is_empty(&self) -> bool341 pub fn is_empty(&self) -> bool { 342 self.trie.is_empty() 343 } 344 } 345 346 impl ZeroAsciiIgnoreCaseTrieCursor<'_> { 347 /// Steps the cursor one byte into the trie. 348 /// 349 /// Returns the byte if matched, which may be a different case than the input byte. 350 /// If this function returns `None`, any lookup loops can be terminated. 351 /// 352 /// # Examples 353 /// 354 /// Normalize the case of a value by stepping through an ignore-case trie: 355 /// 356 /// ``` 357 /// use std::borrow::Cow; 358 /// use zerotrie::ZeroAsciiIgnoreCaseTrie; 359 /// 360 /// // A trie with two values: "aBc" and "aBcdEf" 361 /// let trie = ZeroAsciiIgnoreCaseTrie::from_bytes(b"aBc\x80dEf\x81"); 362 /// 363 /// // Get out the value for "abc" and normalize the key string 364 /// let mut cursor = trie.cursor(); 365 /// let mut key_str = Cow::Borrowed("abc".as_bytes()); 366 /// let mut i = 0; 367 /// let value = loop { 368 /// let Some(&input_byte) = key_str.get(i) else { 369 /// break cursor.take_value(); 370 /// }; 371 /// let Some(matched_byte) = cursor.step(input_byte) else { 372 /// break None; 373 /// }; 374 /// if matched_byte != input_byte { 375 /// key_str.to_mut()[i] = matched_byte; 376 /// } 377 /// i += 1; 378 /// }; 379 /// 380 /// assert_eq!(value, Some(0)); 381 /// assert_eq!(&*key_str, "aBc".as_bytes()); 382 /// ``` 383 /// 384 /// For more examples, see [`ZeroTrieSimpleAsciiCursor::step`]. 385 #[inline] step(&mut self, byte: u8) -> Option<u8>386 pub fn step(&mut self, byte: u8) -> Option<u8> { 387 reader::step_parameterized::<ZeroAsciiIgnoreCaseTrie<[u8]>>(&mut self.trie.store, byte) 388 } 389 390 /// Takes the value at the current position. 391 /// 392 /// For more details, see [`ZeroTrieSimpleAsciiCursor::take_value`]. 393 #[inline] take_value(&mut self) -> Option<usize>394 pub fn take_value(&mut self) -> Option<usize> { 395 reader::take_value(&mut self.trie.store) 396 } 397 398 /// Probes the next byte in the cursor. 399 /// 400 /// For more details, see [`ZeroTrieSimpleAsciiCursor::probe`]. probe(&mut self, index: usize) -> Option<AsciiProbeResult>401 pub fn probe(&mut self, index: usize) -> Option<AsciiProbeResult> { 402 reader::probe_parameterized::<ZeroAsciiIgnoreCaseTrie<[u8]>>(&mut self.trie.store, index) 403 } 404 405 /// Checks whether the cursor points to an empty trie. 406 /// 407 /// For more details, see [`ZeroTrieSimpleAsciiCursor::is_empty`]. 408 #[inline] is_empty(&self) -> bool409 pub fn is_empty(&self) -> bool { 410 self.trie.is_empty() 411 } 412 } 413 414 impl fmt::Write for ZeroTrieSimpleAsciiCursor<'_> { 415 /// Steps the cursor through each ASCII byte of the string. 416 /// 417 /// If the string contains non-ASCII chars, an error is returned. 418 /// 419 /// # Examples 420 /// 421 /// ``` 422 /// use core::fmt::Write; 423 /// use zerotrie::ZeroTrieSimpleAscii; 424 /// 425 /// // A trie with two values: "abc" and "abcdef" 426 /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); 427 /// 428 /// let mut cursor = trie.cursor(); 429 /// cursor.write_str("abcdxy").expect("all ASCII"); 430 /// cursor.write_str("").expect_err("non-ASCII"); 431 /// ``` write_str(&mut self, s: &str) -> fmt::Result432 fn write_str(&mut self, s: &str) -> fmt::Result { 433 for b in s.bytes() { 434 if !b.is_ascii() { 435 return Err(fmt::Error); 436 } 437 self.step(b); 438 } 439 Ok(()) 440 } 441 442 /// Equivalent to [`ZeroTrieSimpleAsciiCursor::step()`], except returns 443 /// an error if the char is non-ASCII. 444 /// 445 /// # Examples 446 /// 447 /// ``` 448 /// use core::fmt::Write; 449 /// use zerotrie::ZeroTrieSimpleAscii; 450 /// 451 /// // A trie with two values: "abc" and "abcdef" 452 /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); 453 /// 454 /// let mut cursor = trie.cursor(); 455 /// cursor.write_char('a').expect("ASCII"); 456 /// cursor.write_char('x').expect("ASCII"); 457 /// cursor.write_char('').expect_err("non-ASCII"); 458 /// ``` write_char(&mut self, c: char) -> fmt::Result459 fn write_char(&mut self, c: char) -> fmt::Result { 460 if !c.is_ascii() { 461 return Err(fmt::Error); 462 } 463 self.step(c as u8); 464 Ok(()) 465 } 466 } 467 468 impl fmt::Write for ZeroAsciiIgnoreCaseTrieCursor<'_> { 469 /// Steps the cursor through each ASCII byte of the string. 470 /// 471 /// If the string contains non-ASCII chars, an error is returned. write_str(&mut self, s: &str) -> fmt::Result472 fn write_str(&mut self, s: &str) -> fmt::Result { 473 for b in s.bytes() { 474 if !b.is_ascii() { 475 return Err(fmt::Error); 476 } 477 self.step(b); 478 } 479 Ok(()) 480 } 481 482 /// Equivalent to [`ZeroAsciiIgnoreCaseTrieCursor::step()`], except returns 483 /// an error if the char is non-ASCII. write_char(&mut self, c: char) -> fmt::Result484 fn write_char(&mut self, c: char) -> fmt::Result { 485 if !c.is_ascii() { 486 return Err(fmt::Error); 487 } 488 self.step(c as u8); 489 Ok(()) 490 } 491 } 492