1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5 #![allow(rustdoc::private_intra_doc_links)] // doc(hidden) module
6
7 //! # Byte Perfect Hash Function Internals
8 //!
9 //! This module contains a perfect hash function (PHF) designed for a fast, compact perfect
10 //! hash over 1 to 256 nodes (bytes).
11 //!
12 //! The PHF uses the following variables:
13 //!
14 //! 1. A single parameter `p`, which is 0 in about 98% of cases.
15 //! 2. A list of `N` parameters `q_t`, one per _bucket_
16 //! 3. The `N` keys in an arbitrary order determined by the PHF
17 //!
18 //! Reading a `key` from the PHF uses the following algorithm:
19 //!
20 //! 1. Let `t`, the bucket index, be `f1(key, p)`.
21 //! 2. Let `i`, the key index, be `f2(key, q_t)`.
22 //! 3. If `key == k_i`, return `Some(i)`; else return `None`.
23 //!
24 //! The functions [`f1`] and [`f2`] are internal to the PHF but should remain stable across
25 //! serialization versions of `ZeroTrie`. They are very fast, constant-time operations as long
26 //! as `p` <= [`P_FAST_MAX`] and `q` <= [`Q_FAST_MAX`]. In practice, nearly 100% of parameter
27 //! values are in the fast range.
28 //!
29 //! ```
30 //! use zerotrie::_internal::PerfectByteHashMap;
31 //!
32 //! let phf_example_bytes = [
33 //! // `p` parameter
34 //! 1, // `q` parameters, one for each of the N buckets
35 //! 0, 0, 1, 1, // Exact keys to be compared with the input
36 //! b'e', b'a', b'c', b'g',
37 //! ];
38 //!
39 //! let phf = PerfectByteHashMap::from_bytes(&phf_example_bytes);
40 //!
41 //! // The PHF returns the index of the key or `None` if not found.
42 //! assert_eq!(phf.get(b'a'), Some(1));
43 //! assert_eq!(phf.get(b'b'), None);
44 //! assert_eq!(phf.get(b'c'), Some(2));
45 //! assert_eq!(phf.get(b'd'), None);
46 //! assert_eq!(phf.get(b'e'), Some(0));
47 //! assert_eq!(phf.get(b'f'), None);
48 //! assert_eq!(phf.get(b'g'), Some(3));
49 //! ```
50
51 use crate::helpers::*;
52
53 #[cfg(feature = "alloc")]
54 mod builder;
55 #[cfg(feature = "alloc")]
56 mod cached_owned;
57
58 #[cfg(feature = "alloc")]
59 pub use cached_owned::PerfectByteHashMapCacheOwned;
60
61 /// The cutoff for the fast version of [`f1`].
62 #[cfg(feature = "alloc")] // used in the builder code
63 const P_FAST_MAX: u8 = 95;
64
65 /// The cutoff for the fast version of [`f2`].
66 const Q_FAST_MAX: u8 = 95;
67
68 /// The maximum allowable value of `p`. This could be raised if found to be necessary.
69 /// Values exceeding P_FAST_MAX could use a different `p` algorithm by modifying [`f1`].
70 #[cfg(feature = "alloc")] // used in the builder code
71 const P_REAL_MAX: u8 = P_FAST_MAX;
72
73 /// The maximum allowable value of `q`. This could be raised if found to be necessary.
74 #[cfg(feature = "alloc")] // used in the builder code
75 const Q_REAL_MAX: u8 = 127;
76
77 /// Calculates the function `f1` for the PHF. For the exact formula, please read the code.
78 ///
79 /// When `p == 0`, the operation is a simple modulus.
80 ///
81 /// The argument `n` is used only for taking the modulus so that the return value is
82 /// in the range `[0, n)`.
83 ///
84 /// # Examples
85 ///
86 /// ```
87 /// use zerotrie::_internal::f1;
88 /// const N: u8 = 10;
89 ///
90 /// // With p = 0:
91 /// assert_eq!(0, f1(0, 0, N));
92 /// assert_eq!(1, f1(1, 0, N));
93 /// assert_eq!(2, f1(2, 0, N));
94 /// assert_eq!(9, f1(9, 0, N));
95 /// assert_eq!(0, f1(10, 0, N));
96 /// assert_eq!(1, f1(11, 0, N));
97 /// assert_eq!(2, f1(12, 0, N));
98 /// assert_eq!(9, f1(19, 0, N));
99 ///
100 /// // With p = 1:
101 /// assert_eq!(1, f1(0, 1, N));
102 /// assert_eq!(0, f1(1, 1, N));
103 /// assert_eq!(2, f1(2, 1, N));
104 /// assert_eq!(2, f1(9, 1, N));
105 /// assert_eq!(4, f1(10, 1, N));
106 /// assert_eq!(5, f1(11, 1, N));
107 /// assert_eq!(1, f1(12, 1, N));
108 /// assert_eq!(7, f1(19, 1, N));
109 /// ```
110 #[inline]
f1(byte: u8, p: u8, n: u8) -> u8111 pub fn f1(byte: u8, p: u8, n: u8) -> u8 {
112 if n == 0 {
113 byte
114 } else if p == 0 {
115 byte % n
116 } else {
117 // `p` always uses the below constant-time operation. If needed, we
118 // could add some other operation here with `p > P_FAST_MAX` to solve
119 // difficult cases if the need arises.
120 let result = byte ^ p ^ byte.wrapping_shr(p as u32);
121 result % n
122 }
123 }
124
125 /// Calculates the function `f2` for the PHF. For the exact formula, please read the code.
126 ///
127 /// When `q == 0`, the operation is a simple modulus.
128 ///
129 /// The argument `n` is used only for taking the modulus so that the return value is
130 /// in the range `[0, n)`.
131 ///
132 /// # Examples
133 ///
134 /// ```
135 /// use zerotrie::_internal::f2;
136 /// const N: u8 = 10;
137 ///
138 /// // With q = 0:
139 /// assert_eq!(0, f2(0, 0, N));
140 /// assert_eq!(1, f2(1, 0, N));
141 /// assert_eq!(2, f2(2, 0, N));
142 /// assert_eq!(9, f2(9, 0, N));
143 /// assert_eq!(0, f2(10, 0, N));
144 /// assert_eq!(1, f2(11, 0, N));
145 /// assert_eq!(2, f2(12, 0, N));
146 /// assert_eq!(9, f2(19, 0, N));
147 ///
148 /// // With q = 1:
149 /// assert_eq!(1, f2(0, 1, N));
150 /// assert_eq!(0, f2(1, 1, N));
151 /// assert_eq!(3, f2(2, 1, N));
152 /// assert_eq!(8, f2(9, 1, N));
153 /// assert_eq!(1, f2(10, 1, N));
154 /// assert_eq!(0, f2(11, 1, N));
155 /// assert_eq!(3, f2(12, 1, N));
156 /// assert_eq!(8, f2(19, 1, N));
157 /// ```
158 #[inline]
f2(byte: u8, q: u8, n: u8) -> u8159 pub fn f2(byte: u8, q: u8, n: u8) -> u8 {
160 if n == 0 {
161 return byte;
162 }
163 let mut result = byte ^ q;
164 // In almost all cases, the PHF works with the above constant-time operation.
165 // However, to crack a few difficult cases, we fall back to the linear-time
166 // operation shown below.
167 for _ in Q_FAST_MAX..q {
168 result = result ^ (result << 1) ^ (result >> 1);
169 }
170 result % n
171 }
172
173 /// A constant-time map from bytes to unique indices.
174 ///
175 /// Uses a perfect hash function (see module-level documentation). Does not support mutation.
176 ///
177 /// Standard layout: P, N bytes of Q, N bytes of expected keys
178 #[derive(Debug, PartialEq, Eq)]
179 #[repr(transparent)]
180 pub struct PerfectByteHashMap<Store: ?Sized>(Store);
181
182 impl<Store> PerfectByteHashMap<Store> {
183 /// Creates an instance from a pre-existing store. See [`Self::as_bytes`].
184 #[inline]
from_store(store: Store) -> Self185 pub fn from_store(store: Store) -> Self {
186 Self(store)
187 }
188 }
189
190 impl<Store> PerfectByteHashMap<Store>
191 where
192 Store: AsRef<[u8]> + ?Sized,
193 {
194 /// Gets the usize for the given byte, or `None` if it is not in the map.
get(&self, key: u8) -> Option<usize>195 pub fn get(&self, key: u8) -> Option<usize> {
196 let (p, buffer) = self.0.as_ref().split_first()?;
197 // Note: there are N buckets followed by N keys
198 let n_usize = buffer.len() / 2;
199 if n_usize == 0 {
200 return None;
201 }
202 let n = n_usize as u8;
203 let (qq, eks) = buffer.debug_split_at(n_usize);
204 debug_assert_eq!(qq.len(), eks.len());
205 let l1 = f1(key, *p, n) as usize;
206 let q = debug_unwrap!(qq.get(l1), return None);
207 let l2 = f2(key, *q, n) as usize;
208 let ek = debug_unwrap!(eks.get(l2), return None);
209 if *ek == key {
210 Some(l2)
211 } else {
212 None
213 }
214 }
215 /// This is called `num_items` because `len` is ambiguous: it could refer
216 /// to the number of items or the number of bytes.
num_items(&self) -> usize217 pub fn num_items(&self) -> usize {
218 self.0.as_ref().len() / 2
219 }
220 /// Get an iterator over the keys in the order in which they are stored in the map.
keys(&self) -> &[u8]221 pub fn keys(&self) -> &[u8] {
222 let n = self.num_items();
223 self.0.as_ref().debug_split_at(1 + n).1
224 }
225 /// Diagnostic function that returns `p` and the maximum value of `q`
226 #[cfg(test)]
p_qmax(&self) -> Option<(u8, u8)>227 pub fn p_qmax(&self) -> Option<(u8, u8)> {
228 let (p, buffer) = self.0.as_ref().split_first()?;
229 let n = buffer.len() / 2;
230 if n == 0 {
231 return None;
232 }
233 let (qq, _) = buffer.debug_split_at(n);
234 Some((*p, *qq.iter().max().unwrap()))
235 }
236 /// Returns the map as bytes. The map can be recovered with [`Self::from_store`]
237 /// or [`Self::from_bytes`].
as_bytes(&self) -> &[u8]238 pub fn as_bytes(&self) -> &[u8] {
239 self.0.as_ref()
240 }
241
242 #[cfg(all(feature = "alloc", test))]
check(&self) -> Result<(), (&'static str, u8)>243 pub(crate) fn check(&self) -> Result<(), (&'static str, u8)> {
244 use alloc::vec;
245 let len = self.num_items();
246 let mut seen = vec![false; len];
247 for b in 0..=255u8 {
248 let get_result = self.get(b);
249 if self.keys().contains(&b) {
250 let i = get_result.ok_or(("expected to find", b))?;
251 if seen[i] {
252 return Err(("seen", b));
253 }
254 seen[i] = true;
255 } else if get_result.is_some() {
256 return Err(("did not expect to find", b));
257 }
258 }
259 Ok(())
260 }
261 }
262
263 impl PerfectByteHashMap<[u8]> {
264 /// Creates an instance from pre-existing bytes. See [`Self::as_bytes`].
265 #[inline]
from_bytes(bytes: &[u8]) -> &Self266 pub fn from_bytes(bytes: &[u8]) -> &Self {
267 // Safety: Self is repr(transparent) over [u8]
268 unsafe { core::mem::transmute(bytes) }
269 }
270 }
271
272 impl<Store> PerfectByteHashMap<Store>
273 where
274 Store: AsRef<[u8]> + ?Sized,
275 {
276 /// Converts from `PerfectByteHashMap<AsRef<[u8]>>` to `&PerfectByteHashMap<[u8]>`
277 #[inline]
as_borrowed(&self) -> &PerfectByteHashMap<[u8]>278 pub fn as_borrowed(&self) -> &PerfectByteHashMap<[u8]> {
279 PerfectByteHashMap::from_bytes(self.0.as_ref())
280 }
281 }
282
283 #[cfg(all(test, feature = "alloc"))]
284 mod tests {
285 use super::*;
286 use alloc::vec::Vec;
287 extern crate std;
288
random_alphanums(seed: u64, len: usize) -> Vec<u8>289 fn random_alphanums(seed: u64, len: usize) -> Vec<u8> {
290 use rand::seq::SliceRandom;
291 use rand::SeedableRng;
292 const BYTES: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
293 let mut rng = rand_pcg::Lcg64Xsh32::seed_from_u64(seed);
294 BYTES.choose_multiple(&mut rng, len).copied().collect()
295 }
296
297 #[test]
test_smaller()298 fn test_smaller() {
299 let mut count_by_p = [0; 256];
300 let mut count_by_qmax = [0; 256];
301 for len in 1..16 {
302 for seed in 0..150 {
303 let keys = random_alphanums(seed, len);
304 let keys_str = core::str::from_utf8(&keys).unwrap();
305 let computed = PerfectByteHashMap::try_new(&keys).expect(keys_str);
306 computed
307 .check()
308 .unwrap_or_else(|_| panic!("{}", std::str::from_utf8(&keys).expect(keys_str)));
309 let (p, qmax) = computed.p_qmax().unwrap();
310 count_by_p[p as usize] += 1;
311 count_by_qmax[qmax as usize] += 1;
312 }
313 }
314 std::println!("count_by_p (smaller): {count_by_p:?}");
315 std::println!("count_by_qmax (smaller): {count_by_qmax:?}");
316 let count_fastq = count_by_qmax[0..=Q_FAST_MAX as usize].iter().sum::<usize>();
317 let count_slowq = count_by_qmax[Q_FAST_MAX as usize + 1..]
318 .iter()
319 .sum::<usize>();
320 std::println!("fastq/slowq: {count_fastq}/{count_slowq}");
321 // Assert that 99% of cases resolve to the fast hash
322 assert!(count_fastq >= count_slowq * 100);
323 }
324
325 #[test]
test_larger()326 fn test_larger() {
327 let mut count_by_p = [0; 256];
328 let mut count_by_qmax = [0; 256];
329 for len in 16..60 {
330 for seed in 0..75 {
331 let keys = random_alphanums(seed, len);
332 let keys_str = core::str::from_utf8(&keys).unwrap();
333 let computed = PerfectByteHashMap::try_new(&keys).expect(keys_str);
334 computed
335 .check()
336 .unwrap_or_else(|_| panic!("{}", std::str::from_utf8(&keys).expect(keys_str)));
337 let (p, qmax) = computed.p_qmax().unwrap();
338 count_by_p[p as usize] += 1;
339 count_by_qmax[qmax as usize] += 1;
340 }
341 }
342 std::println!("count_by_p (larger): {count_by_p:?}");
343 std::println!("count_by_qmax (larger): {count_by_qmax:?}");
344 let count_fastq = count_by_qmax[0..=Q_FAST_MAX as usize].iter().sum::<usize>();
345 let count_slowq = count_by_qmax[Q_FAST_MAX as usize + 1..]
346 .iter()
347 .sum::<usize>();
348 std::println!("fastq/slowq: {count_fastq}/{count_slowq}");
349 // Assert that 99% of cases resolve to the fast hash
350 assert!(count_fastq >= count_slowq * 100);
351 }
352
353 #[test]
test_hard_cases()354 fn test_hard_cases() {
355 let keys = [
356 0u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
357 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
358 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
359 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
360 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
361 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
362 126, 195, 196,
363 ];
364
365 let computed = PerfectByteHashMap::try_new(&keys).unwrap();
366 let (p, qmax) = computed.p_qmax().unwrap();
367 assert_eq!(p, 69);
368 assert_eq!(qmax, 67);
369 }
370
371 #[test]
test_build_read_small()372 fn test_build_read_small() {
373 #[derive(Debug)]
374 struct TestCase<'a> {
375 keys: &'a str,
376 expected: &'a [u8],
377 reordered_keys: &'a str,
378 }
379 let cases = [
380 TestCase {
381 keys: "ab",
382 expected: &[0, 0, 0, b'b', b'a'],
383 reordered_keys: "ba",
384 },
385 TestCase {
386 keys: "abc",
387 expected: &[0, 0, 0, 0, b'c', b'a', b'b'],
388 reordered_keys: "cab",
389 },
390 TestCase {
391 // Note: splitting "a" and "c" into different buckets requires the heavier hash
392 // function because the difference between "a" and "c" is the period (2).
393 keys: "ac",
394 expected: &[1, 0, 1, b'c', b'a'],
395 reordered_keys: "ca",
396 },
397 TestCase {
398 keys: "aceg",
399 expected: &[1, 0, 0, 1, 1, b'e', b'a', b'c', b'g'],
400 reordered_keys: "eacg",
401 },
402 TestCase {
403 keys: "abd",
404 expected: &[0, 0, 1, 3, b'a', b'b', b'd'],
405 reordered_keys: "abd",
406 },
407 TestCase {
408 keys: "def",
409 expected: &[0, 0, 0, 0, b'f', b'd', b'e'],
410 reordered_keys: "fde",
411 },
412 TestCase {
413 keys: "fi",
414 expected: &[0, 0, 0, b'f', b'i'],
415 reordered_keys: "fi",
416 },
417 TestCase {
418 keys: "gh",
419 expected: &[0, 0, 0, b'h', b'g'],
420 reordered_keys: "hg",
421 },
422 TestCase {
423 keys: "lm",
424 expected: &[0, 0, 0, b'l', b'm'],
425 reordered_keys: "lm",
426 },
427 TestCase {
428 // Note: "a" and "q" (0x61 and 0x71) are very hard to split; only a handful of
429 // hash function crates can get them into separate buckets.
430 keys: "aq",
431 expected: &[4, 0, 1, b'a', b'q'],
432 reordered_keys: "aq",
433 },
434 TestCase {
435 keys: "xy",
436 expected: &[0, 0, 0, b'x', b'y'],
437 reordered_keys: "xy",
438 },
439 TestCase {
440 keys: "xyz",
441 expected: &[0, 0, 0, 0, b'x', b'y', b'z'],
442 reordered_keys: "xyz",
443 },
444 TestCase {
445 keys: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
446 expected: &[
447 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 10, 12, 16, 4, 4, 4, 4, 4, 4, 8, 4, 4, 4, 16,
448 16, 16, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
449 2, 0, 7, 104, 105, 106, 107, 108, 109, 110, 111, 112, 117, 118, 119, 68, 69,
450 70, 113, 114, 65, 66, 67, 120, 121, 122, 115, 72, 73, 74, 71, 80, 81, 82, 83,
451 84, 85, 86, 87, 88, 89, 90, 75, 76, 77, 78, 79, 103, 97, 98, 99, 116, 100, 102,
452 101,
453 ],
454 reordered_keys: "hijklmnopuvwDEFqrABCxyzsHIJGPQRSTUVWXYZKLMNOgabctdfe",
455 },
456 TestCase {
457 keys: "abcdefghij",
458 expected: &[
459 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100, 101, 102, 103, 104, 105, 106, 97, 98, 99,
460 ],
461 reordered_keys: "defghijabc",
462 },
463 TestCase {
464 // This is a small case that resolves to the slow hasher
465 keys: "Jbej",
466 expected: &[2, 0, 0, 102, 0, b'j', b'e', b'b', b'J'],
467 reordered_keys: "jebJ",
468 },
469 TestCase {
470 // This is another small case that resolves to the slow hasher
471 keys: "JFNv",
472 expected: &[1, 98, 0, 2, 0, b'J', b'F', b'N', b'v'],
473 reordered_keys: "JFNv",
474 },
475 ];
476 for cas in cases {
477 let computed = PerfectByteHashMap::try_new(cas.keys.as_bytes()).expect(cas.keys);
478 assert_eq!(computed.as_bytes(), cas.expected, "{:?}", cas);
479 assert_eq!(computed.keys(), cas.reordered_keys.as_bytes(), "{:?}", cas);
480 computed.check().expect(cas.keys);
481 }
482 }
483 }
484