1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5 use litemap::LiteMap;
6 use zerotrie::ZeroTriePerfectHash;
7 use zerotrie::ZeroTrieSimpleAscii;
8
9 mod testdata {
10 include!("data/data.rs");
11 }
12
13 use testdata::strings_to_litemap;
14
15 const NON_EXISTENT_STRINGS: &[&str] = &[
16 "a9PS", "ahsY", "ahBO", "a8IN", "xk8o", "xv1l", "xI2S", "618y", "d6My", "uszy",
17 ];
18
19 macro_rules! assert_bytes_eq {
20 ($len:literal, $a:expr, $b:expr) => {
21 assert_eq!($len, $a.len());
22 assert_eq!($a, $b);
23 };
24 }
25
check_simple_ascii_trie<S>(items: &LiteMap<&[u8], usize>, trie: &ZeroTrieSimpleAscii<S>) where S: AsRef<[u8]> + ?Sized,26 fn check_simple_ascii_trie<S>(items: &LiteMap<&[u8], usize>, trie: &ZeroTrieSimpleAscii<S>)
27 where
28 S: AsRef<[u8]> + ?Sized,
29 {
30 // Check that each item is in the trie
31 for (k, v) in items.iter() {
32 assert_eq!(trie.get(k), Some(*v));
33 }
34 // Check that some items are not in the trie
35 for s in NON_EXISTENT_STRINGS.iter() {
36 assert_eq!(trie.get(s.as_bytes()), None);
37 }
38 // Check that the iterator returns items in the same order as the LiteMap
39 assert!(items
40 .iter()
41 .map(|(s, v)| (String::from_utf8(s.to_vec()).unwrap(), *v))
42 .eq(trie.iter()));
43 // Check that the const builder works
44 let const_trie = ZeroTrieSimpleAscii::try_from_litemap_with_const_builder(items).unwrap();
45 assert_eq!(trie.as_bytes(), const_trie.as_bytes());
46 }
47
check_phf_ascii_trie<S>(items: &LiteMap<&[u8], usize>, trie: &ZeroTriePerfectHash<S>) where S: AsRef<[u8]> + ?Sized,48 fn check_phf_ascii_trie<S>(items: &LiteMap<&[u8], usize>, trie: &ZeroTriePerfectHash<S>)
49 where
50 S: AsRef<[u8]> + ?Sized,
51 {
52 // Check that each item is in the trie
53 for (k, v) in items.iter() {
54 assert_eq!(trie.get(k), Some(*v));
55 }
56 // Check that some items are not in the trie
57 for s in NON_EXISTENT_STRINGS.iter() {
58 assert_eq!(trie.get(s.as_bytes()), None);
59 }
60 // Check that the iterator returns the contents of the LiteMap
61 // Note: Since the items might not be in order, we collect them into a new LiteMap
62 let recovered_items: LiteMap<_, _> = trie.iter().collect();
63 assert_eq!(
64 items.to_borrowed_keys_values::<[u8], usize, Vec<_>>(),
65 recovered_items.to_borrowed_keys_values()
66 );
67 }
68
check_phf_bytes_trie<S>(items: &LiteMap<&[u8], usize>, trie: &ZeroTriePerfectHash<S>) where S: AsRef<[u8]> + ?Sized,69 fn check_phf_bytes_trie<S>(items: &LiteMap<&[u8], usize>, trie: &ZeroTriePerfectHash<S>)
70 where
71 S: AsRef<[u8]> + ?Sized,
72 {
73 // Check that each item is in the trie
74 for (k, v) in items.iter() {
75 assert_eq!(trie.get(k), Some(*v), "{k:?}");
76 }
77 // Check that some items are not in the trie
78 for s in NON_EXISTENT_STRINGS.iter() {
79 assert_eq!(trie.get(s.as_bytes()), None, "{s:?}");
80 }
81 // Check that the iterator returns the contents of the LiteMap
82 // Note: Since the items might not be in order, we collect them into a new LiteMap
83 let recovered_items: LiteMap<_, _> = trie.iter().collect();
84 assert_eq!(
85 items.to_borrowed_keys_values::<[u8], usize, Vec<_>>(),
86 recovered_items.to_borrowed_keys_values()
87 );
88 }
89
90 #[test]
test_basic()91 fn test_basic() {
92 let lm1a: LiteMap<&[u8], usize> = testdata::basic::DATA_ASCII.iter().copied().collect();
93 let lm1b: LiteMap<&[u8], usize> = lm1a.to_borrowed_keys();
94 let lm2: LiteMap<&[u8], usize> = testdata::basic::DATA_UNICODE.iter().copied().collect();
95 let lm3: LiteMap<&[u8], usize> = testdata::basic::DATA_BINARY.iter().copied().collect();
96
97 let expected_bytes = testdata::basic::TRIE_ASCII;
98 let trie = ZeroTrieSimpleAscii::try_from(&lm1a).unwrap();
99 assert_bytes_eq!(26, trie.as_bytes(), expected_bytes);
100 check_simple_ascii_trie(&lm1a, &trie);
101
102 let trie = ZeroTriePerfectHash::try_from(&lm1b).unwrap();
103 assert_bytes_eq!(26, trie.as_bytes(), expected_bytes);
104 check_phf_ascii_trie(&lm1a, &trie);
105
106 let expected_bytes = testdata::basic::TRIE_UNICODE;
107 let trie = ZeroTriePerfectHash::try_from(&lm2).unwrap();
108 assert_bytes_eq!(39, trie.as_bytes(), expected_bytes);
109 check_phf_bytes_trie(&lm2, &trie);
110
111 let expected_bytes = testdata::basic::TRIE_BINARY;
112 let trie = ZeroTriePerfectHash::try_from(&lm3).unwrap();
113 assert_bytes_eq!(26, trie.as_bytes(), expected_bytes);
114 check_phf_bytes_trie(&lm3, &trie);
115 }
116
117 #[test]
test_empty()118 fn test_empty() {
119 let trie = ZeroTrieSimpleAscii::try_from(&LiteMap::<&[u8], usize>::new_vec()).unwrap();
120 assert_eq!(trie.byte_len(), 0);
121 assert!(trie.is_empty());
122 assert_eq!(trie.get(b""), None);
123 assert_eq!(trie.as_bytes(), &[]);
124 }
125
126 #[test]
test_single_empty_value()127 fn test_single_empty_value() {
128 let litemap: LiteMap<&[u8], usize> = [
129 (&b""[..], 10), //
130 ]
131 .into_iter()
132 .collect();
133 let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
134 assert_eq!(trie.get(b""), Some(10));
135 assert_eq!(trie.get(b"x"), None);
136 let expected_bytes = &[0b10001010];
137 assert_eq!(trie.as_bytes(), expected_bytes);
138
139 let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>();
140 let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap();
141 assert_bytes_eq!(1, trie_phf.as_bytes(), expected_bytes);
142 check_phf_ascii_trie(&litemap, &trie_phf);
143 }
144
145 #[test]
test_single_byte_string()146 fn test_single_byte_string() {
147 let litemap: LiteMap<&[u8], usize> = [
148 (&b"x"[..], 10), //
149 ]
150 .into_iter()
151 .collect();
152 let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
153 assert_eq!(trie.get(b""), None);
154 assert_eq!(trie.get(b"xy"), None);
155 check_simple_ascii_trie(&litemap, &trie);
156 let expected_bytes = &[b'x', 0b10001010];
157 assert_bytes_eq!(2, trie.as_bytes(), expected_bytes);
158
159 let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>();
160 let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap();
161 assert_bytes_eq!(2, trie_phf.as_bytes(), expected_bytes);
162 check_phf_ascii_trie(&litemap, &trie_phf);
163 }
164
165 #[test]
test_single_string()166 fn test_single_string() {
167 let litemap: LiteMap<&[u8], usize> = [
168 (&b"xyz"[..], 10), //
169 ]
170 .into_iter()
171 .collect();
172 let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
173 assert_eq!(trie.get(b""), None);
174 assert_eq!(trie.get(b"x"), None);
175 assert_eq!(trie.get(b"xy"), None);
176 assert_eq!(trie.get(b"xyzz"), None);
177 check_simple_ascii_trie(&litemap, &trie);
178 let expected_bytes = &[b'x', b'y', b'z', 0b10001010];
179 assert_bytes_eq!(4, trie.as_bytes(), expected_bytes);
180
181 let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>();
182 let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap();
183 assert_bytes_eq!(4, trie_phf.as_bytes(), expected_bytes);
184 check_phf_ascii_trie(&litemap, &trie_phf);
185 }
186
187 #[test]
test_prefix_strings()188 fn test_prefix_strings() {
189 let litemap: LiteMap<&[u8], usize> = [(&b"x"[..], 0), (b"xy", 1)].into_iter().collect();
190 let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
191 assert_eq!(trie.get(b""), None);
192 assert_eq!(trie.get(b"xyz"), None);
193 check_simple_ascii_trie(&litemap, &trie);
194 let expected_bytes = &[b'x', 0b10000000, b'y', 0b10000001];
195 assert_bytes_eq!(4, trie.as_bytes(), expected_bytes);
196
197 let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>();
198 let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap();
199 assert_bytes_eq!(4, trie_phf.as_bytes(), expected_bytes);
200 check_phf_ascii_trie(&litemap, &trie_phf);
201 }
202
203 #[test]
test_single_byte_branch()204 fn test_single_byte_branch() {
205 let litemap: LiteMap<&[u8], usize> = [(&b"x"[..], 0), (b"y", 1)].into_iter().collect();
206 let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
207 assert_eq!(trie.get(b""), None);
208 assert_eq!(trie.get(b"xy"), None);
209 check_simple_ascii_trie(&litemap, &trie);
210 let expected_bytes = &[0b11000010, b'x', b'y', 1, 0b10000000, 0b10000001];
211 assert_bytes_eq!(6, trie.as_bytes(), expected_bytes);
212
213 let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>();
214 let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap();
215 assert_bytes_eq!(6, trie_phf.as_bytes(), expected_bytes);
216 check_phf_ascii_trie(&litemap, &trie_phf);
217 }
218
219 #[test]
test_multi_byte_branch()220 fn test_multi_byte_branch() {
221 let litemap: LiteMap<&[u8], usize> = [(&b"axb"[..], 0), (b"ayc", 1)].into_iter().collect();
222 let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
223 assert_eq!(trie.get(b""), None);
224 assert_eq!(trie.get(b"a"), None);
225 assert_eq!(trie.get(b"ax"), None);
226 assert_eq!(trie.get(b"ay"), None);
227 check_simple_ascii_trie(&litemap, &trie);
228 let expected_bytes = &[
229 b'a', 0b11000010, b'x', b'y', 2, b'b', 0b10000000, b'c', 0b10000001,
230 ];
231 assert_bytes_eq!(9, trie.as_bytes(), expected_bytes);
232
233 let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>();
234 let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap();
235 assert_bytes_eq!(9, trie_phf.as_bytes(), expected_bytes);
236 check_phf_ascii_trie(&litemap, &trie_phf);
237 }
238
239 #[test]
test_linear_varint_values()240 fn test_linear_varint_values() {
241 let litemap: LiteMap<&[u8], usize> = [(&b""[..], 100), (b"x", 500), (b"xyz", 5000)]
242 .into_iter()
243 .collect();
244 let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
245 assert_eq!(trie.get(b"xy"), None);
246 assert_eq!(trie.get(b"xz"), None);
247 assert_eq!(trie.get(b"xyzz"), None);
248 check_simple_ascii_trie(&litemap, &trie);
249 let expected_bytes = &[0x90, 0x54, b'x', 0x93, 0x64, b'y', b'z', 0x90, 0x96, 0x78];
250 assert_bytes_eq!(10, trie.as_bytes(), expected_bytes);
251
252 let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>();
253 let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap();
254 assert_bytes_eq!(10, trie_phf.as_bytes(), expected_bytes);
255 check_phf_ascii_trie(&litemap, &trie_phf);
256 }
257
258 #[test]
test_bug()259 fn test_bug() {
260 let litemap: LiteMap<&[u8], usize> = [(&b"abc"[..], 100), (b"abcd", 500), (b"abcde", 5000)]
261 .into_iter()
262 .collect();
263 let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
264 assert_eq!(trie.get(b"ab"), None);
265 assert_eq!(trie.get(b"abd"), None);
266 assert_eq!(trie.get(b"abCD"), None);
267 check_simple_ascii_trie(&litemap, &trie);
268
269 let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>();
270 let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap();
271 check_phf_ascii_trie(&litemap, &trie_phf);
272 }
273
274 #[test]
test_varint_branch()275 fn test_varint_branch() {
276 let chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
277 let litemap: LiteMap<&[u8], usize> = (0..chars.len())
278 .map(|i| (chars.get(i..i + 1).unwrap().as_bytes(), i))
279 .collect();
280 let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
281 assert_eq!(trie.get(b""), None);
282 assert_eq!(trie.get(b"ax"), None);
283 assert_eq!(trie.get(b"ay"), None);
284 check_simple_ascii_trie(&litemap, &trie);
285 #[rustfmt::skip]
286 let expected_bytes = &[
287 0b11100000, // branch varint lead
288 0x14, // branch varint trail
289 // search array:
290 b'A', b'B', b'C', b'D', b'E', b'F', b'G', b'H', b'I', b'J',
291 b'K', b'L', b'M', b'N', b'O', b'P', b'Q', b'R', b'S', b'T',
292 b'U', b'V', b'W', b'X', b'Y', b'Z',
293 b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j',
294 b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't',
295 b'u', b'v', b'w', b'x', b'y', b'z',
296 // offset array:
297 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 20,
298 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52,
299 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84,
300 86,
301 // single-byte values:
302 0x80, (0x80 | 1), (0x80 | 2), (0x80 | 3), (0x80 | 4),
303 (0x80 | 5), (0x80 | 6), (0x80 | 7), (0x80 | 8), (0x80 | 9),
304 (0x80 | 10), (0x80 | 11), (0x80 | 12), (0x80 | 13), (0x80 | 14),
305 (0x80 | 15),
306 // multi-byte values:
307 0x90, 0, 0x90, 1, 0x90, 2, 0x90, 3, 0x90, 4, 0x90, 5,
308 0x90, 6, 0x90, 7, 0x90, 8, 0x90, 9, 0x90, 10, 0x90, 11,
309 0x90, 12, 0x90, 13, 0x90, 14, 0x90, 15, 0x90, 16, 0x90, 17,
310 0x90, 18, 0x90, 19, 0x90, 20, 0x90, 21, 0x90, 22, 0x90, 23,
311 0x90, 24, 0x90, 25, 0x90, 26, 0x90, 27, 0x90, 28, 0x90, 29,
312 0x90, 30, 0x90, 31, 0x90, 32, 0x90, 33, 0x90, 34, 0x90, 35,
313 ];
314 assert_bytes_eq!(193, trie.as_bytes(), expected_bytes);
315
316 #[rustfmt::skip]
317 let expected_bytes = &[
318 0b11100000, // branch varint lead
319 0x14, // branch varint trail
320 // PHF metadata:
321 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 10, 12, 16, 4, 4, 4, 4, 4, 4, 8,
322 4, 4, 4, 16, 16, 16, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
323 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 7,
324 // search array:
325 b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
326 b'p', b'u', b'v', b'w', b'D', b'E', b'F', b'q',
327 b'r', b'A', b'B', b'C', b'x', b'y', b'z', b's',
328 b'H', b'I', b'J', b'G', b'P', b'Q', b'R', b'S',
329 b'T', b'U', b'V', b'W', b'X', b'Y', b'Z', b'K',
330 b'L', b'M', b'N', b'O', b'g', b'a', b'b', b'c',
331 b't', b'd', b'f', b'e',
332 // offset array:
333 2, 4, 6, 8, 10, 12, 14,
334 16, 18, 20, 22, 24, 25, 26, 27,
335 29, 31, 32, 33, 34, 36, 38, 40,
336 42, 43, 44, 45, 46, 47, 49, 51,
337 53, 55, 57, 59, 61, 63, 65, 67,
338 68, 69, 70, 71, 72, 74, 76, 78,
339 80, 82, 84, 86,
340 // values:
341 0x90, 17, 0x90, 18, 0x90, 19, 0x90, 20, 0x90, 21, 0x90, 22, 0x90, 23,
342 0x90, 24, 0x90, 25, 0x90, 30, 0x90, 31, 0x90, 32, 0x80 | 3, 0x80 | 4,
343 0x80 | 5, 0x90, 26, 0x90, 27, 0x80, 0x80 | 1, 0x80 | 2, 0x90, 33,
344 0x90, 34, 0x90, 35, 0x90, 28, 0x80 | 7, 0x80 | 8, 0x80 | 9, 0x80 | 6,
345 0x80 | 15, 0x90, 0, 0x90, 1, 0x90, 2, 0x90, 3, 0x90, 4, 0x90, 5,
346 0x90, 6, 0x90, 7, 0x90, 8, 0x90, 9, 0x80 | 10, 0x80 | 11, 0x80 | 12,
347 0x80 | 13, 0x80 | 14, 0x90, 16, 0x90, 10, 0x90, 11, 0x90, 12, 0x90, 29,
348 0x90, 13, 0x90, 15, 0x90, 14,
349 ];
350 let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>();
351 let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap();
352 assert_bytes_eq!(246, trie_phf.as_bytes(), expected_bytes);
353 check_phf_ascii_trie(&litemap, &trie_phf);
354 }
355
356 #[test]
test_below_wide()357 fn test_below_wide() {
358 let litemap: LiteMap<&[u8], usize> = [
359 (&b"abcdefghijklmnopqrstuvwxyz"[..], 1),
360 (b"bcdefghijklmnopqrstuvwxyza", 2),
361 (b"cdefghijklmnopqrstuvwxyzab", 3),
362 (b"defghijklmnopqrstuvwxyzabc", 4),
363 (b"efghijklmnopqrstuvwxyzabcd", 5),
364 (b"fghijklmnopqrstuvwxyzabcde", 6),
365 (b"ghijklmnopqrstuvwxyzabcdef", 7),
366 (b"hijklmnopqrstuvwxyzabcdefg", 8),
367 (b"ijklmnopqrstuvwxyzabcdefgh", 9),
368 (b"jklmnopqrstuvwxyzabcd", 10),
369 ]
370 .into_iter()
371 .collect();
372 let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
373 assert_eq!(trie.get(b""), None);
374 assert_eq!(trie.get(b"abc"), None);
375 check_simple_ascii_trie(&litemap, &trie);
376 #[rustfmt::skip]
377 let expected_bytes = &[
378 0b11001010, // branch
379 // search array:
380 b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j',
381 // offset array:
382 26, 52, 78, 104, 130, 156, 182, 208, 234,
383 // offset data:
384 b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n',
385 b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z',
386 0x81,
387 b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
388 b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a',
389 0x82,
390 b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p',
391 b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b',
392 0x83,
393 b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q',
394 b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c',
395 0x84,
396 b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r',
397 b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd',
398 0x85,
399 b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's',
400 b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e',
401 0x86,
402 b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't',
403 b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f',
404 0x87,
405 b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u',
406 b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', b'g',
407 0x88,
408 b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v',
409 b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h',
410 0x89,
411 b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
412 b'x', b'y', b'z', b'a', b'b', b'c', b'd',
413 0x8A,
414 ];
415 assert_bytes_eq!(275, trie.as_bytes(), expected_bytes);
416 }
417
418 #[test]
test_at_wide()419 fn test_at_wide() {
420 let litemap: LiteMap<&[u8], usize> = [
421 (&b"abcdefghijklmnopqrstuvwxyz"[..], 1),
422 (b"bcdefghijklmnopqrstuvwxyza", 2),
423 (b"cdefghijklmnopqrstuvwxyzab", 3),
424 (b"defghijklmnopqrstuvwxyzabc", 4),
425 (b"efghijklmnopqrstuvwxyzabcd", 5),
426 (b"fghijklmnopqrstuvwxyzabcde", 6),
427 (b"ghijklmnopqrstuvwxyzabcdef", 7),
428 (b"hijklmnopqrstuvwxyzabcdefg", 8),
429 (b"ijklmnopqrstuvwxyzabcdefgh", 9),
430 (b"jklmnopqrstuvwxyzabcde", 10),
431 ]
432 .into_iter()
433 .collect();
434 let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
435 assert_eq!(trie.get(b""), None);
436 assert_eq!(trie.get(b"abc"), None);
437 check_simple_ascii_trie(&litemap, &trie);
438 #[rustfmt::skip]
439 let expected_bytes = &[
440 0b11100001, // branch lead
441 0x6A, // branch trail
442 // search array:
443 b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j',
444 // offset array (wide):
445 0, 0, 0, 0, 0, 0, 0, 0, 0,
446 26, 52, 78, 104, 130, 156, 182, 208, 234,
447 // offset data:
448 b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n',
449 b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z',
450 0x81,
451 b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
452 b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a',
453 0x82,
454 b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p',
455 b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b',
456 0x83,
457 b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q',
458 b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c',
459 0x84,
460 b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r',
461 b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd',
462 0x85,
463 b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's',
464 b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e',
465 0x86,
466 b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't',
467 b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f',
468 0x87,
469 b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u',
470 b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', b'g',
471 0x88,
472 b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v',
473 b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h',
474 0x89,
475 b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
476 b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e',
477 0x8A,
478 ];
479 assert_bytes_eq!(286, trie.as_bytes(), expected_bytes);
480 }
481
482 #[test]
test_at_wide_plus()483 fn test_at_wide_plus() {
484 let litemap: LiteMap<&[u8], usize> = [
485 (&b"abcdefghijklmnopqrstuvwxyz"[..], 1),
486 (b"bcdefghijklmnopqrstuvwxyza", 2),
487 (b"cdefghijklmnopqrstuvwxyzab", 3),
488 (b"defghijklmnopqrstuvwxyzabc", 4),
489 (b"efghijklmnopqrstuvwxyzabcd", 5),
490 (b"fghijklmnopqrstuvwxyzabcde", 6),
491 (b"ghijklmnopqrstuvwxyzabcdef", 7),
492 (b"hijklmnopqrstuvwxyzabcdefg", 8),
493 (b"ijklmnopqrstuvwxyzabcdefgh", 9),
494 (b"jklmnopqrstuvwxyzabcdef", 10),
495 ]
496 .into_iter()
497 .collect();
498 let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
499 assert_eq!(trie.get(b""), None);
500 assert_eq!(trie.get(b"abc"), None);
501 check_simple_ascii_trie(&litemap, &trie);
502 #[rustfmt::skip]
503 let expected_bytes = &[
504 0b11100001, // branch lead
505 0x6A, // branch trail
506 // search array:
507 b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j',
508 // offset array (wide):
509 0, 0, 0, 0, 0, 0, 0, 0, 0,
510 26, 52, 78, 104, 130, 156, 182, 208, 234,
511 // offset data:
512 b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n',
513 b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z',
514 0x81,
515 b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
516 b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a',
517 0x82,
518 b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p',
519 b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b',
520 0x83,
521 b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q',
522 b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c',
523 0x84,
524 b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r',
525 b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd',
526 0x85,
527 b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's',
528 b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e',
529 0x86,
530 b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't',
531 b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f',
532 0x87,
533 b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u',
534 b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', b'g',
535 0x88,
536 b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v',
537 b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h',
538 0x89,
539 b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
540 b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f',
541 0x8A,
542 ];
543 assert_bytes_eq!(287, trie.as_bytes(), expected_bytes);
544 }
545
546 #[test]
test_everything()547 fn test_everything() {
548 let litemap: LiteMap<&[u8], usize> = [
549 (&b""[..], 0),
550 (b"axb", 100),
551 (b"ayc", 2),
552 (b"azd", 3),
553 (b"bxe", 4),
554 (b"bxefg", 500),
555 (b"bxefh", 6),
556 (b"bxei", 7),
557 (b"bxeikl", 8),
558 ]
559 .into_iter()
560 .collect();
561 let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
562 assert_eq!(trie.get(b""), Some(0));
563 assert_eq!(trie.get(b"a"), None);
564 assert_eq!(trie.get(b"ax"), None);
565 assert_eq!(trie.get(b"ay"), None);
566 check_simple_ascii_trie(&litemap, &trie);
567 let expected_bytes = &[
568 0b10000000, // value 0
569 0b11000010, // branch of 2
570 b'a', //
571 b'b', //
572 13, //
573 0b11000011, // branch of 3
574 b'x', //
575 b'y', //
576 b'z', //
577 3, //
578 5, //
579 b'b', //
580 0b10010000, // value 100 (lead)
581 0x54, // value 100 (trail)
582 b'c', //
583 0b10000010, // value 2
584 b'd', //
585 0b10000011, // value 3
586 b'x', //
587 b'e', //
588 0b10000100, // value 4
589 0b11000010, // branch of 2
590 b'f', //
591 b'i', //
592 7, //
593 0b11000010, // branch of 2
594 b'g', //
595 b'h', //
596 2, //
597 0b10010011, // value 500 (lead)
598 0x64, // value 500 (trail)
599 0b10000110, // value 6
600 0b10000111, // value 7
601 b'k', //
602 b'l', //
603 0b10001000, // value 8
604 ];
605 assert_bytes_eq!(36, trie.as_bytes(), expected_bytes);
606
607 #[rustfmt::skip]
608 let expected_bytes = &[
609 0b10000000, // value 0
610 0b11000010, // branch of 2
611 b'a', //
612 b'b', //
613 13, //
614 0b11000011, // start of 'a' subtree: branch of 3
615 b'x', //
616 b'y', //
617 b'z', //
618 3, //
619 5, //
620 b'b', //
621 0b10010000, // value 100 (lead)
622 0x54, // value 100 (trail)
623 b'c', //
624 0b10000010, // value 2
625 b'd', //
626 0b10000011, // value 3
627 b'x', // start of 'b' subtree
628 b'e', //
629 0b10000100, // value 4
630 0b11000010, // branch of 2
631 b'f', //
632 b'i', //
633 7, //
634 0b11000010, // branch of 2
635 b'g', //
636 b'h', //
637 2, //
638 0b10010011, // value 500 (lead)
639 0x64, // value 500 (trail)
640 0b10000110, // value 6
641 0b10000111, // value 7
642 b'k', //
643 b'l', //
644 0b10001000, // value 8
645 ];
646 let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>();
647 let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap();
648 assert_bytes_eq!(36, trie_phf.as_bytes(), expected_bytes);
649 check_phf_ascii_trie(&litemap, &trie_phf);
650
651 let zhm: zerovec::ZeroMap<[u8], u32> = litemap.iter().map(|(a, b)| (*a, *b as u32)).collect();
652 let zhm_buf = postcard::to_allocvec(&zhm).unwrap();
653 assert_eq!(zhm_buf.len(), 88);
654
655 let zhm: zerovec::ZeroMap<[u8], u8> = litemap.iter().map(|(a, b)| (*a, *b as u8)).collect();
656 let zhm_buf = postcard::to_allocvec(&zhm).unwrap();
657 assert_eq!(zhm_buf.len(), 61);
658
659 let zhm: zerovec::ZeroHashMap<[u8], u32> =
660 litemap.iter().map(|(a, b)| (*a, *b as u32)).collect();
661 let zhm_buf = postcard::to_allocvec(&zhm).unwrap();
662 assert_eq!(zhm_buf.len(), 161);
663
664 let zhm: zerovec::ZeroHashMap<[u8], u8> = litemap.iter().map(|(a, b)| (*a, *b as u8)).collect();
665 let zhm_buf = postcard::to_allocvec(&zhm).unwrap();
666 assert_eq!(zhm_buf.len(), 134);
667 }
668
669 macro_rules! utf8_byte {
670 ($ch:expr, $i:literal) => {{
671 let mut utf8_encoder_buf = [0u8; 4];
672 $ch.encode_utf8(&mut utf8_encoder_buf);
673 utf8_encoder_buf[$i]
674 }};
675 }
676
677 #[test]
test_non_ascii()678 fn test_non_ascii() {
679 let litemap: LiteMap<&[u8], usize> = [
680 ("".as_bytes(), 0),
681 ("axb".as_bytes(), 100),
682 ("ayc".as_bytes(), 2),
683 ("azd".as_bytes(), 3),
684 ("bxe".as_bytes(), 4),
685 ("bxefg".as_bytes(), 500),
686 ("bxefh".as_bytes(), 6),
687 ("bxei".as_bytes(), 7),
688 ("bxeikl".as_bytes(), 8),
689 ("bxeiklmΚαλημέρααα".as_bytes(), 9),
690 ("bxeiklmαnλo".as_bytes(), 10),
691 ("bxeiklmη".as_bytes(), 11),
692 ]
693 .into_iter()
694 .collect();
695
696 #[rustfmt::skip]
697 let expected_bytes = &[
698 0b10000000, // value 0
699 0b11000010, // branch of 2
700 b'a', //
701 b'b', //
702 13, //
703 0b11000011, // start of 'a' subtree: branch of 3
704 b'x', //
705 b'y', //
706 b'z', //
707 3, //
708 5, //
709 b'b', //
710 0b10010000, // value 100 (lead)
711 0x54, // value 100 (trail)
712 b'c', //
713 0b10000010, // value 2
714 b'd', //
715 0b10000011, // value 3
716 b'x', // start of 'b' subtree
717 b'e', //
718 0b10000100, // value 4
719 0b11000010, // branch of 2
720 b'f', //
721 b'i', //
722 7, //
723 0b11000010, // branch of 2
724 b'g', //
725 b'h', //
726 2, //
727 0b10010011, // value 500 (lead)
728 0x64, // value 500 (trail)
729 0b10000110, // value 6
730 0b10000111, // value 7
731 b'k', //
732 b'l', //
733 0b10001000, // value 8
734 b'm', //
735 0b10100001, // span of length 1
736 utf8_byte!('Κ', 0), // NOTE: all three letters have the same lead byte
737 0b11000011, // branch of 3
738 utf8_byte!('Κ', 1),
739 utf8_byte!('α', 1),
740 utf8_byte!('η', 1),
741 21,
742 27,
743 0b10110000, // span of length 18 (lead)
744 0b00000010, // span of length 18 (trail)
745 utf8_byte!('α', 0),
746 utf8_byte!('α', 1),
747 utf8_byte!('λ', 0),
748 utf8_byte!('λ', 1),
749 utf8_byte!('η', 0),
750 utf8_byte!('η', 1),
751 utf8_byte!('μ', 0),
752 utf8_byte!('μ', 1),
753 utf8_byte!('έ', 0),
754 utf8_byte!('έ', 1),
755 utf8_byte!('ρ', 0),
756 utf8_byte!('ρ', 1),
757 utf8_byte!('α', 0),
758 utf8_byte!('α', 1),
759 utf8_byte!('α', 0),
760 utf8_byte!('α', 1),
761 utf8_byte!('α', 0),
762 utf8_byte!('α', 1),
763 0b10001001, // value 9
764 b'n',
765 0b10100010, // span of length 2
766 utf8_byte!('λ', 0),
767 utf8_byte!('λ', 1),
768 b'o',
769 0b10001010, // value 10
770 0b10001011, // value 11
771 ];
772 let trie_phf = ZeroTriePerfectHash::try_from(&litemap).unwrap();
773 assert_bytes_eq!(73, trie_phf.as_bytes(), expected_bytes);
774 check_phf_bytes_trie(&litemap, &trie_phf);
775 }
776
777 #[test]
test_max_branch()778 fn test_max_branch() {
779 // Evaluate a branch with all 256 possible children
780 let mut litemap: LiteMap<&[u8], usize> = LiteMap::new_vec();
781 let all_bytes: Vec<u8> = (u8::MIN..=u8::MAX).collect();
782 assert_eq!(all_bytes.len(), 256);
783 let all_bytes_prefixed: Vec<[u8; 2]> = (u8::MIN..=u8::MAX).map(|x| [b'\0', x]).collect();
784 for b in all_bytes.iter() {
785 litemap.insert(core::slice::from_ref(b), *b as usize);
786 }
787 for s in all_bytes_prefixed.iter() {
788 litemap.insert(s, s[1] as usize);
789 }
790 let trie_phf = ZeroTriePerfectHash::try_from(&litemap).unwrap();
791 assert_eq!(trie_phf.byte_len(), 3042);
792 check_phf_bytes_trie(&litemap, &trie_phf);
793 }
794
795 #[test]
test_short_subtags_10pct()796 fn test_short_subtags_10pct() {
797 let litemap = strings_to_litemap(testdata::short_subtags_10pct::STRINGS);
798
799 let trie = ZeroTrieSimpleAscii::try_from(&litemap).unwrap();
800 assert_eq!(trie.byte_len(), 1050);
801 check_simple_ascii_trie(&litemap, &trie);
802
803 let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>();
804 let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap();
805 assert_eq!(trie_phf.byte_len(), 1100);
806 check_phf_ascii_trie(&litemap, &trie_phf);
807
808 let zhm: zerovec::ZeroMap<[u8], u32> = litemap.iter().map(|(a, b)| (*a, *b as u32)).collect();
809 let zhm_buf = postcard::to_allocvec(&zhm).unwrap();
810 assert_eq!(zhm_buf.len(), 1890);
811
812 let zhm: zerovec::ZeroMap<[u8], u8> = litemap.iter().map(|(a, b)| (*a, *b as u8)).collect();
813 let zhm_buf = postcard::to_allocvec(&zhm).unwrap();
814 assert_eq!(zhm_buf.len(), 1326);
815
816 let zhm: zerovec::ZeroHashMap<[u8], u32> =
817 litemap.iter().map(|(a, b)| (*a, *b as u32)).collect();
818 let zhm_buf = postcard::to_allocvec(&zhm).unwrap();
819 assert_eq!(zhm_buf.len(), 3396);
820
821 let zhm: zerovec::ZeroHashMap<[u8], u8> = litemap.iter().map(|(a, b)| (*a, *b as u8)).collect();
822 let zhm_buf = postcard::to_allocvec(&zhm).unwrap();
823 assert_eq!(zhm_buf.len(), 2832);
824 }
825
826 #[test]
test_short_subtags()827 fn test_short_subtags() {
828 let litemap = strings_to_litemap(testdata::short_subtags::STRINGS);
829
830 let trie = ZeroTrieSimpleAscii::try_from(&litemap).unwrap();
831 assert_eq!(trie.byte_len(), 8793);
832 check_simple_ascii_trie(&litemap, &trie);
833
834 let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>();
835 let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap();
836 assert_eq!(trie_phf.byte_len(), 9400);
837 check_phf_ascii_trie(&litemap, &trie_phf);
838
839 let zm: zerovec::ZeroMap<[u8], u32> = litemap.iter().map(|(a, b)| (*a, *b as u32)).collect();
840 let zhm_buf = postcard::to_allocvec(&zm).unwrap();
841 assert_eq!(zhm_buf.len(), 18931);
842
843 let zm: zerovec::ZeroMap<[u8], u8> = litemap.iter().map(|(a, b)| (*a, *b as u8)).collect();
844 let zhm_buf = postcard::to_allocvec(&zm).unwrap();
845 assert_eq!(zhm_buf.len(), 13300);
846
847 let zhm: zerovec::ZeroHashMap<[u8], u32> =
848 litemap.iter().map(|(a, b)| (*a, *b as u32)).collect();
849 let zhm_buf = postcard::to_allocvec(&zhm).unwrap();
850 assert_eq!(zhm_buf.len(), 33949);
851
852 let zhm: zerovec::ZeroHashMap<[u8], u8> = litemap.iter().map(|(a, b)| (*a, *b as u8)).collect();
853 let zhm_buf = postcard::to_allocvec(&zhm).unwrap();
854 assert_eq!(zhm_buf.len(), 28318);
855 }
856