• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4 
5 //! Varint spec for ZeroTrie:
6 //!
7 //! - Lead byte: top M (2 or 3) bits are metadata; next is varint extender; rest is value
8 //! - Trail bytes: top bit is varint extender; rest are low bits of value
9 //! - Guaranteed uniqueness of varint by adding "latent value" for each extender byte
10 //! - No maximum, but high bits will be dropped if they don't fit in the platform's `usize`
11 //!
12 //! This is best shown by examples.
13 //!
14 //! ```txt
15 //! xxx0'1010 = 10
16 //! xxx0'1111 = 15 (largest single-byte value with M=3)
17 //! xxx1'0000 0000'0000 must be 16 (smallest two-byte value with M=3)
18 //! xxx1'0000 0000'0001 = 17
19 //! xxx1'1111 0111'1111 = 2063 (largest two-byte value with M=3)
20 //! xxx1'0000 1000'0000 0000'0000 must be 2064 (smallest three-byte value with M=3)
21 //! xxx1'0000 1000'0000 0000'0001 = 2065
22 //! ```
23 //!
24 //! The latent values by number of bytes for M=3 are:
25 //!
26 //! - 1 byte: 0
27 //! - 2 bytes: 16 = 0x10 = 0b10000
28 //! - 3 bytes: 2064 = 0x810 = 0b100000010000
29 //! - 4 bytes: 264208 = 0x40810 = 0b1000000100000010000
30 //! - 5 bytes: 33818640 = 0x2040810 = 0b10000001000000100000010000
31 //! - …
32 //!
33 //! For M=2, the latent values are:
34 //!
35 //! - 1 byte: 0
36 //! - 2 bytes: 32 = 0x20 = 0b100000
37 //! - 3 bytes: 4128 = 0x1020 = 0b1000000100000
38 //! - 4 bytes: 524320 = 0x81020 = 0b10000001000000100000
39 //! - 5 bytes: 67637280 = 0x4081020 = 0b100000010000001000000100000
40 //! - …
41 
42 use crate::builder::konst::ConstArrayBuilder;
43 
44 #[cfg(feature = "alloc")]
45 use crate::builder::nonconst::TrieBuilderStore;
46 
47 /// Reads a varint with 2 bits of metadata in the lead byte.
48 ///
49 /// Returns the varint value and a subslice of `remainder` with the varint bytes removed.
50 ///
51 /// If the varint spills off the end of the slice, a debug assertion will fail,
52 /// and the function will return the value up to that point.
read_varint_meta2(start: u8, remainder: &[u8]) -> (usize, &[u8])53 pub const fn read_varint_meta2(start: u8, remainder: &[u8]) -> (usize, &[u8]) {
54     let mut value = (start & 0b00011111) as usize;
55     let mut remainder = remainder;
56     if (start & 0b00100000) != 0 {
57         loop {
58             let next;
59             (next, remainder) = debug_unwrap!(remainder.split_first(), break, "invalid varint");
60             // Note: value << 7 could drop high bits. The first addition can't overflow.
61             // The second addition could overflow; in such a case we just inform the
62             // developer via the debug assertion.
63             value = (value << 7) + ((*next & 0b01111111) as usize) + 32;
64             if (*next & 0b10000000) == 0 {
65                 break;
66             }
67         }
68     }
69     (value, remainder)
70 }
71 
72 /// Reads a varint with 3 bits of metadata in the lead byte.
73 ///
74 /// Returns the varint value and a subslice of `remainder` with the varint bytes removed.
75 ///
76 /// If the varint spills off the end of the slice, a debug assertion will fail,
77 /// and the function will return the value up to that point.
read_varint_meta3(start: u8, remainder: &[u8]) -> (usize, &[u8])78 pub const fn read_varint_meta3(start: u8, remainder: &[u8]) -> (usize, &[u8]) {
79     let mut value = (start & 0b00001111) as usize;
80     let mut remainder = remainder;
81     if (start & 0b00010000) != 0 {
82         loop {
83             let next;
84             (next, remainder) = debug_unwrap!(remainder.split_first(), break, "invalid varint");
85             // Note: value << 7 could drop high bits. The first addition can't overflow.
86             // The second addition could overflow; in such a case we just inform the
87             // developer via the debug assertion.
88             value = (value << 7) + ((*next & 0b01111111) as usize) + 16;
89             if (*next & 0b10000000) == 0 {
90                 break;
91             }
92         }
93     }
94     (value, remainder)
95 }
96 
97 /// Reads and removes a varint with 3 bits of metadata from a [`TrieBuilderStore`].
98 ///
99 /// Returns the varint value.
100 #[cfg(feature = "alloc")]
try_read_varint_meta3_from_tstore<S: TrieBuilderStore>( start: u8, remainder: &mut S, ) -> Option<usize>101 pub(crate) fn try_read_varint_meta3_from_tstore<S: TrieBuilderStore>(
102     start: u8,
103     remainder: &mut S,
104 ) -> Option<usize> {
105     let mut value = (start & 0b00001111) as usize;
106     if (start & 0b00010000) != 0 {
107         loop {
108             let next = remainder.atbs_pop_front()?;
109             // Note: value << 7 could drop high bits. The first addition can't overflow.
110             // The second addition could overflow; in such a case we just inform the
111             // developer via the debug assertion.
112             value = (value << 7) + ((next & 0b01111111) as usize) + 16;
113             if (next & 0b10000000) == 0 {
114                 break;
115             }
116         }
117     }
118     Some(value)
119 }
120 
121 #[cfg(test)]
122 const MAX_VARINT: usize = usize::MAX;
123 
124 // *Upper Bound:* Each trail byte stores 7 bits of data, plus the latent value.
125 // Add an extra 1 since the lead byte holds only 5 bits of data.
126 const MAX_VARINT_LENGTH: usize = 1 + core::mem::size_of::<usize>() * 8 / 7;
127 
128 /// Returns a new [`ConstArrayBuilder`] containing a varint with 2 bits of metadata.
write_varint_meta2(value: usize) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8>129 pub(crate) const fn write_varint_meta2(value: usize) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> {
130     let mut result = [0; MAX_VARINT_LENGTH];
131     let mut i = MAX_VARINT_LENGTH - 1;
132     let mut value = value;
133     let mut last = true;
134     loop {
135         if value < 32 {
136             result[i] = value as u8;
137             if !last {
138                 result[i] |= 0b00100000;
139             }
140             break;
141         }
142         value -= 32;
143         result[i] = (value as u8) & 0b01111111;
144         if !last {
145             result[i] |= 0b10000000;
146         } else {
147             last = false;
148         }
149         value >>= 7;
150         i -= 1;
151     }
152     // The bytes are from i to the end.
153     ConstArrayBuilder::from_manual_slice(result, i, MAX_VARINT_LENGTH)
154 }
155 
156 /// Returns a new [`ConstArrayBuilder`] containing a varint with 3 bits of metadata.
write_varint_meta3(value: usize) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8>157 pub(crate) const fn write_varint_meta3(value: usize) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> {
158     let mut result = [0; MAX_VARINT_LENGTH];
159     let mut i = MAX_VARINT_LENGTH - 1;
160     let mut value = value;
161     let mut last = true;
162     loop {
163         if value < 16 {
164             result[i] = value as u8;
165             if !last {
166                 result[i] |= 0b00010000;
167             }
168             break;
169         }
170         value -= 16;
171         result[i] = (value as u8) & 0b01111111;
172         if !last {
173             result[i] |= 0b10000000;
174         } else {
175             last = false;
176         }
177         value >>= 7;
178         i -= 1;
179     }
180     // The bytes are from i to the end.
181     ConstArrayBuilder::from_manual_slice(result, i, MAX_VARINT_LENGTH)
182 }
183 
184 /// A secondary implementation that separates the latent value while computing the varint.
185 #[cfg(test)]
write_varint_reference( value: usize, ) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8>186 pub(crate) const fn write_varint_reference(
187     value: usize,
188 ) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> {
189     let mut result = [0; MAX_VARINT_LENGTH];
190     if value < 32 {
191         result[0] = value as u8;
192         return ConstArrayBuilder::from_manual_slice(result, 0, 1);
193     }
194     result[0] = 32;
195     let mut latent = 32;
196     let mut steps = 2;
197     loop {
198         let next_latent = (latent << 7) + 32;
199         if value < next_latent || next_latent == latent {
200             break;
201         }
202         latent = next_latent;
203         steps += 1;
204     }
205     let mut value = value - latent;
206     let mut i = steps;
207     while i > 0 {
208         i -= 1;
209         result[i] |= (value as u8) & 0b01111111;
210         value >>= 7;
211         if i > 0 && i < steps - 1 {
212             result[i] |= 0b10000000;
213         }
214     }
215     // The bytes are from 0 to `steps`.
216     ConstArrayBuilder::from_manual_slice(result, 0, steps)
217 }
218 
219 #[cfg(test)]
220 mod tests {
221     use super::*;
222 
223     #[derive(Debug)]
224     struct TestCase<'a> {
225         bytes: &'a [u8],
226         remainder: &'a [u8],
227         value: usize,
228     }
229     static CASES: &[TestCase] = &[
230         TestCase {
231             bytes: &[0b00000000],
232             remainder: &[],
233             value: 0,
234         },
235         TestCase {
236             bytes: &[0b00001010],
237             remainder: &[],
238             value: 10,
239         },
240         TestCase {
241             bytes: &[0b00011111],
242             remainder: &[],
243             value: 31,
244         },
245         TestCase {
246             bytes: &[0b00011111, 0b10101010],
247             remainder: &[0b10101010],
248             value: 31,
249         },
250         TestCase {
251             bytes: &[0b00100000, 0b00000000],
252             remainder: &[],
253             value: 32,
254         },
255         TestCase {
256             bytes: &[0b00100000, 0b00000001],
257             remainder: &[],
258             value: 33,
259         },
260         TestCase {
261             bytes: &[0b00100000, 0b00100000],
262             remainder: &[],
263             value: 64,
264         },
265         TestCase {
266             bytes: &[0x20, 0x44],
267             remainder: &[],
268             value: 100,
269         },
270         TestCase {
271             bytes: &[0b00100000, 0b01111111],
272             remainder: &[],
273             value: 159,
274         },
275         TestCase {
276             bytes: &[0b00100001, 0b00000000],
277             remainder: &[],
278             value: 160,
279         },
280         TestCase {
281             bytes: &[0b00100001, 0b00000001],
282             remainder: &[],
283             value: 161,
284         },
285         TestCase {
286             bytes: &[0x23, 0x54],
287             remainder: &[],
288             value: 500,
289         },
290         TestCase {
291             bytes: &[0b00111111, 0b01111111],
292             remainder: &[],
293             value: 4127, // 32 + (1 << 12) - 1
294         },
295         TestCase {
296             bytes: &[0b00100000, 0b10000000, 0b00000000],
297             remainder: &[],
298             value: 4128, // 32 + (1 << 12)
299         },
300         TestCase {
301             bytes: &[0b00100000, 0b10000000, 0b00000001],
302             remainder: &[],
303             value: 4129, // 32 + (1 << 12) + 1
304         },
305         TestCase {
306             bytes: &[0b00100000, 0b10000000, 0b01111111],
307             remainder: &[],
308             value: 4255, // 32 + (1 << 12) + 127
309         },
310         TestCase {
311             bytes: &[0b00100000, 0b10000001, 0b00000000],
312             remainder: &[],
313             value: 4256, // 32 + (1 << 12) + 128
314         },
315         TestCase {
316             bytes: &[0b00100000, 0b10000001, 0b00000001],
317             remainder: &[],
318             value: 4257, // 32 + (1 << 12) + 129
319         },
320         TestCase {
321             bytes: &[0x20, 0x86, 0x68],
322             remainder: &[],
323             value: 5000,
324         },
325         TestCase {
326             bytes: &[0b00100000, 0b11111111, 0b01111111],
327             remainder: &[],
328             value: 20511, // 32 + (1 << 12) + (1 << 14) - 1
329         },
330         TestCase {
331             bytes: &[0b00100001, 0b10000000, 0b00000000],
332             remainder: &[],
333             value: 20512, // 32 + (1 << 12) + (1 << 14)
334         },
335         TestCase {
336             bytes: &[0b00111111, 0b11111111, 0b01111111],
337             remainder: &[],
338             value: 528415, // 32 + (1 << 12) + (1 << 19) - 1
339         },
340         TestCase {
341             bytes: &[0b00100000, 0b10000000, 0b10000000, 0b00000000],
342             remainder: &[],
343             value: 528416, // 32 + (1 << 12) + (1 << 19)
344         },
345         TestCase {
346             bytes: &[0b00100000, 0b10000000, 0b10000000, 0b00000001],
347             remainder: &[],
348             value: 528417, // 32 + (1 << 12) + (1 << 19) + 1
349         },
350         TestCase {
351             bytes: &[0b00111111, 0b11111111, 0b11111111, 0b01111111],
352             remainder: &[],
353             value: 67637279, // 32 + (1 << 12) + (1 << 19) + (1 << 26) - 1
354         },
355         TestCase {
356             bytes: &[0b00100000, 0b10000000, 0b10000000, 0b10000000, 0b00000000],
357             remainder: &[],
358             value: 67637280, // 32 + (1 << 12) + (1 << 19) + (1 << 26)
359         },
360     ];
361 
362     #[test]
test_read()363     fn test_read() {
364         for cas in CASES {
365             let recovered = read_varint_meta2(cas.bytes[0], &cas.bytes[1..]);
366             assert_eq!(recovered, (cas.value, cas.remainder), "{:?}", cas);
367         }
368     }
369 
370     #[test]
test_read_write()371     fn test_read_write() {
372         for cas in CASES {
373             let reference_bytes = write_varint_reference(cas.value);
374             assert_eq!(
375                 reference_bytes.len(),
376                 cas.bytes.len() - cas.remainder.len(),
377                 "{:?}",
378                 cas
379             );
380             assert_eq!(
381                 reference_bytes.as_slice(),
382                 &cas.bytes[0..reference_bytes.len()],
383                 "{:?}",
384                 cas
385             );
386             let recovered = read_varint_meta2(cas.bytes[0], &cas.bytes[1..]);
387             assert_eq!(recovered, (cas.value, cas.remainder), "{:?}", cas);
388             let write_bytes = write_varint_meta2(cas.value);
389             assert_eq!(
390                 reference_bytes.as_slice(),
391                 write_bytes.as_slice(),
392                 "{:?}",
393                 cas
394             );
395         }
396     }
397 
398     #[test]
test_roundtrip()399     fn test_roundtrip() {
400         let mut i = 0usize;
401         while i < MAX_VARINT {
402             let bytes = write_varint_meta2(i);
403             let recovered = read_varint_meta2(bytes.as_slice()[0], &bytes.as_slice()[1..]);
404             assert_eq!(i, recovered.0, "{:?}", bytes.as_slice());
405             i <<= 1;
406             i += 1;
407         }
408     }
409 
410     #[test]
test_extended_roundtrip()411     fn test_extended_roundtrip() {
412         let mut i = 0usize;
413         while i < MAX_VARINT {
414             let bytes = write_varint_meta3(i);
415             let recovered = read_varint_meta3(bytes.as_slice()[0], &bytes.as_slice()[1..]);
416             assert_eq!(i, recovered.0, "{:?}", bytes.as_slice());
417             i <<= 1;
418             i += 1;
419         }
420     }
421 
422     #[test]
test_max()423     fn test_max() {
424         let reference_bytes = write_varint_reference(MAX_VARINT);
425         let write_bytes = write_varint_meta2(MAX_VARINT);
426         assert_eq!(reference_bytes.len(), MAX_VARINT_LENGTH);
427         assert_eq!(reference_bytes.as_slice(), write_bytes.as_slice());
428         let subarray = write_bytes
429             .as_const_slice()
430             .get_subslice_or_panic(1, write_bytes.len());
431         let (recovered_value, remainder) = read_varint_meta2(
432             *write_bytes.as_const_slice().first().unwrap(),
433             subarray.as_slice(),
434         );
435         assert!(remainder.is_empty());
436         assert_eq!(recovered_value, MAX_VARINT);
437         assert_eq!(
438             write_bytes.as_slice(),
439             &[
440                 0b00100001, //
441                 0b11011111, //
442                 0b11011111, //
443                 0b11011111, //
444                 0b11011111, //
445                 0b11011111, //
446                 0b11011111, //
447                 0b11011111, //
448                 0b11011111, //
449                 0b01011111, //
450             ]
451         );
452     }
453 
454     #[test]
text_extended_max()455     fn text_extended_max() {
456         let write_bytes = write_varint_meta3(MAX_VARINT);
457         assert_eq!(write_bytes.len(), MAX_VARINT_LENGTH);
458         let (lead, trailing) = write_bytes.as_slice().split_first().unwrap();
459         let (recovered_value, remainder) = read_varint_meta3(*lead, trailing);
460         assert!(remainder.is_empty());
461         assert_eq!(recovered_value, MAX_VARINT);
462         assert_eq!(
463             write_bytes.as_slice(),
464             &[
465                 0b00010001, //
466                 0b11101111, //
467                 0b11101111, //
468                 0b11101111, //
469                 0b11101111, //
470                 0b11101111, //
471                 0b11101111, //
472                 0b11101111, //
473                 0b11101111, //
474                 0b01101111, //
475             ]
476         );
477     }
478 
479     #[test]
test_latent_values()480     fn test_latent_values() {
481         // Same values documented in the module docs: M=2
482         let m2 = read_varint_meta2;
483         assert_eq!(m2(0, &[]).0, 0);
484         assert_eq!(m2(0x20, &[0x00]).0, 32);
485         assert_eq!(m2(0x20, &[0x80, 0x00]).0, 4128);
486         assert_eq!(m2(0x20, &[0x80, 0x80, 0x00]).0, 528416);
487         assert_eq!(m2(0x20, &[0x80, 0x80, 0x80, 0x00]).0, 67637280);
488 
489         // Same values documented in the module docs: M=3
490         let m3 = read_varint_meta3;
491         assert_eq!(m3(0, &[]).0, 0);
492         assert_eq!(m3(0x10, &[0x00]).0, 16);
493         assert_eq!(m3(0x10, &[0x80, 0x00]).0, 2064);
494         assert_eq!(m3(0x10, &[0x80, 0x80, 0x00]).0, 264208);
495         assert_eq!(m3(0x10, &[0x80, 0x80, 0x80, 0x00]).0, 33818640);
496     }
497 }
498