• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //! The implementation of XXH32.
2 
3 use core::{
4     fmt,
5     hash::{self, BuildHasher},
6     mem,
7 };
8 
9 use crate::{IntoU32, IntoU64};
10 
11 // Keeping these constants in this form to match the C code.
12 const PRIME32_1: u32 = 0x9E3779B1;
13 const PRIME32_2: u32 = 0x85EBCA77;
14 const PRIME32_3: u32 = 0xC2B2AE3D;
15 const PRIME32_4: u32 = 0x27D4EB2F;
16 const PRIME32_5: u32 = 0x165667B1;
17 
18 type Lane = u32;
19 type Lanes = [Lane; 4];
20 type Bytes = [u8; 16];
21 
22 const BYTES_IN_LANE: usize = mem::size_of::<Bytes>();
23 
24 #[derive(Clone, PartialEq)]
25 struct BufferData(Lanes);
26 
27 impl BufferData {
new() -> Self28     const fn new() -> Self {
29         Self([0; 4])
30     }
31 
bytes(&self) -> &Bytes32     const fn bytes(&self) -> &Bytes {
33         const _: () = assert!(mem::align_of::<u8>() <= mem::align_of::<Lane>());
34         // SAFETY[bytes]: The alignment of `u32` is at least that of
35         // `u8` and all the values are initialized.
36         unsafe { &*self.0.as_ptr().cast() }
37     }
38 
bytes_mut(&mut self) -> &mut Bytes39     fn bytes_mut(&mut self) -> &mut Bytes {
40         // SAFETY: See SAFETY[bytes]
41         unsafe { &mut *self.0.as_mut_ptr().cast() }
42     }
43 }
44 
45 impl fmt::Debug for BufferData {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result46     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
47         f.debug_list().entries(self.0.iter()).finish()
48     }
49 }
50 
51 #[derive(Debug, Clone, PartialEq)]
52 struct Buffer {
53     offset: usize,
54     data: BufferData,
55 }
56 
57 impl Buffer {
new() -> Self58     const fn new() -> Self {
59         Self {
60             offset: 0,
61             data: BufferData::new(),
62         }
63     }
64 
65     // RATIONALE: See RATIONALE[inline]
66     #[inline]
extend<'d>(&mut self, data: &'d [u8]) -> (Option<&Lanes>, &'d [u8])67     fn extend<'d>(&mut self, data: &'d [u8]) -> (Option<&Lanes>, &'d [u8]) {
68         // Most of the slice methods we use here have `_unchecked` variants, but
69         //
70         // 1. this method is called one time per `Hasher::write` call
71         // 2. this method early exits if we don't have anything in the buffer
72         //
73         // Because of this, removing the panics via `unsafe` doesn't
74         // have much benefit other than reducing code size by a tiny
75         // bit.
76 
77         if self.offset == 0 {
78             return (None, data);
79         };
80 
81         let bytes = self.data.bytes_mut();
82         debug_assert!(self.offset <= bytes.len());
83 
84         let empty = &mut bytes[self.offset..];
85         let n_to_copy = usize::min(empty.len(), data.len());
86 
87         let dst = &mut empty[..n_to_copy];
88 
89         let (src, rest) = data.split_at(n_to_copy);
90 
91         dst.copy_from_slice(src);
92         self.offset += n_to_copy;
93 
94         debug_assert!(self.offset <= bytes.len());
95 
96         if self.offset == bytes.len() {
97             self.offset = 0;
98             (Some(&self.data.0), rest)
99         } else {
100             (None, rest)
101         }
102     }
103 
104     // RATIONALE: See RATIONALE[inline]
105     #[inline]
set(&mut self, data: &[u8])106     fn set(&mut self, data: &[u8]) {
107         if data.is_empty() {
108             return;
109         }
110 
111         debug_assert_eq!(self.offset, 0);
112 
113         let n_to_copy = data.len();
114 
115         let bytes = self.data.bytes_mut();
116         debug_assert!(n_to_copy < bytes.len());
117 
118         bytes[..n_to_copy].copy_from_slice(data);
119         self.offset = data.len();
120     }
121 
122     // RATIONALE: See RATIONALE[inline]
123     #[inline]
remaining(&self) -> &[u8]124     fn remaining(&self) -> &[u8] {
125         &self.data.bytes()[..self.offset]
126     }
127 }
128 
129 #[derive(Clone, PartialEq)]
130 struct Accumulators(Lanes);
131 
132 impl Accumulators {
new(seed: u32) -> Self133     const fn new(seed: u32) -> Self {
134         Self([
135             seed.wrapping_add(PRIME32_1).wrapping_add(PRIME32_2),
136             seed.wrapping_add(PRIME32_2),
137             seed,
138             seed.wrapping_sub(PRIME32_1),
139         ])
140     }
141 
142     // RATIONALE: See RATIONALE[inline]
143     #[inline]
write(&mut self, lanes: Lanes)144     fn write(&mut self, lanes: Lanes) {
145         let [acc1, acc2, acc3, acc4] = &mut self.0;
146         let [lane1, lane2, lane3, lane4] = lanes;
147 
148         *acc1 = round(*acc1, lane1.to_le());
149         *acc2 = round(*acc2, lane2.to_le());
150         *acc3 = round(*acc3, lane3.to_le());
151         *acc4 = round(*acc4, lane4.to_le());
152     }
153 
154     // RATIONALE: See RATIONALE[inline]
155     #[inline]
write_many<'d>(&mut self, mut data: &'d [u8]) -> &'d [u8]156     fn write_many<'d>(&mut self, mut data: &'d [u8]) -> &'d [u8] {
157         while let Some((chunk, rest)) = data.split_first_chunk::<BYTES_IN_LANE>() {
158             // SAFETY: We have the right number of bytes and are
159             // handling the unaligned case.
160             let lanes = unsafe { chunk.as_ptr().cast::<Lanes>().read_unaligned() };
161             self.write(lanes);
162             data = rest;
163         }
164         data
165     }
166 
167     // RATIONALE: See RATIONALE[inline]
168     #[inline]
finish(&self) -> u32169     const fn finish(&self) -> u32 {
170         let [acc1, acc2, acc3, acc4] = self.0;
171 
172         let acc1 = acc1.rotate_left(1);
173         let acc2 = acc2.rotate_left(7);
174         let acc3 = acc3.rotate_left(12);
175         let acc4 = acc4.rotate_left(18);
176 
177         acc1.wrapping_add(acc2)
178             .wrapping_add(acc3)
179             .wrapping_add(acc4)
180     }
181 }
182 
183 impl fmt::Debug for Accumulators {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result184     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
185         let [acc1, acc2, acc3, acc4] = self.0;
186         f.debug_struct("Accumulators")
187             .field("acc1", &acc1)
188             .field("acc2", &acc2)
189             .field("acc3", &acc3)
190             .field("acc4", &acc4)
191             .finish()
192     }
193 }
194 
195 /// Calculates the 32-bit hash.
196 ///
197 /// ### Caution
198 ///
199 /// Although this struct implements [`hash::Hasher`][], it only calculates a
200 /// 32-bit number, leaving the upper bits as 0. This means it is
201 /// unlikely to be correct to use this in places like a [`HashMap`][std::collections::HashMap].
202 #[derive(Debug, Clone, PartialEq)]
203 pub struct Hasher {
204     seed: u32,
205     accumulators: Accumulators,
206     buffer: Buffer,
207     length: u64,
208 }
209 
210 impl Default for Hasher {
default() -> Self211     fn default() -> Self {
212         Self::with_seed(0)
213     }
214 }
215 
216 impl Hasher {
217     /// Hash all data at once. If you can use this function, you may
218     /// see noticable speed gains for certain types of input.
219     #[must_use]
220     // RATIONALE[inline]: Keeping parallel to the 64-bit
221     // implementation, even though the performance gains for the
222     // 32-bit version haven't been tested.
223     #[inline]
oneshot(seed: u32, data: &[u8]) -> u32224     pub fn oneshot(seed: u32, data: &[u8]) -> u32 {
225         let len = data.len();
226 
227         // Since we know that there's no more data coming, we don't
228         // need to construct the intermediate buffers or copy data to
229         // or from the buffers.
230 
231         let mut accumulators = Accumulators::new(seed);
232 
233         let data = accumulators.write_many(data);
234 
235         Self::finish_with(seed, len.into_u64(), &accumulators, data)
236     }
237 
238     /// Constructs the hasher with an initial seed.
239     #[must_use]
with_seed(seed: u32) -> Self240     pub const fn with_seed(seed: u32) -> Self {
241         // Step 1. Initialize internal accumulators
242         Self {
243             seed,
244             accumulators: Accumulators::new(seed),
245             buffer: Buffer::new(),
246             length: 0,
247         }
248     }
249 
250     /// The seed this hasher was created with.
seed(&self) -> u32251     pub const fn seed(&self) -> u32 {
252         self.seed
253     }
254 
255     /// The total number of bytes hashed.
total_len(&self) -> u64256     pub const fn total_len(&self) -> u64 {
257         self.length
258     }
259 
260     /// The total number of bytes hashed, truncated to 32 bits.
261     ///
262     /// For the full 64-bit byte count, use [`total_len`](Self::total_len).
total_len_32(&self) -> u32263     pub const fn total_len_32(&self) -> u32 {
264         self.length as u32
265     }
266 
267     /// Returns the hash value for the values written so far. Unlike
268     /// [`hash::Hasher::finish`][], this method returns the actual 32-bit
269     /// value calculated, not a 64-bit value.
270     #[must_use]
271     // RATIONALE: See RATIONALE[inline]
272     #[inline]
finish_32(&self) -> u32273     pub fn finish_32(&self) -> u32 {
274         Self::finish_with(
275             self.seed,
276             self.length,
277             &self.accumulators,
278             self.buffer.remaining(),
279         )
280     }
281 
282     #[must_use]
283     // RATIONALE: See RATIONALE[inline]
284     #[inline]
finish_with(seed: u32, len: u64, accumulators: &Accumulators, mut remaining: &[u8]) -> u32285     fn finish_with(seed: u32, len: u64, accumulators: &Accumulators, mut remaining: &[u8]) -> u32 {
286         // Step 3. Accumulator convergence
287         let mut acc = if len < BYTES_IN_LANE.into_u64() {
288             seed.wrapping_add(PRIME32_5)
289         } else {
290             accumulators.finish()
291         };
292 
293         // Step 4. Add input length
294         //
295         // "Note that, if input length is so large that it requires
296         // more than 32-bits, only the lower 32-bits are added to the
297         // accumulator."
298         acc += len as u32;
299 
300         // Step 5. Consume remaining input
301         while let Some((chunk, rest)) = remaining.split_first_chunk() {
302             let lane = u32::from_ne_bytes(*chunk).to_le();
303 
304             acc = acc.wrapping_add(lane.wrapping_mul(PRIME32_3));
305             acc = acc.rotate_left(17).wrapping_mul(PRIME32_4);
306 
307             remaining = rest;
308         }
309 
310         for &byte in remaining {
311             let lane = byte.into_u32();
312 
313             acc = acc.wrapping_add(lane.wrapping_mul(PRIME32_5));
314             acc = acc.rotate_left(11).wrapping_mul(PRIME32_1);
315         }
316 
317         // Step 6. Final mix (avalanche)
318         acc ^= acc >> 15;
319         acc = acc.wrapping_mul(PRIME32_2);
320         acc ^= acc >> 13;
321         acc = acc.wrapping_mul(PRIME32_3);
322         acc ^= acc >> 16;
323 
324         acc
325     }
326 }
327 
328 impl hash::Hasher for Hasher {
329     // RATIONALE: See RATIONALE[inline]
330     #[inline]
write(&mut self, data: &[u8])331     fn write(&mut self, data: &[u8]) {
332         let len = data.len();
333 
334         // Step 2. Process stripes
335         let (buffered_lanes, data) = self.buffer.extend(data);
336 
337         if let Some(&lanes) = buffered_lanes {
338             self.accumulators.write(lanes);
339         }
340 
341         let data = self.accumulators.write_many(data);
342 
343         self.buffer.set(data);
344 
345         self.length += len.into_u64();
346     }
347 
348     // RATIONALE: See RATIONALE[inline]
349     #[inline]
finish(&self) -> u64350     fn finish(&self) -> u64 {
351         Hasher::finish_32(self).into()
352     }
353 }
354 
355 // RATIONALE: See RATIONALE[inline]
356 #[inline]
round(mut acc: u32, lane: u32) -> u32357 const fn round(mut acc: u32, lane: u32) -> u32 {
358     acc = acc.wrapping_add(lane.wrapping_mul(PRIME32_2));
359     acc = acc.rotate_left(13);
360     acc.wrapping_mul(PRIME32_1)
361 }
362 
363 /// Constructs [`Hasher`][] for multiple hasher instances. See
364 /// the [usage warning][Hasher#caution].
365 #[derive(Clone)]
366 pub struct State(u32);
367 
368 impl State {
369     /// Constructs the hasher with an initial seed.
with_seed(seed: u32) -> Self370     pub fn with_seed(seed: u32) -> Self {
371         Self(seed)
372     }
373 }
374 
375 impl BuildHasher for State {
376     type Hasher = Hasher;
377 
build_hasher(&self) -> Self::Hasher378     fn build_hasher(&self) -> Self::Hasher {
379         Hasher::with_seed(self.0)
380     }
381 }
382 
383 #[cfg(test)]
384 mod test {
385     use core::{
386         array,
387         hash::{BuildHasherDefault, Hasher as _},
388     };
389     use std::collections::HashMap;
390 
391     use super::*;
392 
393     const _TRAITS: () = {
is_clone<T: Clone>()394         const fn is_clone<T: Clone>() {}
395         is_clone::<Hasher>();
396         is_clone::<State>();
397     };
398 
399     const EMPTY_BYTES: [u8; 0] = [];
400 
401     #[test]
ingesting_byte_by_byte_is_equivalent_to_large_chunks()402     fn ingesting_byte_by_byte_is_equivalent_to_large_chunks() {
403         let bytes = [0; 32];
404 
405         let mut byte_by_byte = Hasher::with_seed(0);
406         for byte in bytes.chunks(1) {
407             byte_by_byte.write(byte);
408         }
409         let byte_by_byte = byte_by_byte.finish();
410 
411         let mut one_chunk = Hasher::with_seed(0);
412         one_chunk.write(&bytes);
413         let one_chunk = one_chunk.finish();
414 
415         assert_eq!(byte_by_byte, one_chunk);
416     }
417 
418     #[test]
hash_of_nothing_matches_c_implementation()419     fn hash_of_nothing_matches_c_implementation() {
420         let mut hasher = Hasher::with_seed(0);
421         hasher.write(&EMPTY_BYTES);
422         assert_eq!(hasher.finish(), 0x02cc_5d05);
423     }
424 
425     #[test]
hash_of_single_byte_matches_c_implementation()426     fn hash_of_single_byte_matches_c_implementation() {
427         let mut hasher = Hasher::with_seed(0);
428         hasher.write(&[42]);
429         assert_eq!(hasher.finish(), 0xe0fe_705f);
430     }
431 
432     #[test]
hash_of_multiple_bytes_matches_c_implementation()433     fn hash_of_multiple_bytes_matches_c_implementation() {
434         let mut hasher = Hasher::with_seed(0);
435         hasher.write(b"Hello, world!\0");
436         assert_eq!(hasher.finish(), 0x9e5e_7e93);
437     }
438 
439     #[test]
hash_of_multiple_chunks_matches_c_implementation()440     fn hash_of_multiple_chunks_matches_c_implementation() {
441         let bytes: [u8; 100] = array::from_fn(|i| i as u8);
442         let mut hasher = Hasher::with_seed(0);
443         hasher.write(&bytes);
444         assert_eq!(hasher.finish(), 0x7f89_ba44);
445     }
446 
447     #[test]
hash_with_different_seed_matches_c_implementation()448     fn hash_with_different_seed_matches_c_implementation() {
449         let mut hasher = Hasher::with_seed(0x42c9_1977);
450         hasher.write(&EMPTY_BYTES);
451         assert_eq!(hasher.finish(), 0xd6bf_8459);
452     }
453 
454     #[test]
hash_with_different_seed_and_multiple_chunks_matches_c_implementation()455     fn hash_with_different_seed_and_multiple_chunks_matches_c_implementation() {
456         let bytes: [u8; 100] = array::from_fn(|i| i as u8);
457         let mut hasher = Hasher::with_seed(0x42c9_1977);
458         hasher.write(&bytes);
459         assert_eq!(hasher.finish(), 0x6d2f_6c17);
460     }
461 
462     #[test]
hashes_with_different_offsets_are_the_same()463     fn hashes_with_different_offsets_are_the_same() {
464         let bytes = [0x7c; 4096];
465         let expected = Hasher::oneshot(0, &[0x7c; 64]);
466 
467         let the_same = bytes
468             .windows(64)
469             .map(|w| {
470                 let mut hasher = Hasher::with_seed(0);
471                 hasher.write(w);
472                 hasher.finish_32()
473             })
474             .all(|h| h == expected);
475         assert!(the_same);
476     }
477 
478     // This test validates wraparound/truncation behavior for very
479     // large inputs of a 32-bit hash, but runs very slowly in the
480     // normal "cargo test" build config since it hashes 4.3GB of
481     // data. It runs reasonably quick under "cargo test --release".
482     #[ignore]
483     #[test]
length_overflows_32bit()484     fn length_overflows_32bit() {
485         // Hash 4.3 billion (4_300_000_000) bytes, which overflows a u32.
486         let bytes200: [u8; 200] = array::from_fn(|i| i as _);
487 
488         let mut hasher = Hasher::with_seed(0);
489         for _ in 0..(4_300_000_000 / bytes200.len()) {
490             hasher.write(&bytes200);
491         }
492 
493         assert_eq!(hasher.total_len(), 0x0000_0001_004c_cb00);
494         assert_eq!(hasher.total_len_32(), 0x004c_cb00);
495 
496         // compared against the C implementation
497         assert_eq!(hasher.finish(), 0x1522_4ca7);
498     }
499 
500     #[test]
can_be_used_in_a_hashmap_with_a_default_seed()501     fn can_be_used_in_a_hashmap_with_a_default_seed() {
502         let mut hash: HashMap<_, _, BuildHasherDefault<Hasher>> = Default::default();
503         hash.insert(42, "the answer");
504         assert_eq!(hash.get(&42), Some(&"the answer"));
505     }
506 }
507 
508 #[cfg(feature = "random")]
509 #[cfg_attr(docsrs, doc(cfg(feature = "random")))]
510 mod random_impl {
511     use super::*;
512 
513     /// Constructs a randomized seed and reuses it for multiple hasher
514     /// instances. See the [usage warning][Hasher#caution].
515     #[derive(Clone)]
516     pub struct RandomState(State);
517 
518     impl Default for RandomState {
default() -> Self519         fn default() -> Self {
520             Self::new()
521         }
522     }
523 
524     impl RandomState {
new() -> Self525         fn new() -> Self {
526             Self(State::with_seed(rand::random()))
527         }
528     }
529 
530     impl BuildHasher for RandomState {
531         type Hasher = Hasher;
532 
build_hasher(&self) -> Self::Hasher533         fn build_hasher(&self) -> Self::Hasher {
534             self.0.build_hasher()
535         }
536     }
537 
538     #[cfg(test)]
539     mod test {
540         use std::collections::HashMap;
541 
542         use super::*;
543 
544         const _: () = {
is_clone<T: Clone>()545             const fn is_clone<T: Clone>() {}
546             is_clone::<Hasher>();
547             is_clone::<RandomState>();
548         };
549 
550         #[test]
can_be_used_in_a_hashmap_with_a_random_seed()551         fn can_be_used_in_a_hashmap_with_a_random_seed() {
552             let mut hash: HashMap<_, _, RandomState> = Default::default();
553             hash.insert(42, "the answer");
554             assert_eq!(hash.get(&42), Some(&"the answer"));
555         }
556     }
557 }
558 
559 #[cfg(feature = "random")]
560 #[cfg_attr(docsrs, doc(cfg(feature = "random")))]
561 pub use random_impl::*;
562 
563 #[cfg(feature = "serialize")]
564 #[cfg_attr(docsrs, doc(cfg(feature = "serialize")))]
565 mod serialize_impl {
566     use serde::{Deserialize, Serialize};
567 
568     use super::*;
569 
570     impl<'de> Deserialize<'de> for Hasher {
deserialize<D>(deserializer: D) -> Result<Self, D::Error> where D: serde::Deserializer<'de>,571         fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
572         where
573             D: serde::Deserializer<'de>,
574         {
575             let shim = Deserialize::deserialize(deserializer)?;
576 
577             let Shim {
578                 total_len,
579                 seed,
580                 core,
581                 buffer,
582                 buffer_usage,
583             } = shim;
584             let Core { v1, v2, v3, v4 } = core;
585 
586             let mut buffer_data = BufferData::new();
587             buffer_data.bytes_mut().copy_from_slice(&buffer);
588 
589             Ok(Hasher {
590                 seed,
591                 accumulators: Accumulators([v1, v2, v3, v4]),
592                 buffer: Buffer {
593                     offset: buffer_usage,
594                     data: buffer_data,
595                 },
596                 length: total_len,
597             })
598         }
599     }
600 
601     impl Serialize for Hasher {
serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> where S: serde::Serializer,602         fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
603         where
604             S: serde::Serializer,
605         {
606             let Hasher {
607                 seed,
608                 ref accumulators,
609                 ref buffer,
610                 length,
611             } = *self;
612             let [v1, v2, v3, v4] = accumulators.0;
613             let Buffer { offset, ref data } = *buffer;
614             let buffer = *data.bytes();
615 
616             let shim = Shim {
617                 total_len: length,
618                 seed,
619                 core: Core { v1, v2, v3, v4 },
620                 buffer,
621                 buffer_usage: offset,
622             };
623 
624             shim.serialize(serializer)
625         }
626     }
627 
628     #[derive(Serialize, Deserialize)]
629     struct Shim {
630         total_len: u64,
631         seed: u32,
632         core: Core,
633         buffer: [u8; 16],
634         buffer_usage: usize,
635     }
636 
637     #[derive(Serialize, Deserialize)]
638     struct Core {
639         v1: u32,
640         v2: u32,
641         v3: u32,
642         v4: u32,
643     }
644 
645     #[cfg(test)]
646     mod test {
647         use std::hash::Hasher as _;
648 
649         use super::*;
650 
651         type Result<T = (), E = serde_json::Error> = core::result::Result<T, E>;
652 
653         #[test]
test_serialization_cycle() -> Result654         fn test_serialization_cycle() -> Result {
655             let mut hasher = Hasher::with_seed(0);
656             hasher.write(b"Hello, world!\0");
657             let _ = hasher.finish();
658 
659             let serialized = serde_json::to_string(&hasher)?;
660             let unserialized: Hasher = serde_json::from_str(&serialized)?;
661             assert_eq!(hasher, unserialized);
662             Ok(())
663         }
664 
665         #[test]
test_serialization_stability() -> Result666         fn test_serialization_stability() -> Result {
667             let mut hasher = Hasher::with_seed(0);
668             hasher.write(b"Hello, world!\0");
669             let _ = hasher.finish();
670 
671             let expected_serialized = r#"{
672                 "total_len": 14,
673                 "seed": 0,
674                 "core": {
675                   "v1": 606290984,
676                   "v2": 2246822519,
677                   "v3": 0,
678                   "v4": 1640531535
679                 },
680                 "buffer": [
681                   72,  101, 108, 108, 111, 44, 32, 119,
682                   111, 114, 108, 100, 33,  0,  0,  0
683                 ],
684                 "buffer_usage": 14
685             }"#;
686 
687             let unserialized: Hasher = serde_json::from_str(expected_serialized)?;
688             assert_eq!(hasher, unserialized);
689 
690             let expected_value: serde_json::Value = serde_json::from_str(expected_serialized)?;
691             let actual_value = serde_json::to_value(&hasher)?;
692             assert_eq!(expected_value, actual_value);
693 
694             Ok(())
695         }
696     }
697 }
698