1 //! The implementation of XXH32.
2
3 use core::{
4 fmt,
5 hash::{self, BuildHasher},
6 mem,
7 };
8
9 use crate::{IntoU32, IntoU64};
10
11 // Keeping these constants in this form to match the C code.
12 const PRIME32_1: u32 = 0x9E3779B1;
13 const PRIME32_2: u32 = 0x85EBCA77;
14 const PRIME32_3: u32 = 0xC2B2AE3D;
15 const PRIME32_4: u32 = 0x27D4EB2F;
16 const PRIME32_5: u32 = 0x165667B1;
17
18 type Lane = u32;
19 type Lanes = [Lane; 4];
20 type Bytes = [u8; 16];
21
22 const BYTES_IN_LANE: usize = mem::size_of::<Bytes>();
23
24 #[derive(Clone, PartialEq)]
25 struct BufferData(Lanes);
26
27 impl BufferData {
new() -> Self28 const fn new() -> Self {
29 Self([0; 4])
30 }
31
bytes(&self) -> &Bytes32 const fn bytes(&self) -> &Bytes {
33 const _: () = assert!(mem::align_of::<u8>() <= mem::align_of::<Lane>());
34 // SAFETY[bytes]: The alignment of `u32` is at least that of
35 // `u8` and all the values are initialized.
36 unsafe { &*self.0.as_ptr().cast() }
37 }
38
bytes_mut(&mut self) -> &mut Bytes39 fn bytes_mut(&mut self) -> &mut Bytes {
40 // SAFETY: See SAFETY[bytes]
41 unsafe { &mut *self.0.as_mut_ptr().cast() }
42 }
43 }
44
45 impl fmt::Debug for BufferData {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result46 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
47 f.debug_list().entries(self.0.iter()).finish()
48 }
49 }
50
51 #[derive(Debug, Clone, PartialEq)]
52 struct Buffer {
53 offset: usize,
54 data: BufferData,
55 }
56
57 impl Buffer {
new() -> Self58 const fn new() -> Self {
59 Self {
60 offset: 0,
61 data: BufferData::new(),
62 }
63 }
64
65 // RATIONALE: See RATIONALE[inline]
66 #[inline]
extend<'d>(&mut self, data: &'d [u8]) -> (Option<&Lanes>, &'d [u8])67 fn extend<'d>(&mut self, data: &'d [u8]) -> (Option<&Lanes>, &'d [u8]) {
68 // Most of the slice methods we use here have `_unchecked` variants, but
69 //
70 // 1. this method is called one time per `Hasher::write` call
71 // 2. this method early exits if we don't have anything in the buffer
72 //
73 // Because of this, removing the panics via `unsafe` doesn't
74 // have much benefit other than reducing code size by a tiny
75 // bit.
76
77 if self.offset == 0 {
78 return (None, data);
79 };
80
81 let bytes = self.data.bytes_mut();
82 debug_assert!(self.offset <= bytes.len());
83
84 let empty = &mut bytes[self.offset..];
85 let n_to_copy = usize::min(empty.len(), data.len());
86
87 let dst = &mut empty[..n_to_copy];
88
89 let (src, rest) = data.split_at(n_to_copy);
90
91 dst.copy_from_slice(src);
92 self.offset += n_to_copy;
93
94 debug_assert!(self.offset <= bytes.len());
95
96 if self.offset == bytes.len() {
97 self.offset = 0;
98 (Some(&self.data.0), rest)
99 } else {
100 (None, rest)
101 }
102 }
103
104 // RATIONALE: See RATIONALE[inline]
105 #[inline]
set(&mut self, data: &[u8])106 fn set(&mut self, data: &[u8]) {
107 if data.is_empty() {
108 return;
109 }
110
111 debug_assert_eq!(self.offset, 0);
112
113 let n_to_copy = data.len();
114
115 let bytes = self.data.bytes_mut();
116 debug_assert!(n_to_copy < bytes.len());
117
118 bytes[..n_to_copy].copy_from_slice(data);
119 self.offset = data.len();
120 }
121
122 // RATIONALE: See RATIONALE[inline]
123 #[inline]
remaining(&self) -> &[u8]124 fn remaining(&self) -> &[u8] {
125 &self.data.bytes()[..self.offset]
126 }
127 }
128
129 #[derive(Clone, PartialEq)]
130 struct Accumulators(Lanes);
131
132 impl Accumulators {
new(seed: u32) -> Self133 const fn new(seed: u32) -> Self {
134 Self([
135 seed.wrapping_add(PRIME32_1).wrapping_add(PRIME32_2),
136 seed.wrapping_add(PRIME32_2),
137 seed,
138 seed.wrapping_sub(PRIME32_1),
139 ])
140 }
141
142 // RATIONALE: See RATIONALE[inline]
143 #[inline]
write(&mut self, lanes: Lanes)144 fn write(&mut self, lanes: Lanes) {
145 let [acc1, acc2, acc3, acc4] = &mut self.0;
146 let [lane1, lane2, lane3, lane4] = lanes;
147
148 *acc1 = round(*acc1, lane1.to_le());
149 *acc2 = round(*acc2, lane2.to_le());
150 *acc3 = round(*acc3, lane3.to_le());
151 *acc4 = round(*acc4, lane4.to_le());
152 }
153
154 // RATIONALE: See RATIONALE[inline]
155 #[inline]
write_many<'d>(&mut self, mut data: &'d [u8]) -> &'d [u8]156 fn write_many<'d>(&mut self, mut data: &'d [u8]) -> &'d [u8] {
157 while let Some((chunk, rest)) = data.split_first_chunk::<BYTES_IN_LANE>() {
158 // SAFETY: We have the right number of bytes and are
159 // handling the unaligned case.
160 let lanes = unsafe { chunk.as_ptr().cast::<Lanes>().read_unaligned() };
161 self.write(lanes);
162 data = rest;
163 }
164 data
165 }
166
167 // RATIONALE: See RATIONALE[inline]
168 #[inline]
finish(&self) -> u32169 const fn finish(&self) -> u32 {
170 let [acc1, acc2, acc3, acc4] = self.0;
171
172 let acc1 = acc1.rotate_left(1);
173 let acc2 = acc2.rotate_left(7);
174 let acc3 = acc3.rotate_left(12);
175 let acc4 = acc4.rotate_left(18);
176
177 acc1.wrapping_add(acc2)
178 .wrapping_add(acc3)
179 .wrapping_add(acc4)
180 }
181 }
182
183 impl fmt::Debug for Accumulators {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result184 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
185 let [acc1, acc2, acc3, acc4] = self.0;
186 f.debug_struct("Accumulators")
187 .field("acc1", &acc1)
188 .field("acc2", &acc2)
189 .field("acc3", &acc3)
190 .field("acc4", &acc4)
191 .finish()
192 }
193 }
194
195 /// Calculates the 32-bit hash.
196 ///
197 /// ### Caution
198 ///
199 /// Although this struct implements [`hash::Hasher`][], it only calculates a
200 /// 32-bit number, leaving the upper bits as 0. This means it is
201 /// unlikely to be correct to use this in places like a [`HashMap`][std::collections::HashMap].
202 #[derive(Debug, Clone, PartialEq)]
203 pub struct Hasher {
204 seed: u32,
205 accumulators: Accumulators,
206 buffer: Buffer,
207 length: u64,
208 }
209
210 impl Default for Hasher {
default() -> Self211 fn default() -> Self {
212 Self::with_seed(0)
213 }
214 }
215
216 impl Hasher {
217 /// Hash all data at once. If you can use this function, you may
218 /// see noticable speed gains for certain types of input.
219 #[must_use]
220 // RATIONALE[inline]: Keeping parallel to the 64-bit
221 // implementation, even though the performance gains for the
222 // 32-bit version haven't been tested.
223 #[inline]
oneshot(seed: u32, data: &[u8]) -> u32224 pub fn oneshot(seed: u32, data: &[u8]) -> u32 {
225 let len = data.len();
226
227 // Since we know that there's no more data coming, we don't
228 // need to construct the intermediate buffers or copy data to
229 // or from the buffers.
230
231 let mut accumulators = Accumulators::new(seed);
232
233 let data = accumulators.write_many(data);
234
235 Self::finish_with(seed, len.into_u64(), &accumulators, data)
236 }
237
238 /// Constructs the hasher with an initial seed.
239 #[must_use]
with_seed(seed: u32) -> Self240 pub const fn with_seed(seed: u32) -> Self {
241 // Step 1. Initialize internal accumulators
242 Self {
243 seed,
244 accumulators: Accumulators::new(seed),
245 buffer: Buffer::new(),
246 length: 0,
247 }
248 }
249
250 /// The seed this hasher was created with.
seed(&self) -> u32251 pub const fn seed(&self) -> u32 {
252 self.seed
253 }
254
255 /// The total number of bytes hashed.
total_len(&self) -> u64256 pub const fn total_len(&self) -> u64 {
257 self.length
258 }
259
260 /// The total number of bytes hashed, truncated to 32 bits.
261 ///
262 /// For the full 64-bit byte count, use [`total_len`](Self::total_len).
total_len_32(&self) -> u32263 pub const fn total_len_32(&self) -> u32 {
264 self.length as u32
265 }
266
267 /// Returns the hash value for the values written so far. Unlike
268 /// [`hash::Hasher::finish`][], this method returns the actual 32-bit
269 /// value calculated, not a 64-bit value.
270 #[must_use]
271 // RATIONALE: See RATIONALE[inline]
272 #[inline]
finish_32(&self) -> u32273 pub fn finish_32(&self) -> u32 {
274 Self::finish_with(
275 self.seed,
276 self.length,
277 &self.accumulators,
278 self.buffer.remaining(),
279 )
280 }
281
282 #[must_use]
283 // RATIONALE: See RATIONALE[inline]
284 #[inline]
finish_with(seed: u32, len: u64, accumulators: &Accumulators, mut remaining: &[u8]) -> u32285 fn finish_with(seed: u32, len: u64, accumulators: &Accumulators, mut remaining: &[u8]) -> u32 {
286 // Step 3. Accumulator convergence
287 let mut acc = if len < BYTES_IN_LANE.into_u64() {
288 seed.wrapping_add(PRIME32_5)
289 } else {
290 accumulators.finish()
291 };
292
293 // Step 4. Add input length
294 //
295 // "Note that, if input length is so large that it requires
296 // more than 32-bits, only the lower 32-bits are added to the
297 // accumulator."
298 acc += len as u32;
299
300 // Step 5. Consume remaining input
301 while let Some((chunk, rest)) = remaining.split_first_chunk() {
302 let lane = u32::from_ne_bytes(*chunk).to_le();
303
304 acc = acc.wrapping_add(lane.wrapping_mul(PRIME32_3));
305 acc = acc.rotate_left(17).wrapping_mul(PRIME32_4);
306
307 remaining = rest;
308 }
309
310 for &byte in remaining {
311 let lane = byte.into_u32();
312
313 acc = acc.wrapping_add(lane.wrapping_mul(PRIME32_5));
314 acc = acc.rotate_left(11).wrapping_mul(PRIME32_1);
315 }
316
317 // Step 6. Final mix (avalanche)
318 acc ^= acc >> 15;
319 acc = acc.wrapping_mul(PRIME32_2);
320 acc ^= acc >> 13;
321 acc = acc.wrapping_mul(PRIME32_3);
322 acc ^= acc >> 16;
323
324 acc
325 }
326 }
327
328 impl hash::Hasher for Hasher {
329 // RATIONALE: See RATIONALE[inline]
330 #[inline]
write(&mut self, data: &[u8])331 fn write(&mut self, data: &[u8]) {
332 let len = data.len();
333
334 // Step 2. Process stripes
335 let (buffered_lanes, data) = self.buffer.extend(data);
336
337 if let Some(&lanes) = buffered_lanes {
338 self.accumulators.write(lanes);
339 }
340
341 let data = self.accumulators.write_many(data);
342
343 self.buffer.set(data);
344
345 self.length += len.into_u64();
346 }
347
348 // RATIONALE: See RATIONALE[inline]
349 #[inline]
finish(&self) -> u64350 fn finish(&self) -> u64 {
351 Hasher::finish_32(self).into()
352 }
353 }
354
355 // RATIONALE: See RATIONALE[inline]
356 #[inline]
round(mut acc: u32, lane: u32) -> u32357 const fn round(mut acc: u32, lane: u32) -> u32 {
358 acc = acc.wrapping_add(lane.wrapping_mul(PRIME32_2));
359 acc = acc.rotate_left(13);
360 acc.wrapping_mul(PRIME32_1)
361 }
362
363 /// Constructs [`Hasher`][] for multiple hasher instances. See
364 /// the [usage warning][Hasher#caution].
365 #[derive(Clone)]
366 pub struct State(u32);
367
368 impl State {
369 /// Constructs the hasher with an initial seed.
with_seed(seed: u32) -> Self370 pub fn with_seed(seed: u32) -> Self {
371 Self(seed)
372 }
373 }
374
375 impl BuildHasher for State {
376 type Hasher = Hasher;
377
build_hasher(&self) -> Self::Hasher378 fn build_hasher(&self) -> Self::Hasher {
379 Hasher::with_seed(self.0)
380 }
381 }
382
383 #[cfg(test)]
384 mod test {
385 use core::{
386 array,
387 hash::{BuildHasherDefault, Hasher as _},
388 };
389 use std::collections::HashMap;
390
391 use super::*;
392
393 const _TRAITS: () = {
is_clone<T: Clone>()394 const fn is_clone<T: Clone>() {}
395 is_clone::<Hasher>();
396 is_clone::<State>();
397 };
398
399 const EMPTY_BYTES: [u8; 0] = [];
400
401 #[test]
ingesting_byte_by_byte_is_equivalent_to_large_chunks()402 fn ingesting_byte_by_byte_is_equivalent_to_large_chunks() {
403 let bytes = [0; 32];
404
405 let mut byte_by_byte = Hasher::with_seed(0);
406 for byte in bytes.chunks(1) {
407 byte_by_byte.write(byte);
408 }
409 let byte_by_byte = byte_by_byte.finish();
410
411 let mut one_chunk = Hasher::with_seed(0);
412 one_chunk.write(&bytes);
413 let one_chunk = one_chunk.finish();
414
415 assert_eq!(byte_by_byte, one_chunk);
416 }
417
418 #[test]
hash_of_nothing_matches_c_implementation()419 fn hash_of_nothing_matches_c_implementation() {
420 let mut hasher = Hasher::with_seed(0);
421 hasher.write(&EMPTY_BYTES);
422 assert_eq!(hasher.finish(), 0x02cc_5d05);
423 }
424
425 #[test]
hash_of_single_byte_matches_c_implementation()426 fn hash_of_single_byte_matches_c_implementation() {
427 let mut hasher = Hasher::with_seed(0);
428 hasher.write(&[42]);
429 assert_eq!(hasher.finish(), 0xe0fe_705f);
430 }
431
432 #[test]
hash_of_multiple_bytes_matches_c_implementation()433 fn hash_of_multiple_bytes_matches_c_implementation() {
434 let mut hasher = Hasher::with_seed(0);
435 hasher.write(b"Hello, world!\0");
436 assert_eq!(hasher.finish(), 0x9e5e_7e93);
437 }
438
439 #[test]
hash_of_multiple_chunks_matches_c_implementation()440 fn hash_of_multiple_chunks_matches_c_implementation() {
441 let bytes: [u8; 100] = array::from_fn(|i| i as u8);
442 let mut hasher = Hasher::with_seed(0);
443 hasher.write(&bytes);
444 assert_eq!(hasher.finish(), 0x7f89_ba44);
445 }
446
447 #[test]
hash_with_different_seed_matches_c_implementation()448 fn hash_with_different_seed_matches_c_implementation() {
449 let mut hasher = Hasher::with_seed(0x42c9_1977);
450 hasher.write(&EMPTY_BYTES);
451 assert_eq!(hasher.finish(), 0xd6bf_8459);
452 }
453
454 #[test]
hash_with_different_seed_and_multiple_chunks_matches_c_implementation()455 fn hash_with_different_seed_and_multiple_chunks_matches_c_implementation() {
456 let bytes: [u8; 100] = array::from_fn(|i| i as u8);
457 let mut hasher = Hasher::with_seed(0x42c9_1977);
458 hasher.write(&bytes);
459 assert_eq!(hasher.finish(), 0x6d2f_6c17);
460 }
461
462 #[test]
hashes_with_different_offsets_are_the_same()463 fn hashes_with_different_offsets_are_the_same() {
464 let bytes = [0x7c; 4096];
465 let expected = Hasher::oneshot(0, &[0x7c; 64]);
466
467 let the_same = bytes
468 .windows(64)
469 .map(|w| {
470 let mut hasher = Hasher::with_seed(0);
471 hasher.write(w);
472 hasher.finish_32()
473 })
474 .all(|h| h == expected);
475 assert!(the_same);
476 }
477
478 // This test validates wraparound/truncation behavior for very
479 // large inputs of a 32-bit hash, but runs very slowly in the
480 // normal "cargo test" build config since it hashes 4.3GB of
481 // data. It runs reasonably quick under "cargo test --release".
482 #[ignore]
483 #[test]
length_overflows_32bit()484 fn length_overflows_32bit() {
485 // Hash 4.3 billion (4_300_000_000) bytes, which overflows a u32.
486 let bytes200: [u8; 200] = array::from_fn(|i| i as _);
487
488 let mut hasher = Hasher::with_seed(0);
489 for _ in 0..(4_300_000_000 / bytes200.len()) {
490 hasher.write(&bytes200);
491 }
492
493 assert_eq!(hasher.total_len(), 0x0000_0001_004c_cb00);
494 assert_eq!(hasher.total_len_32(), 0x004c_cb00);
495
496 // compared against the C implementation
497 assert_eq!(hasher.finish(), 0x1522_4ca7);
498 }
499
500 #[test]
can_be_used_in_a_hashmap_with_a_default_seed()501 fn can_be_used_in_a_hashmap_with_a_default_seed() {
502 let mut hash: HashMap<_, _, BuildHasherDefault<Hasher>> = Default::default();
503 hash.insert(42, "the answer");
504 assert_eq!(hash.get(&42), Some(&"the answer"));
505 }
506 }
507
508 #[cfg(feature = "random")]
509 #[cfg_attr(docsrs, doc(cfg(feature = "random")))]
510 mod random_impl {
511 use super::*;
512
513 /// Constructs a randomized seed and reuses it for multiple hasher
514 /// instances. See the [usage warning][Hasher#caution].
515 #[derive(Clone)]
516 pub struct RandomState(State);
517
518 impl Default for RandomState {
default() -> Self519 fn default() -> Self {
520 Self::new()
521 }
522 }
523
524 impl RandomState {
new() -> Self525 fn new() -> Self {
526 Self(State::with_seed(rand::random()))
527 }
528 }
529
530 impl BuildHasher for RandomState {
531 type Hasher = Hasher;
532
build_hasher(&self) -> Self::Hasher533 fn build_hasher(&self) -> Self::Hasher {
534 self.0.build_hasher()
535 }
536 }
537
538 #[cfg(test)]
539 mod test {
540 use std::collections::HashMap;
541
542 use super::*;
543
544 const _: () = {
is_clone<T: Clone>()545 const fn is_clone<T: Clone>() {}
546 is_clone::<Hasher>();
547 is_clone::<RandomState>();
548 };
549
550 #[test]
can_be_used_in_a_hashmap_with_a_random_seed()551 fn can_be_used_in_a_hashmap_with_a_random_seed() {
552 let mut hash: HashMap<_, _, RandomState> = Default::default();
553 hash.insert(42, "the answer");
554 assert_eq!(hash.get(&42), Some(&"the answer"));
555 }
556 }
557 }
558
559 #[cfg(feature = "random")]
560 #[cfg_attr(docsrs, doc(cfg(feature = "random")))]
561 pub use random_impl::*;
562
563 #[cfg(feature = "serialize")]
564 #[cfg_attr(docsrs, doc(cfg(feature = "serialize")))]
565 mod serialize_impl {
566 use serde::{Deserialize, Serialize};
567
568 use super::*;
569
570 impl<'de> Deserialize<'de> for Hasher {
deserialize<D>(deserializer: D) -> Result<Self, D::Error> where D: serde::Deserializer<'de>,571 fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
572 where
573 D: serde::Deserializer<'de>,
574 {
575 let shim = Deserialize::deserialize(deserializer)?;
576
577 let Shim {
578 total_len,
579 seed,
580 core,
581 buffer,
582 buffer_usage,
583 } = shim;
584 let Core { v1, v2, v3, v4 } = core;
585
586 let mut buffer_data = BufferData::new();
587 buffer_data.bytes_mut().copy_from_slice(&buffer);
588
589 Ok(Hasher {
590 seed,
591 accumulators: Accumulators([v1, v2, v3, v4]),
592 buffer: Buffer {
593 offset: buffer_usage,
594 data: buffer_data,
595 },
596 length: total_len,
597 })
598 }
599 }
600
601 impl Serialize for Hasher {
serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> where S: serde::Serializer,602 fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
603 where
604 S: serde::Serializer,
605 {
606 let Hasher {
607 seed,
608 ref accumulators,
609 ref buffer,
610 length,
611 } = *self;
612 let [v1, v2, v3, v4] = accumulators.0;
613 let Buffer { offset, ref data } = *buffer;
614 let buffer = *data.bytes();
615
616 let shim = Shim {
617 total_len: length,
618 seed,
619 core: Core { v1, v2, v3, v4 },
620 buffer,
621 buffer_usage: offset,
622 };
623
624 shim.serialize(serializer)
625 }
626 }
627
628 #[derive(Serialize, Deserialize)]
629 struct Shim {
630 total_len: u64,
631 seed: u32,
632 core: Core,
633 buffer: [u8; 16],
634 buffer_usage: usize,
635 }
636
637 #[derive(Serialize, Deserialize)]
638 struct Core {
639 v1: u32,
640 v2: u32,
641 v3: u32,
642 v4: u32,
643 }
644
645 #[cfg(test)]
646 mod test {
647 use std::hash::Hasher as _;
648
649 use super::*;
650
651 type Result<T = (), E = serde_json::Error> = core::result::Result<T, E>;
652
653 #[test]
test_serialization_cycle() -> Result654 fn test_serialization_cycle() -> Result {
655 let mut hasher = Hasher::with_seed(0);
656 hasher.write(b"Hello, world!\0");
657 let _ = hasher.finish();
658
659 let serialized = serde_json::to_string(&hasher)?;
660 let unserialized: Hasher = serde_json::from_str(&serialized)?;
661 assert_eq!(hasher, unserialized);
662 Ok(())
663 }
664
665 #[test]
test_serialization_stability() -> Result666 fn test_serialization_stability() -> Result {
667 let mut hasher = Hasher::with_seed(0);
668 hasher.write(b"Hello, world!\0");
669 let _ = hasher.finish();
670
671 let expected_serialized = r#"{
672 "total_len": 14,
673 "seed": 0,
674 "core": {
675 "v1": 606290984,
676 "v2": 2246822519,
677 "v3": 0,
678 "v4": 1640531535
679 },
680 "buffer": [
681 72, 101, 108, 108, 111, 44, 32, 119,
682 111, 114, 108, 100, 33, 0, 0, 0
683 ],
684 "buffer_usage": 14
685 }"#;
686
687 let unserialized: Hasher = serde_json::from_str(expected_serialized)?;
688 assert_eq!(hasher, unserialized);
689
690 let expected_value: serde_json::Value = serde_json::from_str(expected_serialized)?;
691 let actual_value = serde_json::to_value(&hasher)?;
692 assert_eq!(expected_value, actual_value);
693
694 Ok(())
695 }
696 }
697 }
698