//! The implementation of XXH64.

use core::{
    fmt,
    hash::{self, BuildHasher},
    mem,
};

use crate::IntoU64;

// Keeping these constants in this form to match the C code.
const PRIME64_1: u64 = 0x9E3779B185EBCA87;
const PRIME64_2: u64 = 0xC2B2AE3D27D4EB4F;
const PRIME64_3: u64 = 0x165667B19E3779F9;
const PRIME64_4: u64 = 0x85EBCA77C2B2AE63;
const PRIME64_5: u64 = 0x27D4EB2F165667C5;

type Lane = u64;
type Lanes = [Lane; 4];
type Bytes = [u8; 32];

const BYTES_IN_LANE: usize = mem::size_of::<Bytes>();

#[derive(Clone, PartialEq)]
struct BufferData(Lanes);

impl BufferData {
    const fn new() -> Self {
        Self([0; 4])
    }

    const fn bytes(&self) -> &Bytes {
        const _: () = assert!(mem::align_of::<u8>() <= mem::align_of::<Lane>());
        // SAFETY[bytes]: The alignment of `u64` is at least that of
        // `u8` and all the values are initialized.
        unsafe { &*self.0.as_ptr().cast() }
    }

    fn bytes_mut(&mut self) -> &mut Bytes {
        // SAFETY: See SAFETY[bytes]
        unsafe { &mut *self.0.as_mut_ptr().cast() }
    }
}

impl fmt::Debug for BufferData {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_list().entries(self.0.iter()).finish()
    }
}

#[derive(Debug, Clone, PartialEq)]
struct Buffer {
    offset: usize,
    data: BufferData,
}

impl Buffer {
    const fn new() -> Self {
        Self {
            offset: 0,
            data: BufferData::new(),
        }
    }

    // RATIONALE: See RATIONALE[inline]
    #[inline]
    fn extend<'d>(&mut self, data: &'d [u8]) -> (Option<&Lanes>, &'d [u8]) {
        // Most of the slice methods we use here have `_unchecked` variants, but
        //
        // 1. this method is called one time per `Hasher::write` call
        // 2. this method early exits if we don't have anything in the buffer
        //
        // Because of this, removing the panics via `unsafe` doesn't
        // have much benefit other than reducing code size by a tiny
        // bit.

        if self.offset == 0 {
            return (None, data);
        };

        let bytes = self.data.bytes_mut();
        debug_assert!(self.offset <= bytes.len());

        let empty = &mut bytes[self.offset..];
        let n_to_copy = usize::min(empty.len(), data.len());

        let dst = &mut empty[..n_to_copy];

        let (src, rest) = data.split_at(n_to_copy);

        dst.copy_from_slice(src);
        self.offset += n_to_copy;

        debug_assert!(self.offset <= bytes.len());

        if self.offset == bytes.len() {
            self.offset = 0;
            (Some(&self.data.0), rest)
        } else {
            (None, rest)
        }
    }

    // RATIONALE: See RATIONALE[inline]
    #[inline]
    fn set(&mut self, data: &[u8]) {
        if data.is_empty() {
            return;
        }

        debug_assert_eq!(self.offset, 0);

        let n_to_copy = data.len();

        let bytes = self.data.bytes_mut();
        debug_assert!(n_to_copy < bytes.len());

        bytes[..n_to_copy].copy_from_slice(data);
        self.offset = data.len();
    }

    // RATIONALE: See RATIONALE[inline]
    #[inline]
    fn remaining(&self) -> &[u8] {
        &self.data.bytes()[..self.offset]
    }
}

#[derive(Clone, PartialEq)]
struct Accumulators(Lanes);

impl Accumulators {
    const fn new(seed: u64) -> Self {
        Self([
            seed.wrapping_add(PRIME64_1).wrapping_add(PRIME64_2),
            seed.wrapping_add(PRIME64_2),
            seed,
            seed.wrapping_sub(PRIME64_1),
        ])
    }

    // RATIONALE: See RATIONALE[inline]
    #[inline]
    fn write(&mut self, lanes: Lanes) {
        let [acc1, acc2, acc3, acc4] = &mut self.0;
        let [lane1, lane2, lane3, lane4] = lanes;

        *acc1 = round(*acc1, lane1.to_le());
        *acc2 = round(*acc2, lane2.to_le());
        *acc3 = round(*acc3, lane3.to_le());
        *acc4 = round(*acc4, lane4.to_le());
    }

    // RATIONALE: See RATIONALE[inline]
    #[inline]
    fn write_many<'d>(&mut self, mut data: &'d [u8]) -> &'d [u8] {
        while let Some((chunk, rest)) = data.split_first_chunk::<BYTES_IN_LANE>() {
            // SAFETY: We have the right number of bytes and are
            // handling the unaligned case.
            let lanes = unsafe { chunk.as_ptr().cast::<Lanes>().read_unaligned() };
            self.write(lanes);
            data = rest;
        }
        data
    }

    // RATIONALE: See RATIONALE[inline]
    #[inline]
    const fn finish(&self) -> u64 {
        let [acc1, acc2, acc3, acc4] = self.0;

        let mut acc = {
            let acc1 = acc1.rotate_left(1);
            let acc2 = acc2.rotate_left(7);
            let acc3 = acc3.rotate_left(12);
            let acc4 = acc4.rotate_left(18);

            acc1.wrapping_add(acc2)
                .wrapping_add(acc3)
                .wrapping_add(acc4)
        };

        acc = Self::merge_accumulator(acc, acc1);
        acc = Self::merge_accumulator(acc, acc2);
        acc = Self::merge_accumulator(acc, acc3);
        acc = Self::merge_accumulator(acc, acc4);

        acc
    }

    // RATIONALE: See RATIONALE[inline]
    #[inline]
    const fn merge_accumulator(mut acc: u64, acc_n: u64) -> u64 {
        acc ^= round(0, acc_n);
        acc = acc.wrapping_mul(PRIME64_1);
        acc.wrapping_add(PRIME64_4)
    }
}

impl fmt::Debug for Accumulators {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let [acc1, acc2, acc3, acc4] = self.0;
        f.debug_struct("Accumulators")
            .field("acc1", &acc1)
            .field("acc2", &acc2)
            .field("acc3", &acc3)
            .field("acc4", &acc4)
            .finish()
    }
}

/// Calculates the 64-bit hash.
#[derive(Debug, Clone, PartialEq)]
pub struct Hasher {
    seed: u64,
    accumulators: Accumulators,
    buffer: Buffer,
    length: u64,
}

impl Default for Hasher {
    fn default() -> Self {
        Self::with_seed(0)
    }
}

impl Hasher {
    /// Hash all data at once. If you can use this function, you may
    /// see noticeable speed gains for certain types of input.
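    ///
    /// A minimal usage sketch; the seed and bytes below are arbitrary
    /// illustrative values, and the block is `ignore`d because it
    /// assumes `Hasher` is already in scope:
    ///
    /// ```ignore
    /// let hash = Hasher::oneshot(0xdead_beef, b"some bytes");
    /// ```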
    #[must_use]
    // RATIONALE[inline]:
    //
    // These `inline`s help unlock a speedup in one benchmark [1] from
    // ~900µs to ~200µs.
    //
    // Further inspection of the disassembly showed that various
    // helper functions were not being inlined. Avoiding these few
    // function calls wins us the tiniest performance increase, just
    // enough so that we are neck-and-neck with (or slightly faster
    // than!) the C code.
    //
    // This results in the entire hash computation being inlined at
    // the call site.
    //
    // [1]: https://github.com/apache/datafusion-comet/pull/575
    #[inline]
    pub fn oneshot(seed: u64, data: &[u8]) -> u64 {
        let len = data.len();

        // Since we know that there's no more data coming, we don't
        // need to construct the intermediate buffers or copy data to
        // or from the buffers.

        let mut accumulators = Accumulators::new(seed);

        let data = accumulators.write_many(data);

        Self::finish_with(seed, len.into_u64(), &accumulators, data)
    }

    /// Constructs the hasher with an initial seed.
    #[must_use]
    pub const fn with_seed(seed: u64) -> Self {
        // Step 1. Initialize internal accumulators
        Self {
            seed,
            accumulators: Accumulators::new(seed),
            buffer: Buffer::new(),
            length: 0,
        }
    }

    /// The seed this hasher was created with.
    pub const fn seed(&self) -> u64 {
        self.seed
    }

    /// The total number of bytes hashed.
    pub const fn total_len(&self) -> u64 {
        self.length
    }

    #[must_use]
    // RATIONALE: See RATIONALE[inline]
    #[inline]
    fn finish_with(seed: u64, len: u64, accumulators: &Accumulators, mut remaining: &[u8]) -> u64 {
        // Step 3. Accumulator convergence
        let mut acc = if len < BYTES_IN_LANE.into_u64() {
            seed.wrapping_add(PRIME64_5)
        } else {
            accumulators.finish()
        };

        // Step 4. Add input length
        acc += len;

        // Step 5. Consume remaining input
        while let Some((chunk, rest)) = remaining.split_first_chunk() {
            let lane = u64::from_ne_bytes(*chunk).to_le();

            acc ^= round(0, lane);
            acc = acc.rotate_left(27).wrapping_mul(PRIME64_1);
            acc = acc.wrapping_add(PRIME64_4);
            remaining = rest;
        }

        while let Some((chunk, rest)) = remaining.split_first_chunk() {
            let lane = u32::from_ne_bytes(*chunk).to_le().into_u64();

            acc ^= lane.wrapping_mul(PRIME64_1);
            acc = acc.rotate_left(23).wrapping_mul(PRIME64_2);
            acc = acc.wrapping_add(PRIME64_3);

            remaining = rest;
        }

        for &byte in remaining {
            let lane = byte.into_u64();

            acc ^= lane.wrapping_mul(PRIME64_5);
            acc = acc.rotate_left(11).wrapping_mul(PRIME64_1);
        }

        // Step 6. Final mix (avalanche)
        acc ^= acc >> 33;
        acc = acc.wrapping_mul(PRIME64_2);
        acc ^= acc >> 29;
        acc = acc.wrapping_mul(PRIME64_3);
        acc ^= acc >> 32;

        acc
    }
}

impl hash::Hasher for Hasher {
    // RATIONALE: See RATIONALE[inline]
    #[inline]
    fn write(&mut self, data: &[u8]) {
        let len = data.len();

        // Step 2. Process stripes
        let (buffered_lanes, data) = self.buffer.extend(data);

        if let Some(&lanes) = buffered_lanes {
            self.accumulators.write(lanes);
        }

        let data = self.accumulators.write_many(data);

        self.buffer.set(data);

        self.length += len.into_u64();
    }

    // RATIONALE: See RATIONALE[inline]
    #[inline]
    fn finish(&self) -> u64 {
        Self::finish_with(
            self.seed,
            self.length,
            &self.accumulators,
            self.buffer.remaining(),
        )
    }
}

// RATIONALE: See RATIONALE[inline]
#[inline]
const fn round(mut acc: u64, lane: u64) -> u64 {
    acc = acc.wrapping_add(lane.wrapping_mul(PRIME64_2));
    acc = acc.rotate_left(31);
    acc.wrapping_mul(PRIME64_1)
}

/// Constructs [`Hasher`][] instances that all reuse the same seed.
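///
/// A usage sketch; the block is `ignore`d because the import path and
/// the seed value below are assumptions, not part of this module:
///
/// ```ignore
/// use std::collections::HashMap;
///
/// // Every hasher built from this `State` uses the same seed.
/// let mut map = HashMap::with_hasher(State::with_seed(0xdead_beef));
/// map.insert("answer", 42);
/// assert_eq!(map.get("answer"), Some(&42));
/// ```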
#[derive(Clone)]
pub struct State(u64);

impl State {
    /// Constructs the hasher with an initial seed.
    pub fn with_seed(seed: u64) -> Self {
        Self(seed)
    }
}

impl BuildHasher for State {
    type Hasher = Hasher;

    fn build_hasher(&self) -> Self::Hasher {
        Hasher::with_seed(self.0)
    }
}

#[cfg(test)]
mod test {
    use core::{
        array,
        hash::{BuildHasherDefault, Hasher as _},
    };
    use std::collections::HashMap;

    use super::*;

    const _TRAITS: () = {
        const fn is_clone<T: Clone>() {}
        is_clone::<Hasher>();
        is_clone::<State>();
    };

    const EMPTY_BYTES: [u8; 0] = [];

    #[test]
    fn ingesting_byte_by_byte_is_equivalent_to_large_chunks() {
        let bytes = [0x9c; 32];

        let mut byte_by_byte = Hasher::with_seed(0);
        for byte in bytes.chunks(1) {
            byte_by_byte.write(byte);
        }
        let byte_by_byte = byte_by_byte.finish();

        let mut one_chunk = Hasher::with_seed(0);
        one_chunk.write(&bytes);
        let one_chunk = one_chunk.finish();

        assert_eq!(byte_by_byte, one_chunk);
    }
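
    // A sketch of a property the two public entry points should share:
    // hashing via `oneshot` and via the streaming `Hasher` must agree
    // for the same seed and input. The seed and data here are arbitrary.
    #[test]
    fn oneshot_matches_streaming_hasher() {
        let bytes: [u8; 100] = array::from_fn(|i| i as u8);
        let seed = 0x42;

        let oneshot = Hasher::oneshot(seed, &bytes);

        let mut streaming = Hasher::with_seed(seed);
        streaming.write(&bytes);
        let streaming = streaming.finish();

        assert_eq!(oneshot, streaming);
    }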

    #[test]
    fn hash_of_nothing_matches_c_implementation() {
        let mut hasher = Hasher::with_seed(0);
        hasher.write(&EMPTY_BYTES);
        assert_eq!(hasher.finish(), 0xef46_db37_51d8_e999);
    }

    #[test]
    fn hash_of_single_byte_matches_c_implementation() {
        let mut hasher = Hasher::with_seed(0);
        hasher.write(&[42]);
        assert_eq!(hasher.finish(), 0x0a9e_dece_beb0_3ae4);
    }

    #[test]
    fn hash_of_multiple_bytes_matches_c_implementation() {
        let mut hasher = Hasher::with_seed(0);
        hasher.write(b"Hello, world!\0");
        assert_eq!(hasher.finish(), 0x7b06_c531_ea43_e89f);
    }

    #[test]
    fn hash_of_multiple_chunks_matches_c_implementation() {
        let bytes: [u8; 100] = array::from_fn(|i| i as u8);
        let mut hasher = Hasher::with_seed(0);
        hasher.write(&bytes);
        assert_eq!(hasher.finish(), 0x6ac1_e580_3216_6597);
    }

    #[test]
    fn hash_with_different_seed_matches_c_implementation() {
        let mut hasher = Hasher::with_seed(0xae05_4331_1b70_2d91);
        hasher.write(&EMPTY_BYTES);
        assert_eq!(hasher.finish(), 0x4b6a_04fc_df7a_4672);
    }

    #[test]
    fn hash_with_different_seed_and_multiple_chunks_matches_c_implementation() {
        let bytes: [u8; 100] = array::from_fn(|i| i as u8);
        let mut hasher = Hasher::with_seed(0xae05_4331_1b70_2d91);
        hasher.write(&bytes);
        assert_eq!(hasher.finish(), 0x567e_355e_0682_e1f1);
    }

    #[test]
    fn hashes_with_different_offsets_are_the_same() {
        let bytes = [0x7c; 4096];
        let expected = Hasher::oneshot(0, &[0x7c; 64]);

        let the_same = bytes
            .windows(64)
            .map(|w| {
                let mut hasher = Hasher::with_seed(0);
                hasher.write(w);
                hasher.finish()
            })
            .all(|h| h == expected);
        assert!(the_same);
    }
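
    // A small sketch exercising the accessor methods; the seed and input
    // are arbitrary. `total_len` should report the number of bytes
    // written so far, regardless of how they were buffered internally.
    #[test]
    fn reports_seed_and_total_len() {
        let mut hasher = Hasher::with_seed(0x1234);
        assert_eq!(hasher.seed(), 0x1234);
        assert_eq!(hasher.total_len(), 0);

        hasher.write(b"xyz");
        assert_eq!(hasher.total_len(), 3);
    }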

    #[test]
    fn can_be_used_in_a_hashmap_with_a_default_seed() {
        let mut hash: HashMap<_, _, BuildHasherDefault<Hasher>> = Default::default();
        hash.insert(42, "the answer");
        assert_eq!(hash.get(&42), Some(&"the answer"));
    }
}

#[cfg(feature = "random")]
#[cfg_attr(docsrs, doc(cfg(feature = "random")))]
mod random_impl {
    use super::*;

    /// Constructs a randomized seed and reuses it for multiple hasher
    /// instances.
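    ///
    /// A usage sketch mirroring the test below; it is `ignore`d because
    /// the exact import path is not assumed here:
    ///
    /// ```ignore
    /// use std::collections::HashMap;
    ///
    /// // Each `RandomState` picks one random seed and shares it across
    /// // every hasher it builds.
    /// let mut map: HashMap<u32, &str, RandomState> = Default::default();
    /// map.insert(1, "one");
    /// assert_eq!(map.get(&1), Some(&"one"));
    /// ```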
    #[derive(Clone)]
    pub struct RandomState(State);

    impl Default for RandomState {
        fn default() -> Self {
            Self::new()
        }
    }

    impl RandomState {
        fn new() -> Self {
            Self(State::with_seed(rand::random()))
        }
    }

    impl BuildHasher for RandomState {
        type Hasher = Hasher;

        fn build_hasher(&self) -> Self::Hasher {
            self.0.build_hasher()
        }
    }

    #[cfg(test)]
    mod test {
        use std::collections::HashMap;

        use super::*;

        const _TRAITS: () = {
            const fn is_clone<T: Clone>() {}
            is_clone::<RandomState>();
        };

        #[test]
        fn can_be_used_in_a_hashmap_with_a_random_seed() {
            let mut hash: HashMap<_, _, RandomState> = Default::default();
            hash.insert(42, "the answer");
            assert_eq!(hash.get(&42), Some(&"the answer"));
        }
    }
}

#[cfg(feature = "random")]
#[cfg_attr(docsrs, doc(cfg(feature = "random")))]
pub use random_impl::*;

#[cfg(feature = "serialize")]
#[cfg_attr(docsrs, doc(cfg(feature = "serialize")))]
mod serialize_impl {
    use serde::{Deserialize, Serialize};

    use super::*;

    impl<'de> Deserialize<'de> for Hasher {
        fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
        where
            D: serde::Deserializer<'de>,
        {
            let shim = Deserialize::deserialize(deserializer)?;

            let Shim {
                total_len,
                seed,
                core,
                buffer,
                buffer_usage,
            } = shim;
            let Core { v1, v2, v3, v4 } = core;

            let mut buffer_data = BufferData::new();
            buffer_data.bytes_mut().copy_from_slice(&buffer);

            Ok(Hasher {
                seed,
                accumulators: Accumulators([v1, v2, v3, v4]),
                buffer: Buffer {
                    offset: buffer_usage,
                    data: buffer_data,
                },
                length: total_len,
            })
        }
    }

    impl Serialize for Hasher {
        fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
        where
            S: serde::Serializer,
        {
            let Hasher {
                seed,
                ref accumulators,
                ref buffer,
                length,
            } = *self;
            let [v1, v2, v3, v4] = accumulators.0;
            let Buffer { offset, ref data } = *buffer;
            let buffer = *data.bytes();

            let shim = Shim {
                total_len: length,
                seed,
                core: Core { v1, v2, v3, v4 },
                buffer,
                buffer_usage: offset,
            };

            shim.serialize(serializer)
        }
    }

    #[derive(Serialize, Deserialize)]
    struct Shim {
        total_len: u64,
        seed: u64,
        core: Core,
        buffer: [u8; 32],
        buffer_usage: usize,
    }

    #[derive(Serialize, Deserialize)]
    struct Core {
        v1: u64,
        v2: u64,
        v3: u64,
        v4: u64,
    }

    #[cfg(test)]
    mod test {
        use std::hash::Hasher as _;

        use super::*;

        type Result<T = (), E = serde_json::Error> = core::result::Result<T, E>;

        #[test]
        fn test_serialization_cycle() -> Result {
            let mut hasher = Hasher::with_seed(0);
            hasher.write(b"Hello, world!\0");
            let _ = hasher.finish();

            let serialized = serde_json::to_string(&hasher)?;
            let unserialized: Hasher = serde_json::from_str(&serialized)?;
            assert_eq!(hasher, unserialized);
            Ok(())
        }

        #[test]
        fn test_serialization_stability() -> Result {
            let mut hasher = Hasher::with_seed(0);
            hasher.write(b"Hello, world!\0");
            let _ = hasher.finish();

            let expected_serialized = r#"{
                "total_len": 14,
                "seed": 0,
                "core": {
                    "v1": 6983438078262162902,
                    "v2": 14029467366897019727,
                    "v3": 0,
                    "v4": 7046029288634856825
                },
                "buffer": [
                    72, 101, 108, 108, 111, 44, 32, 119,
                    111, 114, 108, 100, 33, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0
                ],
                "buffer_usage": 14
            }"#;

            let unserialized: Hasher = serde_json::from_str(expected_serialized)?;
            assert_eq!(hasher, unserialized);

            let expected_value: serde_json::Value = serde_json::from_str(expected_serialized)?;
            let actual_value = serde_json::to_value(&hasher)?;
            assert_eq!(expected_value, actual_value);

            Ok(())
        }
    }
}