• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //===-- Portable string hash function ---------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef LLVM_LIBC_SRC___SUPPORT_HASH_H
10 #define LLVM_LIBC_SRC___SUPPORT_HASH_H
11 
12 #include "src/__support/CPP/bit.h"           // rotl
13 #include "src/__support/CPP/limits.h"        // numeric_limits
14 #include "src/__support/macros/attributes.h" // LIBC_INLINE
15 #include "src/__support/uint128.h"           // UInt128
16 #include <stdint.h>                          // For uint64_t
17 
18 namespace LIBC_NAMESPACE {
19 namespace internal {
20 
21 // Folded multiplication.
22 // This function multiplies two 64-bit integers and xor the high and
23 // low 64-bit parts of the result.
folded_multiply(uint64_t x,uint64_t y)24 LIBC_INLINE uint64_t folded_multiply(uint64_t x, uint64_t y) {
25   UInt128 p = static_cast<UInt128>(x) * static_cast<UInt128>(y);
26   uint64_t low = static_cast<uint64_t>(p);
27   uint64_t high = static_cast<uint64_t>(p >> 64);
28   return low ^ high;
29 }
30 
31 // Read as little endian.
32 // Shift-and-or implementation does not give a satisfactory code on aarch64.
33 // Therefore, we use a union to read the value.
read_little_endian(const void * ptr)34 template <typename T> LIBC_INLINE T read_little_endian(const void *ptr) {
35   const uint8_t *bytes = static_cast<const uint8_t *>(ptr);
36   union {
37     T value;
38     uint8_t buffer[sizeof(T)];
39   } data;
40 #if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
41   // Compiler should able to optimize this as a load followed by a byte swap.
42   // On aarch64 (-mbig-endian), this compiles to the following for int:
43   //      ldr     w0, [x0]
44   //      rev     w0, w0
45   //      ret
46   for (size_t i = 0; i < sizeof(T); ++i) {
47     data.buffer[i] = bytes[sizeof(T) - i - 1];
48   }
49 #else
50   for (size_t i = 0; i < sizeof(T); ++i) {
51     data.buffer[i] = bytes[i];
52   }
53 #endif
54   return data.value;
55 }
56 
57 // Specialized read functions for small values. size must be <= 8.
read_small_values(const void * ptr,size_t size,uint64_t & low,uint64_t & high)58 LIBC_INLINE void read_small_values(const void *ptr, size_t size, uint64_t &low,
59                                    uint64_t &high) {
60   const uint8_t *bytes = static_cast<const uint8_t *>(ptr);
61   if (size >= 2) {
62     if (size >= 4) {
63       low = static_cast<uint64_t>(read_little_endian<uint32_t>(&bytes[0]));
64       high =
65           static_cast<uint64_t>(read_little_endian<uint32_t>(&bytes[size - 4]));
66     } else {
67       low = static_cast<uint64_t>(read_little_endian<uint16_t>(&bytes[0]));
68       high = static_cast<uint64_t>(bytes[size - 1]);
69     }
70   } else {
71     if (size > 0) {
72       low = static_cast<uint64_t>(bytes[0]);
73       high = static_cast<uint64_t>(bytes[0]);
74     } else {
75       low = 0;
76       high = 0;
77     }
78   }
79 }
80 
81 // This constant comes from Kunth's prng (it empirically works well).
82 LIBC_INLINE_VAR constexpr uint64_t MULTIPLE = 6364136223846793005;
83 // Rotation amount for mixing.
84 LIBC_INLINE_VAR constexpr uint64_t ROTATE = 23;
85 
86 // Randomly generated values. For now, we use the same values as in aHash as
87 // they are widely tested.
88 // https://github.com/tkaitchuck/aHash/blob/9f6a2ad8b721fd28da8dc1d0b7996677b374357c/src/random_state.rs#L38
89 LIBC_INLINE_VAR constexpr uint64_t RANDOMNESS[2][4] = {
90     {0x243f6a8885a308d3, 0x13198a2e03707344, 0xa4093822299f31d0,
91      0x082efa98ec4e6c89},
92     {0x452821e638d01377, 0xbe5466cf34e90c6c, 0xc0ac29b7c97c50dd,
93      0x3f84d5b5b5470917},
94 };
95 
96 // This is a portable string hasher. It is not cryptographically secure.
97 // The quality of the hash is good enough to pass all tests in SMHasher.
98 // The implementation is derived from the generic routine of aHash.
99 class HashState {
100   uint64_t buffer;
101   uint64_t pad;
102   uint64_t extra_keys[2];
update(uint64_t low,uint64_t high)103   LIBC_INLINE void update(uint64_t low, uint64_t high) {
104     uint64_t combined =
105         folded_multiply(low ^ extra_keys[0], high ^ extra_keys[1]);
106     buffer = (buffer + pad) ^ combined;
107     buffer = cpp::rotl(buffer, ROTATE);
108   }
mix(uint64_t seed)109   LIBC_INLINE static uint64_t mix(uint64_t seed) {
110     HashState mixer{RANDOMNESS[0][0], RANDOMNESS[0][1], RANDOMNESS[0][2],
111                     RANDOMNESS[0][3]};
112     mixer.update(seed, 0);
113     return mixer.finish();
114   }
115 
116 public:
HashState(uint64_t a,uint64_t b,uint64_t c,uint64_t d)117   LIBC_INLINE constexpr HashState(uint64_t a, uint64_t b, uint64_t c,
118                                   uint64_t d)
119       : buffer(a), pad(b), extra_keys{c, d} {}
HashState(uint64_t seed)120   LIBC_INLINE HashState(uint64_t seed) {
121     // Mix one more round of the seed to make it stronger.
122     uint64_t mixed = mix(seed);
123     buffer = RANDOMNESS[1][0] ^ mixed;
124     pad = RANDOMNESS[1][1] ^ mixed;
125     extra_keys[0] = RANDOMNESS[1][2] ^ mixed;
126     extra_keys[1] = RANDOMNESS[1][3] ^ mixed;
127   }
update(const void * ptr,size_t size)128   LIBC_INLINE void update(const void *ptr, size_t size) {
129     uint8_t const *bytes = static_cast<const uint8_t *>(ptr);
130     buffer = (buffer + size) * MULTIPLE;
131     uint64_t low, high;
132     if (size > 8) {
133       if (size > 16) {
134         // update tail
135         low = read_little_endian<uint64_t>(&bytes[size - 16]);
136         high = read_little_endian<uint64_t>(&bytes[size - 8]);
137         update(low, high);
138         while (size > 16) {
139           low = read_little_endian<uint64_t>(&bytes[0]);
140           high = read_little_endian<uint64_t>(&bytes[8]);
141           update(low, high);
142           bytes += 16;
143           size -= 16;
144         }
145       } else {
146         low = read_little_endian<uint64_t>(&bytes[0]);
147         high = read_little_endian<uint64_t>(&bytes[size - 8]);
148         update(low, high);
149       }
150     } else {
151       read_small_values(ptr, size, low, high);
152       update(low, high);
153     }
154   }
finish()155   LIBC_INLINE uint64_t finish() {
156     int rot = buffer & 63;
157     uint64_t folded = folded_multiply(buffer, pad);
158     return cpp::rotl(folded, rot);
159   }
160 };
161 
162 } // namespace internal
163 } // namespace LIBC_NAMESPACE
164 
165 #endif // LLVM_LIBC_SRC___SUPPORT_HASH_H
166