1 /* Set of hash utility functions to help maintaining the invariant that
2 if a==b then hash(a)==hash(b)
3
4 All the utility functions (_Py_Hash*()) return "-1" to signify an error.
5 */
6 #include "Python.h"
7
8 #ifdef __APPLE__
9 # include <libkern/OSByteOrder.h>
10 #elif defined(HAVE_LE64TOH) && defined(HAVE_ENDIAN_H)
11 # include <endian.h>
12 #elif defined(HAVE_LE64TOH) && defined(HAVE_SYS_ENDIAN_H)
13 # include <sys/endian.h>
14 #endif
15
16 #ifdef __cplusplus
17 extern "C" {
18 #endif
19
20 _Py_HashSecret_t _Py_HashSecret = {{0}};
21
22 #if Py_HASH_ALGORITHM == Py_HASH_EXTERNAL
23 extern PyHash_FuncDef PyHash_Func;
24 #else
25 static PyHash_FuncDef PyHash_Func;
26 #endif
27
28 /* Count _Py_HashBytes() calls */
29 #ifdef Py_HASH_STATS
30 #define Py_HASH_STATS_MAX 32
31 static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
32 #endif
33
34 /* For numeric types, the hash of a number x is based on the reduction
35 of x modulo the prime P = 2**_PyHASH_BITS - 1. It's designed so that
36 hash(x) == hash(y) whenever x and y are numerically equal, even if
37 x and y have different types.
38
39 A quick summary of the hashing strategy:
40
41 (1) First define the 'reduction of x modulo P' for any rational
42 number x; this is a standard extension of the usual notion of
43 reduction modulo P for integers. If x == p/q (written in lowest
44 terms), the reduction is interpreted as the reduction of p times
45 the inverse of the reduction of q, all modulo P; if q is exactly
46 divisible by P then define the reduction to be infinity. So we've
47 got a well-defined map
48
49 reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.
50
51 (2) Now for a rational number x, define hash(x) by:
52
53 reduce(x) if x >= 0
54 -reduce(-x) if x < 0
55
56 If the result of the reduction is infinity (this is impossible for
57 integers, floats and Decimals) then use the predefined hash value
58 _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
59 _PyHASH_INF, -_PyHASH_INF and _PyHASH_NAN are also used for the
60 hashes of float and Decimal infinities and nans.
61
62 A selling point for the above strategy is that it makes it possible
63 to compute hashes of decimal and binary floating-point numbers
64 efficiently, even if the exponent of the binary or decimal number
65 is large. The key point is that
66
67 reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS)
68
69 provided that {reduce(x), reduce(y)} != {0, infinity}. The reduction of a
70 binary or decimal float is never infinity, since the denominator is a power
71 of 2 (for binary) or a divisor of a power of 10 (for decimal). So we have,
72 for nonnegative x,
73
74 reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS
75
76 reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS
77
78 and reduce(10**e) can be computed efficiently by the usual modular
79 exponentiation algorithm. For reduce(2**e) it's even better: since
80 P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
81 by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.
82
83 */
84
85 Py_hash_t
_Py_HashDouble(double v)86 _Py_HashDouble(double v)
87 {
88 int e, sign;
89 double m;
90 Py_uhash_t x, y;
91
92 if (!Py_IS_FINITE(v)) {
93 if (Py_IS_INFINITY(v))
94 return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
95 else
96 return _PyHASH_NAN;
97 }
98
99 m = frexp(v, &e);
100
101 sign = 1;
102 if (m < 0) {
103 sign = -1;
104 m = -m;
105 }
106
107 /* process 28 bits at a time; this should work well both for binary
108 and hexadecimal floating point. */
109 x = 0;
110 while (m) {
111 x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28);
112 m *= 268435456.0; /* 2**28 */
113 e -= 28;
114 y = (Py_uhash_t)m; /* pull out integer part */
115 m -= y;
116 x += y;
117 if (x >= _PyHASH_MODULUS)
118 x -= _PyHASH_MODULUS;
119 }
120
121 /* adjust for the exponent; first reduce it modulo _PyHASH_BITS */
122 e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS);
123 x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e);
124
125 x = x * sign;
126 if (x == (Py_uhash_t)-1)
127 x = (Py_uhash_t)-2;
128 return (Py_hash_t)x;
129 }
130
131 Py_hash_t
_Py_HashPointerRaw(const void * p)132 _Py_HashPointerRaw(const void *p)
133 {
134 size_t y = (size_t)p;
135 /* bottom 3 or 4 bits are likely to be 0; rotate y by 4 to avoid
136 excessive hash collisions for dicts and sets */
137 y = (y >> 4) | (y << (8 * SIZEOF_VOID_P - 4));
138 return (Py_hash_t)y;
139 }
140
141 Py_hash_t
_Py_HashPointer(const void * p)142 _Py_HashPointer(const void *p)
143 {
144 Py_hash_t x = _Py_HashPointerRaw(p);
145 if (x == -1) {
146 x = -2;
147 }
148 return x;
149 }
150
151 Py_hash_t
_Py_HashBytes(const void * src,Py_ssize_t len)152 _Py_HashBytes(const void *src, Py_ssize_t len)
153 {
154 Py_hash_t x;
155 /*
156 We make the hash of the empty string be 0, rather than using
157 (prefix ^ suffix), since this slightly obfuscates the hash secret
158 */
159 if (len == 0) {
160 return 0;
161 }
162
163 #ifdef Py_HASH_STATS
164 hashstats[(len <= Py_HASH_STATS_MAX) ? len : 0]++;
165 #endif
166
167 #if Py_HASH_CUTOFF > 0
168 if (len < Py_HASH_CUTOFF) {
169 /* Optimize hashing of very small strings with inline DJBX33A. */
170 Py_uhash_t hash;
171 const unsigned char *p = src;
172 hash = 5381; /* DJBX33A starts with 5381 */
173
174 switch(len) {
175 /* ((hash << 5) + hash) + *p == hash * 33 + *p */
176 case 7: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
177 case 6: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
178 case 5: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
179 case 4: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
180 case 3: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
181 case 2: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
182 case 1: hash = ((hash << 5) + hash) + *p++; break;
183 default:
184 Py_UNREACHABLE();
185 }
186 hash ^= len;
187 hash ^= (Py_uhash_t) _Py_HashSecret.djbx33a.suffix;
188 x = (Py_hash_t)hash;
189 }
190 else
191 #endif /* Py_HASH_CUTOFF */
192 x = PyHash_Func.hash(src, len);
193
194 if (x == -1)
195 return -2;
196 return x;
197 }
198
199 void
_PyHash_Fini(void)200 _PyHash_Fini(void)
201 {
202 #ifdef Py_HASH_STATS
203 int i;
204 Py_ssize_t total = 0;
205 const char *fmt = "%2i %8" PY_FORMAT_SIZE_T "d %8" PY_FORMAT_SIZE_T "d\n";
206
207 fprintf(stderr, "len calls total\n");
208 for (i = 1; i <= Py_HASH_STATS_MAX; i++) {
209 total += hashstats[i];
210 fprintf(stderr, fmt, i, hashstats[i], total);
211 }
212 total += hashstats[0];
213 fprintf(stderr, "> %8" PY_FORMAT_SIZE_T "d %8" PY_FORMAT_SIZE_T "d\n",
214 hashstats[0], total);
215 #endif
216 }
217
218 PyHash_FuncDef *
PyHash_GetFuncDef(void)219 PyHash_GetFuncDef(void)
220 {
221 return &PyHash_Func;
222 }
223
224 /* Optimized memcpy() for Windows */
225 #ifdef _MSC_VER
226 # if SIZEOF_PY_UHASH_T == 4
227 # define PY_UHASH_CPY(dst, src) do { \
228 dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
229 } while(0)
230 # elif SIZEOF_PY_UHASH_T == 8
231 # define PY_UHASH_CPY(dst, src) do { \
232 dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
233 dst[4] = src[4]; dst[5] = src[5]; dst[6] = src[6]; dst[7] = src[7]; \
234 } while(0)
235 # else
236 # error SIZEOF_PY_UHASH_T must be 4 or 8
237 # endif /* SIZEOF_PY_UHASH_T */
238 #else /* not Windows */
239 # define PY_UHASH_CPY(dst, src) memcpy(dst, src, SIZEOF_PY_UHASH_T)
240 #endif /* _MSC_VER */
241
242
243 #if Py_HASH_ALGORITHM == Py_HASH_FNV
244 /* **************************************************************************
245 * Modified Fowler-Noll-Vo (FNV) hash function
246 */
247 static Py_hash_t
fnv(const void * src,Py_ssize_t len)248 fnv(const void *src, Py_ssize_t len)
249 {
250 const unsigned char *p = src;
251 Py_uhash_t x;
252 Py_ssize_t remainder, blocks;
253 union {
254 Py_uhash_t value;
255 unsigned char bytes[SIZEOF_PY_UHASH_T];
256 } block;
257
258 #ifdef Py_DEBUG
259 assert(_Py_HashSecret_Initialized);
260 #endif
261 remainder = len % SIZEOF_PY_UHASH_T;
262 if (remainder == 0) {
263 /* Process at least one block byte by byte to reduce hash collisions
264 * for strings with common prefixes. */
265 remainder = SIZEOF_PY_UHASH_T;
266 }
267 blocks = (len - remainder) / SIZEOF_PY_UHASH_T;
268
269 x = (Py_uhash_t) _Py_HashSecret.fnv.prefix;
270 x ^= (Py_uhash_t) *p << 7;
271 while (blocks--) {
272 PY_UHASH_CPY(block.bytes, p);
273 x = (_PyHASH_MULTIPLIER * x) ^ block.value;
274 p += SIZEOF_PY_UHASH_T;
275 }
276 /* add remainder */
277 for (; remainder > 0; remainder--)
278 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++;
279 x ^= (Py_uhash_t) len;
280 x ^= (Py_uhash_t) _Py_HashSecret.fnv.suffix;
281 if (x == (Py_uhash_t) -1) {
282 x = (Py_uhash_t) -2;
283 }
284 return x;
285 }
286
287 static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T,
288 16 * SIZEOF_PY_HASH_T};
289
290 #endif /* Py_HASH_ALGORITHM == Py_HASH_FNV */
291
292
293 /* **************************************************************************
294 <MIT License>
295 Copyright (c) 2013 Marek Majkowski <marek@popcount.org>
296
297 Permission is hereby granted, free of charge, to any person obtaining a copy
298 of this software and associated documentation files (the "Software"), to deal
299 in the Software without restriction, including without limitation the rights
300 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
301 copies of the Software, and to permit persons to whom the Software is
302 furnished to do so, subject to the following conditions:
303
304 The above copyright notice and this permission notice shall be included in
305 all copies or substantial portions of the Software.
306
307 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
308 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
309 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
310 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
311 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
312 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
313 THE SOFTWARE.
314 </MIT License>
315
316 Original location:
317 https://github.com/majek/csiphash/
318
319 Solution inspired by code from:
320 Samuel Neves (supercop/crypto_auth/siphash24/little)
321 djb (supercop/crypto_auth/siphash24/little2)
322 Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c)
323
324 Modified for Python by Christian Heimes:
325 - C89 / MSVC compatibility
326 - _rotl64() on Windows
327 - letoh64() fallback
328 */
329
330 /* byte swap little endian to host endian
331 * Endian conversion not only ensures that the hash function returns the same
332 * value on all platforms. It is also required to for a good dispersion of
333 * the hash values' least significant bits.
334 */
335 #if PY_LITTLE_ENDIAN
336 # define _le64toh(x) ((uint64_t)(x))
337 #elif defined(__APPLE__)
338 # define _le64toh(x) OSSwapLittleToHostInt64(x)
339 #elif defined(HAVE_LETOH64)
340 # define _le64toh(x) le64toh(x)
341 #else
342 # define _le64toh(x) (((uint64_t)(x) << 56) | \
343 (((uint64_t)(x) << 40) & 0xff000000000000ULL) | \
344 (((uint64_t)(x) << 24) & 0xff0000000000ULL) | \
345 (((uint64_t)(x) << 8) & 0xff00000000ULL) | \
346 (((uint64_t)(x) >> 8) & 0xff000000ULL) | \
347 (((uint64_t)(x) >> 24) & 0xff0000ULL) | \
348 (((uint64_t)(x) >> 40) & 0xff00ULL) | \
349 ((uint64_t)(x) >> 56))
350 #endif
351
352
353 #ifdef _MSC_VER
354 # define ROTATE(x, b) _rotl64(x, b)
355 #else
356 # define ROTATE(x, b) (uint64_t)( ((x) << (b)) | ( (x) >> (64 - (b))) )
357 #endif
358
359 #define HALF_ROUND(a,b,c,d,s,t) \
360 a += b; c += d; \
361 b = ROTATE(b, s) ^ a; \
362 d = ROTATE(d, t) ^ c; \
363 a = ROTATE(a, 32);
364
365 #define DOUBLE_ROUND(v0,v1,v2,v3) \
366 HALF_ROUND(v0,v1,v2,v3,13,16); \
367 HALF_ROUND(v2,v1,v0,v3,17,21); \
368 HALF_ROUND(v0,v1,v2,v3,13,16); \
369 HALF_ROUND(v2,v1,v0,v3,17,21);
370
371
372 static uint64_t
siphash24(uint64_t k0,uint64_t k1,const void * src,Py_ssize_t src_sz)373 siphash24(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
374 uint64_t b = (uint64_t)src_sz << 56;
375 const uint8_t *in = (const uint8_t*)src;
376
377 uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
378 uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
379 uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
380 uint64_t v3 = k1 ^ 0x7465646279746573ULL;
381
382 uint64_t t;
383 uint8_t *pt;
384
385 while (src_sz >= 8) {
386 uint64_t mi;
387 memcpy(&mi, in, sizeof(mi));
388 mi = _le64toh(mi);
389 in += sizeof(mi);
390 src_sz -= sizeof(mi);
391 v3 ^= mi;
392 DOUBLE_ROUND(v0,v1,v2,v3);
393 v0 ^= mi;
394 }
395
396 t = 0;
397 pt = (uint8_t *)&t;
398 switch (src_sz) {
399 case 7: pt[6] = in[6]; /* fall through */
400 case 6: pt[5] = in[5]; /* fall through */
401 case 5: pt[4] = in[4]; /* fall through */
402 case 4: memcpy(pt, in, sizeof(uint32_t)); break;
403 case 3: pt[2] = in[2]; /* fall through */
404 case 2: pt[1] = in[1]; /* fall through */
405 case 1: pt[0] = in[0]; /* fall through */
406 }
407 b |= _le64toh(t);
408
409 v3 ^= b;
410 DOUBLE_ROUND(v0,v1,v2,v3);
411 v0 ^= b;
412 v2 ^= 0xff;
413 DOUBLE_ROUND(v0,v1,v2,v3);
414 DOUBLE_ROUND(v0,v1,v2,v3);
415
416 /* modified */
417 t = (v0 ^ v1) ^ (v2 ^ v3);
418 return t;
419 }
420
421 uint64_t
_Py_KeyedHash(uint64_t key,const void * src,Py_ssize_t src_sz)422 _Py_KeyedHash(uint64_t key, const void *src, Py_ssize_t src_sz)
423 {
424 return siphash24(key, 0, src, src_sz);
425 }
426
427
428 #if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
429 static Py_hash_t
pysiphash(const void * src,Py_ssize_t src_sz)430 pysiphash(const void *src, Py_ssize_t src_sz) {
431 return (Py_hash_t)siphash24(
432 _le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1),
433 src, src_sz);
434 }
435
436 static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash24", 64, 128};
437 #endif
438
439 #ifdef __cplusplus
440 }
441 #endif
442