1 /* Set of hash utility functions to help maintaining the invariant that
2 if a==b then hash(a)==hash(b)
3
4 All the utility functions (_Py_Hash*()) return "-1" to signify an error.
5 */
6 #include "Python.h"
7 #include "pycore_pyhash.h" // _Py_HashSecret_t
8
9 #ifdef __APPLE__
10 # include <libkern/OSByteOrder.h>
11 #elif defined(HAVE_LE64TOH) && defined(HAVE_ENDIAN_H)
12 # include <endian.h>
13 #elif defined(HAVE_LE64TOH) && defined(HAVE_SYS_ENDIAN_H)
14 # include <sys/endian.h>
15 #endif
16
17 _Py_HashSecret_t _Py_HashSecret = {{0}};
18
19 #if Py_HASH_ALGORITHM == Py_HASH_EXTERNAL
20 extern PyHash_FuncDef PyHash_Func;
21 #else
22 static PyHash_FuncDef PyHash_Func;
23 #endif
24
25 /* Count _Py_HashBytes() calls */
26 #ifdef Py_HASH_STATS
27 #define Py_HASH_STATS_MAX 32
28 static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
29 #endif
30
31 /* For numeric types, the hash of a number x is based on the reduction
32 of x modulo the prime P = 2**_PyHASH_BITS - 1. It's designed so that
33 hash(x) == hash(y) whenever x and y are numerically equal, even if
34 x and y have different types.
35
36 A quick summary of the hashing strategy:
37
38 (1) First define the 'reduction of x modulo P' for any rational
39 number x; this is a standard extension of the usual notion of
40 reduction modulo P for integers. If x == p/q (written in lowest
41 terms), the reduction is interpreted as the reduction of p times
42 the inverse of the reduction of q, all modulo P; if q is exactly
43 divisible by P then define the reduction to be infinity. So we've
44 got a well-defined map
45
46 reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.
47
48 (2) Now for a rational number x, define hash(x) by:
49
50 reduce(x) if x >= 0
51 -reduce(-x) if x < 0
52
53 If the result of the reduction is infinity (this is impossible for
54 integers, floats and Decimals) then use the predefined hash value
55 _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
56 _PyHASH_INF and -_PyHASH_INF are also used for the
57 hashes of float and Decimal infinities.
58
59 NaNs hash with a pointer hash. Having distinct hash values prevents
60 catastrophic pileups from distinct NaN instances which used to always
61 have the same hash value but would compare unequal.
62
63 A selling point for the above strategy is that it makes it possible
64 to compute hashes of decimal and binary floating-point numbers
65 efficiently, even if the exponent of the binary or decimal number
66 is large. The key point is that
67
68 reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS)
69
70 provided that {reduce(x), reduce(y)} != {0, infinity}. The reduction of a
71 binary or decimal float is never infinity, since the denominator is a power
72 of 2 (for binary) or a divisor of a power of 10 (for decimal). So we have,
73 for nonnegative x,
74
75 reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS
76
77 reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS
78
79 and reduce(10**e) can be computed efficiently by the usual modular
80 exponentiation algorithm. For reduce(2**e) it's even better: since
81 P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
82 by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.
83
84 */
85
86 Py_hash_t
_Py_HashDouble(PyObject * inst,double v)87 _Py_HashDouble(PyObject *inst, double v)
88 {
89 int e, sign;
90 double m;
91 Py_uhash_t x, y;
92
93 if (!Py_IS_FINITE(v)) {
94 if (Py_IS_INFINITY(v))
95 return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
96 else
97 return PyObject_GenericHash(inst);
98 }
99
100 m = frexp(v, &e);
101
102 sign = 1;
103 if (m < 0) {
104 sign = -1;
105 m = -m;
106 }
107
108 /* process 28 bits at a time; this should work well both for binary
109 and hexadecimal floating point. */
110 x = 0;
111 while (m) {
112 x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28);
113 m *= 268435456.0; /* 2**28 */
114 e -= 28;
115 y = (Py_uhash_t)m; /* pull out integer part */
116 m -= y;
117 x += y;
118 if (x >= _PyHASH_MODULUS)
119 x -= _PyHASH_MODULUS;
120 }
121
122 /* adjust for the exponent; first reduce it modulo _PyHASH_BITS */
123 e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS);
124 x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e);
125
126 x = x * sign;
127 if (x == (Py_uhash_t)-1)
128 x = (Py_uhash_t)-2;
129 return (Py_hash_t)x;
130 }
131
132 Py_hash_t
Py_HashPointer(const void * ptr)133 Py_HashPointer(const void *ptr)
134 {
135 Py_hash_t hash = _Py_HashPointerRaw(ptr);
136 if (hash == -1) {
137 hash = -2;
138 }
139 return hash;
140 }
141
142 Py_hash_t
PyObject_GenericHash(PyObject * obj)143 PyObject_GenericHash(PyObject *obj)
144 {
145 return Py_HashPointer(obj);
146 }
147
148 Py_hash_t
_Py_HashBytes(const void * src,Py_ssize_t len)149 _Py_HashBytes(const void *src, Py_ssize_t len)
150 {
151 Py_hash_t x;
152 /*
153 We make the hash of the empty string be 0, rather than using
154 (prefix ^ suffix), since this slightly obfuscates the hash secret
155 */
156 if (len == 0) {
157 return 0;
158 }
159
160 #ifdef Py_HASH_STATS
161 hashstats[(len <= Py_HASH_STATS_MAX) ? len : 0]++;
162 #endif
163
164 #if Py_HASH_CUTOFF > 0
165 if (len < Py_HASH_CUTOFF) {
166 /* Optimize hashing of very small strings with inline DJBX33A. */
167 Py_uhash_t hash;
168 const unsigned char *p = src;
169 hash = 5381; /* DJBX33A starts with 5381 */
170
171 switch(len) {
172 /* ((hash << 5) + hash) + *p == hash * 33 + *p */
173 case 7: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
174 case 6: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
175 case 5: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
176 case 4: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
177 case 3: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
178 case 2: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
179 case 1: hash = ((hash << 5) + hash) + *p++; break;
180 default:
181 Py_UNREACHABLE();
182 }
183 hash ^= len;
184 hash ^= (Py_uhash_t) _Py_HashSecret.djbx33a.suffix;
185 x = (Py_hash_t)hash;
186 }
187 else
188 #endif /* Py_HASH_CUTOFF */
189 x = PyHash_Func.hash(src, len);
190
191 if (x == -1)
192 return -2;
193 return x;
194 }
195
196 void
_PyHash_Fini(void)197 _PyHash_Fini(void)
198 {
199 #ifdef Py_HASH_STATS
200 fprintf(stderr, "len calls total\n");
201 Py_ssize_t total = 0;
202 for (int i = 1; i <= Py_HASH_STATS_MAX; i++) {
203 total += hashstats[i];
204 fprintf(stderr, "%2i %8zd %8zd\n", i, hashstats[i], total);
205 }
206 total += hashstats[0];
207 fprintf(stderr, "> %8zd %8zd\n", hashstats[0], total);
208 #endif
209 }
210
211 PyHash_FuncDef *
PyHash_GetFuncDef(void)212 PyHash_GetFuncDef(void)
213 {
214 return &PyHash_Func;
215 }
216
217 /* Optimized memcpy() for Windows */
218 #ifdef _MSC_VER
219 # if SIZEOF_PY_UHASH_T == 4
220 # define PY_UHASH_CPY(dst, src) do { \
221 dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
222 } while(0)
223 # elif SIZEOF_PY_UHASH_T == 8
224 # define PY_UHASH_CPY(dst, src) do { \
225 dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
226 dst[4] = src[4]; dst[5] = src[5]; dst[6] = src[6]; dst[7] = src[7]; \
227 } while(0)
228 # else
229 # error SIZEOF_PY_UHASH_T must be 4 or 8
230 # endif /* SIZEOF_PY_UHASH_T */
231 #else /* not Windows */
232 # define PY_UHASH_CPY(dst, src) memcpy(dst, src, SIZEOF_PY_UHASH_T)
233 #endif /* _MSC_VER */
234
235
236 #if Py_HASH_ALGORITHM == Py_HASH_FNV
237 /* **************************************************************************
238 * Modified Fowler-Noll-Vo (FNV) hash function
239 */
240 static Py_hash_t
fnv(const void * src,Py_ssize_t len)241 fnv(const void *src, Py_ssize_t len)
242 {
243 const unsigned char *p = src;
244 Py_uhash_t x;
245 Py_ssize_t remainder, blocks;
246 union {
247 Py_uhash_t value;
248 unsigned char bytes[SIZEOF_PY_UHASH_T];
249 } block;
250
251 #ifdef Py_DEBUG
252 assert(_Py_HashSecret_Initialized);
253 #endif
254 remainder = len % SIZEOF_PY_UHASH_T;
255 if (remainder == 0) {
256 /* Process at least one block byte by byte to reduce hash collisions
257 * for strings with common prefixes. */
258 remainder = SIZEOF_PY_UHASH_T;
259 }
260 blocks = (len - remainder) / SIZEOF_PY_UHASH_T;
261
262 x = (Py_uhash_t) _Py_HashSecret.fnv.prefix;
263 x ^= (Py_uhash_t) *p << 7;
264 while (blocks--) {
265 PY_UHASH_CPY(block.bytes, p);
266 x = (PyHASH_MULTIPLIER * x) ^ block.value;
267 p += SIZEOF_PY_UHASH_T;
268 }
269 /* add remainder */
270 for (; remainder > 0; remainder--)
271 x = (PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++;
272 x ^= (Py_uhash_t) len;
273 x ^= (Py_uhash_t) _Py_HashSecret.fnv.suffix;
274 if (x == (Py_uhash_t) -1) {
275 x = (Py_uhash_t) -2;
276 }
277 return x;
278 }
279
280 static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T,
281 16 * SIZEOF_PY_HASH_T};
282
283 #endif /* Py_HASH_ALGORITHM == Py_HASH_FNV */
284
285
286 /* **************************************************************************
287 <MIT License>
288 Copyright (c) 2013 Marek Majkowski <marek@popcount.org>
289
290 Permission is hereby granted, free of charge, to any person obtaining a copy
291 of this software and associated documentation files (the "Software"), to deal
292 in the Software without restriction, including without limitation the rights
293 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
294 copies of the Software, and to permit persons to whom the Software is
295 furnished to do so, subject to the following conditions:
296
297 The above copyright notice and this permission notice shall be included in
298 all copies or substantial portions of the Software.
299
300 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
301 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
302 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
303 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
304 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
305 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
306 THE SOFTWARE.
307 </MIT License>
308
309 Original location:
310 https://github.com/majek/csiphash/
311
312 Solution inspired by code from:
313 Samuel Neves (supercop/crypto_auth/siphash24/little)
314 djb (supercop/crypto_auth/siphash24/little2)
315 Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c)
316
317 Modified for Python by Christian Heimes:
318 - C89 / MSVC compatibility
319 - _rotl64() on Windows
320 - letoh64() fallback
321 */
322
323 /* byte swap little endian to host endian
324 * Endian conversion not only ensures that the hash function returns the same
325 * value on all platforms. It is also required to for a good dispersion of
326 * the hash values' least significant bits.
327 */
328 #if PY_LITTLE_ENDIAN
329 # define _le64toh(x) ((uint64_t)(x))
330 #elif defined(__APPLE__)
331 # define _le64toh(x) OSSwapLittleToHostInt64(x)
332 #elif defined(HAVE_LETOH64)
333 # define _le64toh(x) le64toh(x)
334 #else
335 # define _le64toh(x) (((uint64_t)(x) << 56) | \
336 (((uint64_t)(x) << 40) & 0xff000000000000ULL) | \
337 (((uint64_t)(x) << 24) & 0xff0000000000ULL) | \
338 (((uint64_t)(x) << 8) & 0xff00000000ULL) | \
339 (((uint64_t)(x) >> 8) & 0xff000000ULL) | \
340 (((uint64_t)(x) >> 24) & 0xff0000ULL) | \
341 (((uint64_t)(x) >> 40) & 0xff00ULL) | \
342 ((uint64_t)(x) >> 56))
343 #endif
344
345
346 #ifdef _MSC_VER
347 # define ROTATE(x, b) _rotl64(x, b)
348 #else
349 # define ROTATE(x, b) (uint64_t)( ((x) << (b)) | ( (x) >> (64 - (b))) )
350 #endif
351
352 #define HALF_ROUND(a,b,c,d,s,t) \
353 a += b; c += d; \
354 b = ROTATE(b, s) ^ a; \
355 d = ROTATE(d, t) ^ c; \
356 a = ROTATE(a, 32);
357
358 #define SINGLE_ROUND(v0,v1,v2,v3) \
359 HALF_ROUND(v0,v1,v2,v3,13,16); \
360 HALF_ROUND(v2,v1,v0,v3,17,21);
361
362 #define DOUBLE_ROUND(v0,v1,v2,v3) \
363 SINGLE_ROUND(v0,v1,v2,v3); \
364 SINGLE_ROUND(v0,v1,v2,v3);
365
366
367 static uint64_t
siphash13(uint64_t k0,uint64_t k1,const void * src,Py_ssize_t src_sz)368 siphash13(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
369 uint64_t b = (uint64_t)src_sz << 56;
370 const uint8_t *in = (const uint8_t*)src;
371
372 uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
373 uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
374 uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
375 uint64_t v3 = k1 ^ 0x7465646279746573ULL;
376
377 uint64_t t;
378 uint8_t *pt;
379
380 while (src_sz >= 8) {
381 uint64_t mi;
382 memcpy(&mi, in, sizeof(mi));
383 mi = _le64toh(mi);
384 in += sizeof(mi);
385 src_sz -= sizeof(mi);
386 v3 ^= mi;
387 SINGLE_ROUND(v0,v1,v2,v3);
388 v0 ^= mi;
389 }
390
391 t = 0;
392 pt = (uint8_t *)&t;
393 switch (src_sz) {
394 case 7: pt[6] = in[6]; /* fall through */
395 case 6: pt[5] = in[5]; /* fall through */
396 case 5: pt[4] = in[4]; /* fall through */
397 case 4: memcpy(pt, in, sizeof(uint32_t)); break;
398 case 3: pt[2] = in[2]; /* fall through */
399 case 2: pt[1] = in[1]; /* fall through */
400 case 1: pt[0] = in[0]; /* fall through */
401 }
402 b |= _le64toh(t);
403
404 v3 ^= b;
405 SINGLE_ROUND(v0,v1,v2,v3);
406 v0 ^= b;
407 v2 ^= 0xff;
408 SINGLE_ROUND(v0,v1,v2,v3);
409 SINGLE_ROUND(v0,v1,v2,v3);
410 SINGLE_ROUND(v0,v1,v2,v3);
411
412 /* modified */
413 t = (v0 ^ v1) ^ (v2 ^ v3);
414 return t;
415 }
416
417 #if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
418 static uint64_t
siphash24(uint64_t k0,uint64_t k1,const void * src,Py_ssize_t src_sz)419 siphash24(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
420 uint64_t b = (uint64_t)src_sz << 56;
421 const uint8_t *in = (const uint8_t*)src;
422
423 uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
424 uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
425 uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
426 uint64_t v3 = k1 ^ 0x7465646279746573ULL;
427
428 uint64_t t;
429 uint8_t *pt;
430
431 while (src_sz >= 8) {
432 uint64_t mi;
433 memcpy(&mi, in, sizeof(mi));
434 mi = _le64toh(mi);
435 in += sizeof(mi);
436 src_sz -= sizeof(mi);
437 v3 ^= mi;
438 DOUBLE_ROUND(v0,v1,v2,v3);
439 v0 ^= mi;
440 }
441
442 t = 0;
443 pt = (uint8_t *)&t;
444 switch (src_sz) {
445 case 7: pt[6] = in[6]; /* fall through */
446 case 6: pt[5] = in[5]; /* fall through */
447 case 5: pt[4] = in[4]; /* fall through */
448 case 4: memcpy(pt, in, sizeof(uint32_t)); break;
449 case 3: pt[2] = in[2]; /* fall through */
450 case 2: pt[1] = in[1]; /* fall through */
451 case 1: pt[0] = in[0]; /* fall through */
452 }
453 b |= _le64toh(t);
454
455 v3 ^= b;
456 DOUBLE_ROUND(v0,v1,v2,v3);
457 v0 ^= b;
458 v2 ^= 0xff;
459 DOUBLE_ROUND(v0,v1,v2,v3);
460 DOUBLE_ROUND(v0,v1,v2,v3);
461
462 /* modified */
463 t = (v0 ^ v1) ^ (v2 ^ v3);
464 return t;
465 }
466 #endif
467
468 uint64_t
_Py_KeyedHash(uint64_t key,const void * src,Py_ssize_t src_sz)469 _Py_KeyedHash(uint64_t key, const void *src, Py_ssize_t src_sz)
470 {
471 return siphash13(key, 0, src, src_sz);
472 }
473
474
475 #if Py_HASH_ALGORITHM == Py_HASH_SIPHASH13
476 static Py_hash_t
pysiphash(const void * src,Py_ssize_t src_sz)477 pysiphash(const void *src, Py_ssize_t src_sz) {
478 return (Py_hash_t)siphash13(
479 _le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1),
480 src, src_sz);
481 }
482
483 static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash13", 64, 128};
484 #endif
485
486 #if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
487 static Py_hash_t
pysiphash(const void * src,Py_ssize_t src_sz)488 pysiphash(const void *src, Py_ssize_t src_sz) {
489 return (Py_hash_t)siphash24(
490 _le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1),
491 src, src_sz);
492 }
493
494 static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash24", 64, 128};
495 #endif
496