• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Set of hash utility functions to help maintaining the invariant that
2     if a==b then hash(a)==hash(b)
3 
4    All the utility functions (_Py_Hash*()) return "-1" to signify an error.
5 */
6 #include "Python.h"
7 #include "pycore_pyhash.h"        // _Py_HashSecret_t
8 
9 #ifdef __APPLE__
10 #  include <libkern/OSByteOrder.h>
11 #elif defined(HAVE_LE64TOH) && defined(HAVE_ENDIAN_H)
12 #  include <endian.h>
13 #elif defined(HAVE_LE64TOH) && defined(HAVE_SYS_ENDIAN_H)
14 #  include <sys/endian.h>
15 #endif
16 
17 _Py_HashSecret_t _Py_HashSecret = {{0}};
18 
19 #if Py_HASH_ALGORITHM == Py_HASH_EXTERNAL
20 extern PyHash_FuncDef PyHash_Func;
21 #else
22 static PyHash_FuncDef PyHash_Func;
23 #endif
24 
25 /* Count _Py_HashBytes() calls */
26 #ifdef Py_HASH_STATS
27 #define Py_HASH_STATS_MAX 32
28 static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
29 #endif
30 
31 /* For numeric types, the hash of a number x is based on the reduction
32    of x modulo the prime P = 2**_PyHASH_BITS - 1.  It's designed so that
33    hash(x) == hash(y) whenever x and y are numerically equal, even if
34    x and y have different types.
35 
36    A quick summary of the hashing strategy:
37 
38    (1) First define the 'reduction of x modulo P' for any rational
39    number x; this is a standard extension of the usual notion of
40    reduction modulo P for integers.  If x == p/q (written in lowest
41    terms), the reduction is interpreted as the reduction of p times
42    the inverse of the reduction of q, all modulo P; if q is exactly
43    divisible by P then define the reduction to be infinity.  So we've
44    got a well-defined map
45 
46       reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.
47 
48    (2) Now for a rational number x, define hash(x) by:
49 
50       reduce(x)   if x >= 0
51       -reduce(-x) if x < 0
52 
53    If the result of the reduction is infinity (this is impossible for
54    integers, floats and Decimals) then use the predefined hash value
55    _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
56    _PyHASH_INF and -_PyHASH_INF are also used for the
57    hashes of float and Decimal infinities.
58 
59    NaNs hash with a pointer hash.  Having distinct hash values prevents
60    catastrophic pileups from distinct NaN instances which used to always
61    have the same hash value but would compare unequal.
62 
63    A selling point for the above strategy is that it makes it possible
64    to compute hashes of decimal and binary floating-point numbers
65    efficiently, even if the exponent of the binary or decimal number
66    is large.  The key point is that
67 
68       reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS)
69 
70    provided that {reduce(x), reduce(y)} != {0, infinity}.  The reduction of a
71    binary or decimal float is never infinity, since the denominator is a power
72    of 2 (for binary) or a divisor of a power of 10 (for decimal).  So we have,
73    for nonnegative x,
74 
75       reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS
76 
77       reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS
78 
79    and reduce(10**e) can be computed efficiently by the usual modular
80    exponentiation algorithm.  For reduce(2**e) it's even better: since
81    P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
82    by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.
83 
84    */
85 
86 Py_hash_t
_Py_HashDouble(PyObject * inst,double v)87 _Py_HashDouble(PyObject *inst, double v)
88 {
89     int e, sign;
90     double m;
91     Py_uhash_t x, y;
92 
93     if (!Py_IS_FINITE(v)) {
94         if (Py_IS_INFINITY(v))
95             return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
96         else
97             return PyObject_GenericHash(inst);
98     }
99 
100     m = frexp(v, &e);
101 
102     sign = 1;
103     if (m < 0) {
104         sign = -1;
105         m = -m;
106     }
107 
108     /* process 28 bits at a time;  this should work well both for binary
109        and hexadecimal floating point. */
110     x = 0;
111     while (m) {
112         x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28);
113         m *= 268435456.0;  /* 2**28 */
114         e -= 28;
115         y = (Py_uhash_t)m;  /* pull out integer part */
116         m -= y;
117         x += y;
118         if (x >= _PyHASH_MODULUS)
119             x -= _PyHASH_MODULUS;
120     }
121 
122     /* adjust for the exponent;  first reduce it modulo _PyHASH_BITS */
123     e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS);
124     x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e);
125 
126     x = x * sign;
127     if (x == (Py_uhash_t)-1)
128         x = (Py_uhash_t)-2;
129     return (Py_hash_t)x;
130 }
131 
132 Py_hash_t
Py_HashPointer(const void * ptr)133 Py_HashPointer(const void *ptr)
134 {
135     Py_hash_t hash = _Py_HashPointerRaw(ptr);
136     if (hash == -1) {
137         hash = -2;
138     }
139     return hash;
140 }
141 
142 Py_hash_t
PyObject_GenericHash(PyObject * obj)143 PyObject_GenericHash(PyObject *obj)
144 {
145     return Py_HashPointer(obj);
146 }
147 
148 Py_hash_t
_Py_HashBytes(const void * src,Py_ssize_t len)149 _Py_HashBytes(const void *src, Py_ssize_t len)
150 {
151     Py_hash_t x;
152     /*
153       We make the hash of the empty string be 0, rather than using
154       (prefix ^ suffix), since this slightly obfuscates the hash secret
155     */
156     if (len == 0) {
157         return 0;
158     }
159 
160 #ifdef Py_HASH_STATS
161     hashstats[(len <= Py_HASH_STATS_MAX) ? len : 0]++;
162 #endif
163 
164 #if Py_HASH_CUTOFF > 0
165     if (len < Py_HASH_CUTOFF) {
166         /* Optimize hashing of very small strings with inline DJBX33A. */
167         Py_uhash_t hash;
168         const unsigned char *p = src;
169         hash = 5381; /* DJBX33A starts with 5381 */
170 
171         switch(len) {
172             /* ((hash << 5) + hash) + *p == hash * 33 + *p */
173             case 7: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
174             case 6: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
175             case 5: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
176             case 4: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
177             case 3: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
178             case 2: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
179             case 1: hash = ((hash << 5) + hash) + *p++; break;
180             default:
181                 Py_UNREACHABLE();
182         }
183         hash ^= len;
184         hash ^= (Py_uhash_t) _Py_HashSecret.djbx33a.suffix;
185         x = (Py_hash_t)hash;
186     }
187     else
188 #endif /* Py_HASH_CUTOFF */
189         x = PyHash_Func.hash(src, len);
190 
191     if (x == -1)
192         return -2;
193     return x;
194 }
195 
196 void
_PyHash_Fini(void)197 _PyHash_Fini(void)
198 {
199 #ifdef Py_HASH_STATS
200     fprintf(stderr, "len   calls    total\n");
201     Py_ssize_t total = 0;
202     for (int i = 1; i <= Py_HASH_STATS_MAX; i++) {
203         total += hashstats[i];
204         fprintf(stderr, "%2i %8zd %8zd\n", i, hashstats[i], total);
205     }
206     total += hashstats[0];
207     fprintf(stderr, ">  %8zd %8zd\n", hashstats[0], total);
208 #endif
209 }
210 
211 PyHash_FuncDef *
PyHash_GetFuncDef(void)212 PyHash_GetFuncDef(void)
213 {
214     return &PyHash_Func;
215 }
216 
217 /* Optimized memcpy() for Windows */
218 #ifdef _MSC_VER
219 #  if SIZEOF_PY_UHASH_T == 4
220 #    define PY_UHASH_CPY(dst, src) do {                                    \
221        dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
222        } while(0)
223 #  elif SIZEOF_PY_UHASH_T == 8
224 #    define PY_UHASH_CPY(dst, src) do {                                    \
225        dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
226        dst[4] = src[4]; dst[5] = src[5]; dst[6] = src[6]; dst[7] = src[7]; \
227        } while(0)
228 #  else
229 #    error SIZEOF_PY_UHASH_T must be 4 or 8
230 #  endif /* SIZEOF_PY_UHASH_T */
231 #else /* not Windows */
232 #  define PY_UHASH_CPY(dst, src) memcpy(dst, src, SIZEOF_PY_UHASH_T)
233 #endif /* _MSC_VER */
234 
235 
236 #if Py_HASH_ALGORITHM == Py_HASH_FNV
237 /* **************************************************************************
238  * Modified Fowler-Noll-Vo (FNV) hash function
239  */
240 static Py_hash_t
fnv(const void * src,Py_ssize_t len)241 fnv(const void *src, Py_ssize_t len)
242 {
243     const unsigned char *p = src;
244     Py_uhash_t x;
245     Py_ssize_t remainder, blocks;
246     union {
247         Py_uhash_t value;
248         unsigned char bytes[SIZEOF_PY_UHASH_T];
249     } block;
250 
251 #ifdef Py_DEBUG
252     assert(_Py_HashSecret_Initialized);
253 #endif
254     remainder = len % SIZEOF_PY_UHASH_T;
255     if (remainder == 0) {
256         /* Process at least one block byte by byte to reduce hash collisions
257          * for strings with common prefixes. */
258         remainder = SIZEOF_PY_UHASH_T;
259     }
260     blocks = (len - remainder) / SIZEOF_PY_UHASH_T;
261 
262     x = (Py_uhash_t) _Py_HashSecret.fnv.prefix;
263     x ^= (Py_uhash_t) *p << 7;
264     while (blocks--) {
265         PY_UHASH_CPY(block.bytes, p);
266         x = (PyHASH_MULTIPLIER * x) ^ block.value;
267         p += SIZEOF_PY_UHASH_T;
268     }
269     /* add remainder */
270     for (; remainder > 0; remainder--)
271         x = (PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++;
272     x ^= (Py_uhash_t) len;
273     x ^= (Py_uhash_t) _Py_HashSecret.fnv.suffix;
274     if (x == (Py_uhash_t) -1) {
275         x = (Py_uhash_t) -2;
276     }
277     return x;
278 }
279 
280 static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T,
281                                      16 * SIZEOF_PY_HASH_T};
282 
283 #endif /* Py_HASH_ALGORITHM == Py_HASH_FNV */
284 
285 
286 /* **************************************************************************
287  <MIT License>
288  Copyright (c) 2013  Marek Majkowski <marek@popcount.org>
289 
290  Permission is hereby granted, free of charge, to any person obtaining a copy
291  of this software and associated documentation files (the "Software"), to deal
292  in the Software without restriction, including without limitation the rights
293  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
294  copies of the Software, and to permit persons to whom the Software is
295  furnished to do so, subject to the following conditions:
296 
297  The above copyright notice and this permission notice shall be included in
298  all copies or substantial portions of the Software.
299 
300  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
301  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
302  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
303  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
304  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
305  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
306  THE SOFTWARE.
307  </MIT License>
308 
309  Original location:
310     https://github.com/majek/csiphash/
311 
312  Solution inspired by code from:
313     Samuel Neves (supercop/crypto_auth/siphash24/little)
314     djb (supercop/crypto_auth/siphash24/little2)
315     Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c)
316 
317  Modified for Python by Christian Heimes:
318     - C89 / MSVC compatibility
319     - _rotl64() on Windows
320     - letoh64() fallback
321 */
322 
323 /* byte swap little endian to host endian
324  * Endian conversion not only ensures that the hash function returns the same
325  * value on all platforms. It is also required to for a good dispersion of
326  * the hash values' least significant bits.
327  */
328 #if PY_LITTLE_ENDIAN
329 #  define _le64toh(x) ((uint64_t)(x))
330 #elif defined(__APPLE__)
331 #  define _le64toh(x) OSSwapLittleToHostInt64(x)
332 #elif defined(HAVE_LETOH64)
333 #  define _le64toh(x) le64toh(x)
334 #else
335 #  define _le64toh(x) (((uint64_t)(x) << 56) | \
336                       (((uint64_t)(x) << 40) & 0xff000000000000ULL) | \
337                       (((uint64_t)(x) << 24) & 0xff0000000000ULL) | \
338                       (((uint64_t)(x) << 8)  & 0xff00000000ULL) | \
339                       (((uint64_t)(x) >> 8)  & 0xff000000ULL) | \
340                       (((uint64_t)(x) >> 24) & 0xff0000ULL) | \
341                       (((uint64_t)(x) >> 40) & 0xff00ULL) | \
342                       ((uint64_t)(x)  >> 56))
343 #endif
344 
345 
346 #ifdef _MSC_VER
347 #  define ROTATE(x, b)  _rotl64(x, b)
348 #else
349 #  define ROTATE(x, b) (uint64_t)( ((x) << (b)) | ( (x) >> (64 - (b))) )
350 #endif
351 
352 #define HALF_ROUND(a,b,c,d,s,t)     \
353     a += b; c += d;                 \
354     b = ROTATE(b, s) ^ a;           \
355     d = ROTATE(d, t) ^ c;           \
356     a = ROTATE(a, 32);
357 
358 #define SINGLE_ROUND(v0,v1,v2,v3)   \
359     HALF_ROUND(v0,v1,v2,v3,13,16);  \
360     HALF_ROUND(v2,v1,v0,v3,17,21);
361 
362 #define DOUBLE_ROUND(v0,v1,v2,v3)   \
363     SINGLE_ROUND(v0,v1,v2,v3);      \
364     SINGLE_ROUND(v0,v1,v2,v3);
365 
366 
367 static uint64_t
siphash13(uint64_t k0,uint64_t k1,const void * src,Py_ssize_t src_sz)368 siphash13(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
369     uint64_t b = (uint64_t)src_sz << 56;
370     const uint8_t *in = (const uint8_t*)src;
371 
372     uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
373     uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
374     uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
375     uint64_t v3 = k1 ^ 0x7465646279746573ULL;
376 
377     uint64_t t;
378     uint8_t *pt;
379 
380     while (src_sz >= 8) {
381         uint64_t mi;
382         memcpy(&mi, in, sizeof(mi));
383         mi = _le64toh(mi);
384         in += sizeof(mi);
385         src_sz -= sizeof(mi);
386         v3 ^= mi;
387         SINGLE_ROUND(v0,v1,v2,v3);
388         v0 ^= mi;
389     }
390 
391     t = 0;
392     pt = (uint8_t *)&t;
393     switch (src_sz) {
394         case 7: pt[6] = in[6]; /* fall through */
395         case 6: pt[5] = in[5]; /* fall through */
396         case 5: pt[4] = in[4]; /* fall through */
397         case 4: memcpy(pt, in, sizeof(uint32_t)); break;
398         case 3: pt[2] = in[2]; /* fall through */
399         case 2: pt[1] = in[1]; /* fall through */
400         case 1: pt[0] = in[0]; /* fall through */
401     }
402     b |= _le64toh(t);
403 
404     v3 ^= b;
405     SINGLE_ROUND(v0,v1,v2,v3);
406     v0 ^= b;
407     v2 ^= 0xff;
408     SINGLE_ROUND(v0,v1,v2,v3);
409     SINGLE_ROUND(v0,v1,v2,v3);
410     SINGLE_ROUND(v0,v1,v2,v3);
411 
412     /* modified */
413     t = (v0 ^ v1) ^ (v2 ^ v3);
414     return t;
415 }
416 
417 #if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
418 static uint64_t
siphash24(uint64_t k0,uint64_t k1,const void * src,Py_ssize_t src_sz)419 siphash24(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
420     uint64_t b = (uint64_t)src_sz << 56;
421     const uint8_t *in = (const uint8_t*)src;
422 
423     uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
424     uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
425     uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
426     uint64_t v3 = k1 ^ 0x7465646279746573ULL;
427 
428     uint64_t t;
429     uint8_t *pt;
430 
431     while (src_sz >= 8) {
432         uint64_t mi;
433         memcpy(&mi, in, sizeof(mi));
434         mi = _le64toh(mi);
435         in += sizeof(mi);
436         src_sz -= sizeof(mi);
437         v3 ^= mi;
438         DOUBLE_ROUND(v0,v1,v2,v3);
439         v0 ^= mi;
440     }
441 
442     t = 0;
443     pt = (uint8_t *)&t;
444     switch (src_sz) {
445         case 7: pt[6] = in[6]; /* fall through */
446         case 6: pt[5] = in[5]; /* fall through */
447         case 5: pt[4] = in[4]; /* fall through */
448         case 4: memcpy(pt, in, sizeof(uint32_t)); break;
449         case 3: pt[2] = in[2]; /* fall through */
450         case 2: pt[1] = in[1]; /* fall through */
451         case 1: pt[0] = in[0]; /* fall through */
452     }
453     b |= _le64toh(t);
454 
455     v3 ^= b;
456     DOUBLE_ROUND(v0,v1,v2,v3);
457     v0 ^= b;
458     v2 ^= 0xff;
459     DOUBLE_ROUND(v0,v1,v2,v3);
460     DOUBLE_ROUND(v0,v1,v2,v3);
461 
462     /* modified */
463     t = (v0 ^ v1) ^ (v2 ^ v3);
464     return t;
465 }
466 #endif
467 
468 uint64_t
_Py_KeyedHash(uint64_t key,const void * src,Py_ssize_t src_sz)469 _Py_KeyedHash(uint64_t key, const void *src, Py_ssize_t src_sz)
470 {
471     return siphash13(key, 0, src, src_sz);
472 }
473 
474 
475 #if Py_HASH_ALGORITHM == Py_HASH_SIPHASH13
476 static Py_hash_t
pysiphash(const void * src,Py_ssize_t src_sz)477 pysiphash(const void *src, Py_ssize_t src_sz) {
478     return (Py_hash_t)siphash13(
479         _le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1),
480         src, src_sz);
481 }
482 
483 static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash13", 64, 128};
484 #endif
485 
486 #if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
487 static Py_hash_t
pysiphash(const void * src,Py_ssize_t src_sz)488 pysiphash(const void *src, Py_ssize_t src_sz) {
489     return (Py_hash_t)siphash24(
490         _le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1),
491         src, src_sz);
492 }
493 
494 static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash24", 64, 128};
495 #endif
496