• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Set of hash utility functions to help maintaining the invariant that
2     if a==b then hash(a)==hash(b)
3 
4    All the utility functions (_Py_Hash*()) return "-1" to signify an error.
5 */
6 #include "Python.h"
7 
8 #ifdef __APPLE__
9 #  include <libkern/OSByteOrder.h>
10 #elif defined(HAVE_LE64TOH) && defined(HAVE_ENDIAN_H)
11 #  include <endian.h>
12 #elif defined(HAVE_LE64TOH) && defined(HAVE_SYS_ENDIAN_H)
13 #  include <sys/endian.h>
14 #endif
15 
16 #ifdef __cplusplus
17 extern "C" {
18 #endif
19 
20 _Py_HashSecret_t _Py_HashSecret = {{0}};
21 
22 #if Py_HASH_ALGORITHM == Py_HASH_EXTERNAL
23 extern PyHash_FuncDef PyHash_Func;
24 #else
25 static PyHash_FuncDef PyHash_Func;
26 #endif
27 
28 /* Count _Py_HashBytes() calls */
29 #ifdef Py_HASH_STATS
30 #define Py_HASH_STATS_MAX 32
31 static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
32 #endif
33 
34 /* For numeric types, the hash of a number x is based on the reduction
35    of x modulo the prime P = 2**_PyHASH_BITS - 1.  It's designed so that
36    hash(x) == hash(y) whenever x and y are numerically equal, even if
37    x and y have different types.
38 
39    A quick summary of the hashing strategy:
40 
41    (1) First define the 'reduction of x modulo P' for any rational
42    number x; this is a standard extension of the usual notion of
43    reduction modulo P for integers.  If x == p/q (written in lowest
44    terms), the reduction is interpreted as the reduction of p times
45    the inverse of the reduction of q, all modulo P; if q is exactly
46    divisible by P then define the reduction to be infinity.  So we've
47    got a well-defined map
48 
49       reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.
50 
51    (2) Now for a rational number x, define hash(x) by:
52 
53       reduce(x)   if x >= 0
54       -reduce(-x) if x < 0
55 
56    If the result of the reduction is infinity (this is impossible for
57    integers, floats and Decimals) then use the predefined hash value
58    _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
59    _PyHASH_INF and -_PyHASH_INF are also used for the
60    hashes of float and Decimal infinities.
61 
62    NaNs hash with a pointer hash.  Having distinct hash values prevents
63    catastrophic pileups from distinct NaN instances which used to always
64    have the same hash value but would compare unequal.
65 
66    A selling point for the above strategy is that it makes it possible
67    to compute hashes of decimal and binary floating-point numbers
68    efficiently, even if the exponent of the binary or decimal number
69    is large.  The key point is that
70 
71       reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS)
72 
73    provided that {reduce(x), reduce(y)} != {0, infinity}.  The reduction of a
74    binary or decimal float is never infinity, since the denominator is a power
75    of 2 (for binary) or a divisor of a power of 10 (for decimal).  So we have,
76    for nonnegative x,
77 
78       reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS
79 
80       reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS
81 
82    and reduce(10**e) can be computed efficiently by the usual modular
83    exponentiation algorithm.  For reduce(2**e) it's even better: since
84    P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
85    by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.
86 
87    */
88 
89 Py_hash_t _Py_HashPointer(const void *);
90 
91 Py_hash_t
_Py_HashDouble(PyObject * inst,double v)92 _Py_HashDouble(PyObject *inst, double v)
93 {
94     int e, sign;
95     double m;
96     Py_uhash_t x, y;
97 
98     if (!Py_IS_FINITE(v)) {
99         if (Py_IS_INFINITY(v))
100             return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
101         else
102             return _Py_HashPointer(inst);
103     }
104 
105     m = frexp(v, &e);
106 
107     sign = 1;
108     if (m < 0) {
109         sign = -1;
110         m = -m;
111     }
112 
113     /* process 28 bits at a time;  this should work well both for binary
114        and hexadecimal floating point. */
115     x = 0;
116     while (m) {
117         x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28);
118         m *= 268435456.0;  /* 2**28 */
119         e -= 28;
120         y = (Py_uhash_t)m;  /* pull out integer part */
121         m -= y;
122         x += y;
123         if (x >= _PyHASH_MODULUS)
124             x -= _PyHASH_MODULUS;
125     }
126 
127     /* adjust for the exponent;  first reduce it modulo _PyHASH_BITS */
128     e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS);
129     x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e);
130 
131     x = x * sign;
132     if (x == (Py_uhash_t)-1)
133         x = (Py_uhash_t)-2;
134     return (Py_hash_t)x;
135 }
136 
137 Py_hash_t
_Py_HashPointerRaw(const void * p)138 _Py_HashPointerRaw(const void *p)
139 {
140     size_t y = (size_t)p;
141     /* bottom 3 or 4 bits are likely to be 0; rotate y by 4 to avoid
142        excessive hash collisions for dicts and sets */
143     y = (y >> 4) | (y << (8 * SIZEOF_VOID_P - 4));
144     return (Py_hash_t)y;
145 }
146 
147 Py_hash_t
_Py_HashPointer(const void * p)148 _Py_HashPointer(const void *p)
149 {
150     Py_hash_t x = _Py_HashPointerRaw(p);
151     if (x == -1) {
152         x = -2;
153     }
154     return x;
155 }
156 
157 Py_hash_t
_Py_HashBytes(const void * src,Py_ssize_t len)158 _Py_HashBytes(const void *src, Py_ssize_t len)
159 {
160     Py_hash_t x;
161     /*
162       We make the hash of the empty string be 0, rather than using
163       (prefix ^ suffix), since this slightly obfuscates the hash secret
164     */
165     if (len == 0) {
166         return 0;
167     }
168 
169 #ifdef Py_HASH_STATS
170     hashstats[(len <= Py_HASH_STATS_MAX) ? len : 0]++;
171 #endif
172 
173 #if Py_HASH_CUTOFF > 0
174     if (len < Py_HASH_CUTOFF) {
175         /* Optimize hashing of very small strings with inline DJBX33A. */
176         Py_uhash_t hash;
177         const unsigned char *p = src;
178         hash = 5381; /* DJBX33A starts with 5381 */
179 
180         switch(len) {
181             /* ((hash << 5) + hash) + *p == hash * 33 + *p */
182             case 7: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
183             case 6: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
184             case 5: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
185             case 4: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
186             case 3: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
187             case 2: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
188             case 1: hash = ((hash << 5) + hash) + *p++; break;
189             default:
190                 Py_UNREACHABLE();
191         }
192         hash ^= len;
193         hash ^= (Py_uhash_t) _Py_HashSecret.djbx33a.suffix;
194         x = (Py_hash_t)hash;
195     }
196     else
197 #endif /* Py_HASH_CUTOFF */
198         x = PyHash_Func.hash(src, len);
199 
200     if (x == -1)
201         return -2;
202     return x;
203 }
204 
205 void
_PyHash_Fini(void)206 _PyHash_Fini(void)
207 {
208 #ifdef Py_HASH_STATS
209     fprintf(stderr, "len   calls    total\n");
210     Py_ssize_t total = 0;
211     for (int i = 1; i <= Py_HASH_STATS_MAX; i++) {
212         total += hashstats[i];
213         fprintf(stderr, "%2i %8zd %8zd\n", i, hashstats[i], total);
214     }
215     total += hashstats[0];
216     fprintf(stderr, ">  %8zd %8zd\n", hashstats[0], total);
217 #endif
218 }
219 
220 PyHash_FuncDef *
PyHash_GetFuncDef(void)221 PyHash_GetFuncDef(void)
222 {
223     return &PyHash_Func;
224 }
225 
226 /* Optimized memcpy() for Windows */
227 #ifdef _MSC_VER
228 #  if SIZEOF_PY_UHASH_T == 4
229 #    define PY_UHASH_CPY(dst, src) do {                                    \
230        dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
231        } while(0)
232 #  elif SIZEOF_PY_UHASH_T == 8
233 #    define PY_UHASH_CPY(dst, src) do {                                    \
234        dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
235        dst[4] = src[4]; dst[5] = src[5]; dst[6] = src[6]; dst[7] = src[7]; \
236        } while(0)
237 #  else
238 #    error SIZEOF_PY_UHASH_T must be 4 or 8
239 #  endif /* SIZEOF_PY_UHASH_T */
240 #else /* not Windows */
241 #  define PY_UHASH_CPY(dst, src) memcpy(dst, src, SIZEOF_PY_UHASH_T)
242 #endif /* _MSC_VER */
243 
244 
245 #if Py_HASH_ALGORITHM == Py_HASH_FNV
246 /* **************************************************************************
247  * Modified Fowler-Noll-Vo (FNV) hash function
248  */
249 static Py_hash_t
fnv(const void * src,Py_ssize_t len)250 fnv(const void *src, Py_ssize_t len)
251 {
252     const unsigned char *p = src;
253     Py_uhash_t x;
254     Py_ssize_t remainder, blocks;
255     union {
256         Py_uhash_t value;
257         unsigned char bytes[SIZEOF_PY_UHASH_T];
258     } block;
259 
260 #ifdef Py_DEBUG
261     assert(_Py_HashSecret_Initialized);
262 #endif
263     remainder = len % SIZEOF_PY_UHASH_T;
264     if (remainder == 0) {
265         /* Process at least one block byte by byte to reduce hash collisions
266          * for strings with common prefixes. */
267         remainder = SIZEOF_PY_UHASH_T;
268     }
269     blocks = (len - remainder) / SIZEOF_PY_UHASH_T;
270 
271     x = (Py_uhash_t) _Py_HashSecret.fnv.prefix;
272     x ^= (Py_uhash_t) *p << 7;
273     while (blocks--) {
274         PY_UHASH_CPY(block.bytes, p);
275         x = (_PyHASH_MULTIPLIER * x) ^ block.value;
276         p += SIZEOF_PY_UHASH_T;
277     }
278     /* add remainder */
279     for (; remainder > 0; remainder--)
280         x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++;
281     x ^= (Py_uhash_t) len;
282     x ^= (Py_uhash_t) _Py_HashSecret.fnv.suffix;
283     if (x == (Py_uhash_t) -1) {
284         x = (Py_uhash_t) -2;
285     }
286     return x;
287 }
288 
289 static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T,
290                                      16 * SIZEOF_PY_HASH_T};
291 
292 #endif /* Py_HASH_ALGORITHM == Py_HASH_FNV */
293 
294 
295 /* **************************************************************************
296  <MIT License>
297  Copyright (c) 2013  Marek Majkowski <marek@popcount.org>
298 
299  Permission is hereby granted, free of charge, to any person obtaining a copy
300  of this software and associated documentation files (the "Software"), to deal
301  in the Software without restriction, including without limitation the rights
302  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
303  copies of the Software, and to permit persons to whom the Software is
304  furnished to do so, subject to the following conditions:
305 
306  The above copyright notice and this permission notice shall be included in
307  all copies or substantial portions of the Software.
308 
309  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
310  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
311  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
312  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
313  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
314  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
315  THE SOFTWARE.
316  </MIT License>
317 
318  Original location:
319     https://github.com/majek/csiphash/
320 
321  Solution inspired by code from:
322     Samuel Neves (supercop/crypto_auth/siphash24/little)
323     djb (supercop/crypto_auth/siphash24/little2)
324     Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c)
325 
326  Modified for Python by Christian Heimes:
327     - C89 / MSVC compatibility
328     - _rotl64() on Windows
329     - letoh64() fallback
330 */
331 
332 /* byte swap little endian to host endian
333  * Endian conversion not only ensures that the hash function returns the same
334  * value on all platforms. It is also required to for a good dispersion of
335  * the hash values' least significant bits.
336  */
337 #if PY_LITTLE_ENDIAN
338 #  define _le64toh(x) ((uint64_t)(x))
339 #elif defined(__APPLE__)
340 #  define _le64toh(x) OSSwapLittleToHostInt64(x)
341 #elif defined(HAVE_LETOH64)
342 #  define _le64toh(x) le64toh(x)
343 #else
344 #  define _le64toh(x) (((uint64_t)(x) << 56) | \
345                       (((uint64_t)(x) << 40) & 0xff000000000000ULL) | \
346                       (((uint64_t)(x) << 24) & 0xff0000000000ULL) | \
347                       (((uint64_t)(x) << 8)  & 0xff00000000ULL) | \
348                       (((uint64_t)(x) >> 8)  & 0xff000000ULL) | \
349                       (((uint64_t)(x) >> 24) & 0xff0000ULL) | \
350                       (((uint64_t)(x) >> 40) & 0xff00ULL) | \
351                       ((uint64_t)(x)  >> 56))
352 #endif
353 
354 
355 #ifdef _MSC_VER
356 #  define ROTATE(x, b)  _rotl64(x, b)
357 #else
358 #  define ROTATE(x, b) (uint64_t)( ((x) << (b)) | ( (x) >> (64 - (b))) )
359 #endif
360 
361 #define HALF_ROUND(a,b,c,d,s,t)         \
362     a += b; c += d;             \
363     b = ROTATE(b, s) ^ a;           \
364     d = ROTATE(d, t) ^ c;           \
365     a = ROTATE(a, 32);
366 
367 #define DOUBLE_ROUND(v0,v1,v2,v3)       \
368     HALF_ROUND(v0,v1,v2,v3,13,16);      \
369     HALF_ROUND(v2,v1,v0,v3,17,21);      \
370     HALF_ROUND(v0,v1,v2,v3,13,16);      \
371     HALF_ROUND(v2,v1,v0,v3,17,21);
372 
373 
374 static uint64_t
siphash24(uint64_t k0,uint64_t k1,const void * src,Py_ssize_t src_sz)375 siphash24(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
376     uint64_t b = (uint64_t)src_sz << 56;
377     const uint8_t *in = (const uint8_t*)src;
378 
379     uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
380     uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
381     uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
382     uint64_t v3 = k1 ^ 0x7465646279746573ULL;
383 
384     uint64_t t;
385     uint8_t *pt;
386 
387     while (src_sz >= 8) {
388         uint64_t mi;
389         memcpy(&mi, in, sizeof(mi));
390         mi = _le64toh(mi);
391         in += sizeof(mi);
392         src_sz -= sizeof(mi);
393         v3 ^= mi;
394         DOUBLE_ROUND(v0,v1,v2,v3);
395         v0 ^= mi;
396     }
397 
398     t = 0;
399     pt = (uint8_t *)&t;
400     switch (src_sz) {
401         case 7: pt[6] = in[6]; /* fall through */
402         case 6: pt[5] = in[5]; /* fall through */
403         case 5: pt[4] = in[4]; /* fall through */
404         case 4: memcpy(pt, in, sizeof(uint32_t)); break;
405         case 3: pt[2] = in[2]; /* fall through */
406         case 2: pt[1] = in[1]; /* fall through */
407         case 1: pt[0] = in[0]; /* fall through */
408     }
409     b |= _le64toh(t);
410 
411     v3 ^= b;
412     DOUBLE_ROUND(v0,v1,v2,v3);
413     v0 ^= b;
414     v2 ^= 0xff;
415     DOUBLE_ROUND(v0,v1,v2,v3);
416     DOUBLE_ROUND(v0,v1,v2,v3);
417 
418     /* modified */
419     t = (v0 ^ v1) ^ (v2 ^ v3);
420     return t;
421 }
422 
423 uint64_t
_Py_KeyedHash(uint64_t key,const void * src,Py_ssize_t src_sz)424 _Py_KeyedHash(uint64_t key, const void *src, Py_ssize_t src_sz)
425 {
426     return siphash24(key, 0, src, src_sz);
427 }
428 
429 
430 #if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
431 static Py_hash_t
pysiphash(const void * src,Py_ssize_t src_sz)432 pysiphash(const void *src, Py_ssize_t src_sz) {
433     return (Py_hash_t)siphash24(
434         _le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1),
435         src, src_sz);
436 }
437 
438 static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash24", 64, 128};
439 #endif
440 
441 #ifdef __cplusplus
442 }
443 #endif
444