• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Set of hash utility functions to help maintaining the invariant that
2     if a==b then hash(a)==hash(b)
3 
4    All the utility functions (_Py_Hash*()) return "-1" to signify an error.
5 */
6 #include "Python.h"
7 
8 #ifdef __APPLE__
9 #  include <libkern/OSByteOrder.h>
10 #elif defined(HAVE_LE64TOH) && defined(HAVE_ENDIAN_H)
11 #  include <endian.h>
12 #elif defined(HAVE_LE64TOH) && defined(HAVE_SYS_ENDIAN_H)
13 #  include <sys/endian.h>
14 #endif
15 
16 #ifdef __cplusplus
17 extern "C" {
18 #endif
19 
20 _Py_HashSecret_t _Py_HashSecret = {{0}};
21 
22 #if Py_HASH_ALGORITHM == Py_HASH_EXTERNAL
23 extern PyHash_FuncDef PyHash_Func;
24 #else
25 static PyHash_FuncDef PyHash_Func;
26 #endif
27 
28 /* Count _Py_HashBytes() calls */
29 #ifdef Py_HASH_STATS
30 #define Py_HASH_STATS_MAX 32
31 static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
32 #endif
33 
34 /* For numeric types, the hash of a number x is based on the reduction
35    of x modulo the prime P = 2**_PyHASH_BITS - 1.  It's designed so that
36    hash(x) == hash(y) whenever x and y are numerically equal, even if
37    x and y have different types.
38 
39    A quick summary of the hashing strategy:
40 
41    (1) First define the 'reduction of x modulo P' for any rational
42    number x; this is a standard extension of the usual notion of
43    reduction modulo P for integers.  If x == p/q (written in lowest
44    terms), the reduction is interpreted as the reduction of p times
45    the inverse of the reduction of q, all modulo P; if q is exactly
46    divisible by P then define the reduction to be infinity.  So we've
47    got a well-defined map
48 
49       reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.
50 
51    (2) Now for a rational number x, define hash(x) by:
52 
53       reduce(x)   if x >= 0
54       -reduce(-x) if x < 0
55 
56    If the result of the reduction is infinity (this is impossible for
57    integers, floats and Decimals) then use the predefined hash value
58    _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
59    _PyHASH_INF, -_PyHASH_INF and _PyHASH_NAN are also used for the
60    hashes of float and Decimal infinities and nans.
61 
62    A selling point for the above strategy is that it makes it possible
63    to compute hashes of decimal and binary floating-point numbers
64    efficiently, even if the exponent of the binary or decimal number
65    is large.  The key point is that
66 
67       reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS)
68 
69    provided that {reduce(x), reduce(y)} != {0, infinity}.  The reduction of a
70    binary or decimal float is never infinity, since the denominator is a power
71    of 2 (for binary) or a divisor of a power of 10 (for decimal).  So we have,
72    for nonnegative x,
73 
74       reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS
75 
76       reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS
77 
78    and reduce(10**e) can be computed efficiently by the usual modular
79    exponentiation algorithm.  For reduce(2**e) it's even better: since
80    P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
81    by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.
82 
83    */
84 
85 Py_hash_t
_Py_HashDouble(double v)86 _Py_HashDouble(double v)
87 {
88     int e, sign;
89     double m;
90     Py_uhash_t x, y;
91 
92     if (!Py_IS_FINITE(v)) {
93         if (Py_IS_INFINITY(v))
94             return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
95         else
96             return _PyHASH_NAN;
97     }
98 
99     m = frexp(v, &e);
100 
101     sign = 1;
102     if (m < 0) {
103         sign = -1;
104         m = -m;
105     }
106 
107     /* process 28 bits at a time;  this should work well both for binary
108        and hexadecimal floating point. */
109     x = 0;
110     while (m) {
111         x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28);
112         m *= 268435456.0;  /* 2**28 */
113         e -= 28;
114         y = (Py_uhash_t)m;  /* pull out integer part */
115         m -= y;
116         x += y;
117         if (x >= _PyHASH_MODULUS)
118             x -= _PyHASH_MODULUS;
119     }
120 
121     /* adjust for the exponent;  first reduce it modulo _PyHASH_BITS */
122     e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS);
123     x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e);
124 
125     x = x * sign;
126     if (x == (Py_uhash_t)-1)
127         x = (Py_uhash_t)-2;
128     return (Py_hash_t)x;
129 }
130 
131 Py_hash_t
_Py_HashPointerRaw(const void * p)132 _Py_HashPointerRaw(const void *p)
133 {
134     size_t y = (size_t)p;
135     /* bottom 3 or 4 bits are likely to be 0; rotate y by 4 to avoid
136        excessive hash collisions for dicts and sets */
137     y = (y >> 4) | (y << (8 * SIZEOF_VOID_P - 4));
138     return (Py_hash_t)y;
139 }
140 
141 Py_hash_t
_Py_HashPointer(const void * p)142 _Py_HashPointer(const void *p)
143 {
144     Py_hash_t x = _Py_HashPointerRaw(p);
145     if (x == -1) {
146         x = -2;
147     }
148     return x;
149 }
150 
151 Py_hash_t
_Py_HashBytes(const void * src,Py_ssize_t len)152 _Py_HashBytes(const void *src, Py_ssize_t len)
153 {
154     Py_hash_t x;
155     /*
156       We make the hash of the empty string be 0, rather than using
157       (prefix ^ suffix), since this slightly obfuscates the hash secret
158     */
159     if (len == 0) {
160         return 0;
161     }
162 
163 #ifdef Py_HASH_STATS
164     hashstats[(len <= Py_HASH_STATS_MAX) ? len : 0]++;
165 #endif
166 
167 #if Py_HASH_CUTOFF > 0
168     if (len < Py_HASH_CUTOFF) {
169         /* Optimize hashing of very small strings with inline DJBX33A. */
170         Py_uhash_t hash;
171         const unsigned char *p = src;
172         hash = 5381; /* DJBX33A starts with 5381 */
173 
174         switch(len) {
175             /* ((hash << 5) + hash) + *p == hash * 33 + *p */
176             case 7: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
177             case 6: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
178             case 5: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
179             case 4: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
180             case 3: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
181             case 2: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
182             case 1: hash = ((hash << 5) + hash) + *p++; break;
183             default:
184                 Py_UNREACHABLE();
185         }
186         hash ^= len;
187         hash ^= (Py_uhash_t) _Py_HashSecret.djbx33a.suffix;
188         x = (Py_hash_t)hash;
189     }
190     else
191 #endif /* Py_HASH_CUTOFF */
192         x = PyHash_Func.hash(src, len);
193 
194     if (x == -1)
195         return -2;
196     return x;
197 }
198 
199 void
_PyHash_Fini(void)200 _PyHash_Fini(void)
201 {
202 #ifdef Py_HASH_STATS
203     int i;
204     Py_ssize_t total = 0;
205     const char *fmt = "%2i %8" PY_FORMAT_SIZE_T "d %8" PY_FORMAT_SIZE_T "d\n";
206 
207     fprintf(stderr, "len   calls    total\n");
208     for (i = 1; i <= Py_HASH_STATS_MAX; i++) {
209         total += hashstats[i];
210         fprintf(stderr, fmt, i, hashstats[i], total);
211     }
212     total += hashstats[0];
213     fprintf(stderr, ">  %8" PY_FORMAT_SIZE_T "d %8" PY_FORMAT_SIZE_T "d\n",
214             hashstats[0], total);
215 #endif
216 }
217 
218 PyHash_FuncDef *
PyHash_GetFuncDef(void)219 PyHash_GetFuncDef(void)
220 {
221     return &PyHash_Func;
222 }
223 
224 /* Optimized memcpy() for Windows */
225 #ifdef _MSC_VER
226 #  if SIZEOF_PY_UHASH_T == 4
227 #    define PY_UHASH_CPY(dst, src) do {                                    \
228        dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
229        } while(0)
230 #  elif SIZEOF_PY_UHASH_T == 8
231 #    define PY_UHASH_CPY(dst, src) do {                                    \
232        dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
233        dst[4] = src[4]; dst[5] = src[5]; dst[6] = src[6]; dst[7] = src[7]; \
234        } while(0)
235 #  else
236 #    error SIZEOF_PY_UHASH_T must be 4 or 8
237 #  endif /* SIZEOF_PY_UHASH_T */
238 #else /* not Windows */
239 #  define PY_UHASH_CPY(dst, src) memcpy(dst, src, SIZEOF_PY_UHASH_T)
240 #endif /* _MSC_VER */
241 
242 
243 #if Py_HASH_ALGORITHM == Py_HASH_FNV
244 /* **************************************************************************
245  * Modified Fowler-Noll-Vo (FNV) hash function
246  */
247 static Py_hash_t
fnv(const void * src,Py_ssize_t len)248 fnv(const void *src, Py_ssize_t len)
249 {
250     const unsigned char *p = src;
251     Py_uhash_t x;
252     Py_ssize_t remainder, blocks;
253     union {
254         Py_uhash_t value;
255         unsigned char bytes[SIZEOF_PY_UHASH_T];
256     } block;
257 
258 #ifdef Py_DEBUG
259     assert(_Py_HashSecret_Initialized);
260 #endif
261     remainder = len % SIZEOF_PY_UHASH_T;
262     if (remainder == 0) {
263         /* Process at least one block byte by byte to reduce hash collisions
264          * for strings with common prefixes. */
265         remainder = SIZEOF_PY_UHASH_T;
266     }
267     blocks = (len - remainder) / SIZEOF_PY_UHASH_T;
268 
269     x = (Py_uhash_t) _Py_HashSecret.fnv.prefix;
270     x ^= (Py_uhash_t) *p << 7;
271     while (blocks--) {
272         PY_UHASH_CPY(block.bytes, p);
273         x = (_PyHASH_MULTIPLIER * x) ^ block.value;
274         p += SIZEOF_PY_UHASH_T;
275     }
276     /* add remainder */
277     for (; remainder > 0; remainder--)
278         x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++;
279     x ^= (Py_uhash_t) len;
280     x ^= (Py_uhash_t) _Py_HashSecret.fnv.suffix;
281     if (x == (Py_uhash_t) -1) {
282         x = (Py_uhash_t) -2;
283     }
284     return x;
285 }
286 
287 static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T,
288                                      16 * SIZEOF_PY_HASH_T};
289 
290 #endif /* Py_HASH_ALGORITHM == Py_HASH_FNV */
291 
292 
293 /* **************************************************************************
294  <MIT License>
295  Copyright (c) 2013  Marek Majkowski <marek@popcount.org>
296 
297  Permission is hereby granted, free of charge, to any person obtaining a copy
298  of this software and associated documentation files (the "Software"), to deal
299  in the Software without restriction, including without limitation the rights
300  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
301  copies of the Software, and to permit persons to whom the Software is
302  furnished to do so, subject to the following conditions:
303 
304  The above copyright notice and this permission notice shall be included in
305  all copies or substantial portions of the Software.
306 
307  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
308  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
309  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
310  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
311  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
312  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
313  THE SOFTWARE.
314  </MIT License>
315 
316  Original location:
317     https://github.com/majek/csiphash/
318 
319  Solution inspired by code from:
320     Samuel Neves (supercop/crypto_auth/siphash24/little)
321     djb (supercop/crypto_auth/siphash24/little2)
322     Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c)
323 
324  Modified for Python by Christian Heimes:
325     - C89 / MSVC compatibility
326     - _rotl64() on Windows
327     - letoh64() fallback
328 */
329 
330 /* byte swap little endian to host endian
331  * Endian conversion not only ensures that the hash function returns the same
332  * value on all platforms. It is also required to for a good dispersion of
333  * the hash values' least significant bits.
334  */
335 #if PY_LITTLE_ENDIAN
336 #  define _le64toh(x) ((uint64_t)(x))
337 #elif defined(__APPLE__)
338 #  define _le64toh(x) OSSwapLittleToHostInt64(x)
339 #elif defined(HAVE_LETOH64)
340 #  define _le64toh(x) le64toh(x)
341 #else
342 #  define _le64toh(x) (((uint64_t)(x) << 56) | \
343                       (((uint64_t)(x) << 40) & 0xff000000000000ULL) | \
344                       (((uint64_t)(x) << 24) & 0xff0000000000ULL) | \
345                       (((uint64_t)(x) << 8)  & 0xff00000000ULL) | \
346                       (((uint64_t)(x) >> 8)  & 0xff000000ULL) | \
347                       (((uint64_t)(x) >> 24) & 0xff0000ULL) | \
348                       (((uint64_t)(x) >> 40) & 0xff00ULL) | \
349                       ((uint64_t)(x)  >> 56))
350 #endif
351 
352 
353 #ifdef _MSC_VER
354 #  define ROTATE(x, b)  _rotl64(x, b)
355 #else
356 #  define ROTATE(x, b) (uint64_t)( ((x) << (b)) | ( (x) >> (64 - (b))) )
357 #endif
358 
359 #define HALF_ROUND(a,b,c,d,s,t)         \
360     a += b; c += d;             \
361     b = ROTATE(b, s) ^ a;           \
362     d = ROTATE(d, t) ^ c;           \
363     a = ROTATE(a, 32);
364 
365 #define DOUBLE_ROUND(v0,v1,v2,v3)       \
366     HALF_ROUND(v0,v1,v2,v3,13,16);      \
367     HALF_ROUND(v2,v1,v0,v3,17,21);      \
368     HALF_ROUND(v0,v1,v2,v3,13,16);      \
369     HALF_ROUND(v2,v1,v0,v3,17,21);
370 
371 
372 static uint64_t
siphash24(uint64_t k0,uint64_t k1,const void * src,Py_ssize_t src_sz)373 siphash24(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
374     uint64_t b = (uint64_t)src_sz << 56;
375     const uint8_t *in = (const uint8_t*)src;
376 
377     uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
378     uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
379     uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
380     uint64_t v3 = k1 ^ 0x7465646279746573ULL;
381 
382     uint64_t t;
383     uint8_t *pt;
384 
385     while (src_sz >= 8) {
386         uint64_t mi;
387         memcpy(&mi, in, sizeof(mi));
388         mi = _le64toh(mi);
389         in += sizeof(mi);
390         src_sz -= sizeof(mi);
391         v3 ^= mi;
392         DOUBLE_ROUND(v0,v1,v2,v3);
393         v0 ^= mi;
394     }
395 
396     t = 0;
397     pt = (uint8_t *)&t;
398     switch (src_sz) {
399         case 7: pt[6] = in[6]; /* fall through */
400         case 6: pt[5] = in[5]; /* fall through */
401         case 5: pt[4] = in[4]; /* fall through */
402         case 4: memcpy(pt, in, sizeof(uint32_t)); break;
403         case 3: pt[2] = in[2]; /* fall through */
404         case 2: pt[1] = in[1]; /* fall through */
405         case 1: pt[0] = in[0]; /* fall through */
406     }
407     b |= _le64toh(t);
408 
409     v3 ^= b;
410     DOUBLE_ROUND(v0,v1,v2,v3);
411     v0 ^= b;
412     v2 ^= 0xff;
413     DOUBLE_ROUND(v0,v1,v2,v3);
414     DOUBLE_ROUND(v0,v1,v2,v3);
415 
416     /* modified */
417     t = (v0 ^ v1) ^ (v2 ^ v3);
418     return t;
419 }
420 
421 uint64_t
_Py_KeyedHash(uint64_t key,const void * src,Py_ssize_t src_sz)422 _Py_KeyedHash(uint64_t key, const void *src, Py_ssize_t src_sz)
423 {
424     return siphash24(key, 0, src, src_sz);
425 }
426 
427 
428 #if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
429 static Py_hash_t
pysiphash(const void * src,Py_ssize_t src_sz)430 pysiphash(const void *src, Py_ssize_t src_sz) {
431     return (Py_hash_t)siphash24(
432         _le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1),
433         src, src_sz);
434 }
435 
436 static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash24", 64, 128};
437 #endif
438 
439 #ifdef __cplusplus
440 }
441 #endif
442