• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Set of hash utility functions to help maintaining the invariant that
2     if a==b then hash(a)==hash(b)
3 
4    All the utility functions (_Py_Hash*()) return "-1" to signify an error.
5 */
6 #include "Python.h"
7 
8 #ifdef __APPLE__
9 #  include <libkern/OSByteOrder.h>
10 #elif defined(HAVE_LE64TOH) && defined(HAVE_ENDIAN_H)
11 #  include <endian.h>
12 #elif defined(HAVE_LE64TOH) && defined(HAVE_SYS_ENDIAN_H)
13 #  include <sys/endian.h>
14 #endif
15 
16 #ifdef __cplusplus
17 extern "C" {
18 #endif
19 
20 _Py_HashSecret_t _Py_HashSecret;
21 
22 #if Py_HASH_ALGORITHM == Py_HASH_EXTERNAL
23 extern PyHash_FuncDef PyHash_Func;
24 #else
25 static PyHash_FuncDef PyHash_Func;
26 #endif
27 
28 /* Count _Py_HashBytes() calls */
29 #ifdef Py_HASH_STATS
30 #define Py_HASH_STATS_MAX 32
31 static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
32 #endif
33 
34 /* For numeric types, the hash of a number x is based on the reduction
35    of x modulo the prime P = 2**_PyHASH_BITS - 1.  It's designed so that
36    hash(x) == hash(y) whenever x and y are numerically equal, even if
37    x and y have different types.
38 
39    A quick summary of the hashing strategy:
40 
41    (1) First define the 'reduction of x modulo P' for any rational
42    number x; this is a standard extension of the usual notion of
43    reduction modulo P for integers.  If x == p/q (written in lowest
44    terms), the reduction is interpreted as the reduction of p times
45    the inverse of the reduction of q, all modulo P; if q is exactly
46    divisible by P then define the reduction to be infinity.  So we've
47    got a well-defined map
48 
49       reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.
50 
51    (2) Now for a rational number x, define hash(x) by:
52 
53       reduce(x)   if x >= 0
54       -reduce(-x) if x < 0
55 
56    If the result of the reduction is infinity (this is impossible for
57    integers, floats and Decimals) then use the predefined hash value
58    _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
59    _PyHASH_INF, -_PyHASH_INF and _PyHASH_NAN are also used for the
60    hashes of float and Decimal infinities and nans.
61 
62    A selling point for the above strategy is that it makes it possible
63    to compute hashes of decimal and binary floating-point numbers
64    efficiently, even if the exponent of the binary or decimal number
65    is large.  The key point is that
66 
67       reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS)
68 
69    provided that {reduce(x), reduce(y)} != {0, infinity}.  The reduction of a
70    binary or decimal float is never infinity, since the denominator is a power
71    of 2 (for binary) or a divisor of a power of 10 (for decimal).  So we have,
72    for nonnegative x,
73 
74       reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS
75 
76       reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS
77 
78    and reduce(10**e) can be computed efficiently by the usual modular
79    exponentiation algorithm.  For reduce(2**e) it's even better: since
80    P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
81    by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.
82 
83    */
84 
85 Py_hash_t
_Py_HashDouble(double v)86 _Py_HashDouble(double v)
87 {
88     int e, sign;
89     double m;
90     Py_uhash_t x, y;
91 
92     if (!Py_IS_FINITE(v)) {
93         if (Py_IS_INFINITY(v))
94             return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
95         else
96             return _PyHASH_NAN;
97     }
98 
99     m = frexp(v, &e);
100 
101     sign = 1;
102     if (m < 0) {
103         sign = -1;
104         m = -m;
105     }
106 
107     /* process 28 bits at a time;  this should work well both for binary
108        and hexadecimal floating point. */
109     x = 0;
110     while (m) {
111         x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28);
112         m *= 268435456.0;  /* 2**28 */
113         e -= 28;
114         y = (Py_uhash_t)m;  /* pull out integer part */
115         m -= y;
116         x += y;
117         if (x >= _PyHASH_MODULUS)
118             x -= _PyHASH_MODULUS;
119     }
120 
121     /* adjust for the exponent;  first reduce it modulo _PyHASH_BITS */
122     e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS);
123     x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e);
124 
125     x = x * sign;
126     if (x == (Py_uhash_t)-1)
127         x = (Py_uhash_t)-2;
128     return (Py_hash_t)x;
129 }
130 
131 Py_hash_t
_Py_HashPointer(void * p)132 _Py_HashPointer(void *p)
133 {
134     Py_hash_t x;
135     size_t y = (size_t)p;
136     /* bottom 3 or 4 bits are likely to be 0; rotate y by 4 to avoid
137        excessive hash collisions for dicts and sets */
138     y = (y >> 4) | (y << (8 * SIZEOF_VOID_P - 4));
139     x = (Py_hash_t)y;
140     if (x == -1)
141         x = -2;
142     return x;
143 }
144 
145 Py_hash_t
_Py_HashBytes(const void * src,Py_ssize_t len)146 _Py_HashBytes(const void *src, Py_ssize_t len)
147 {
148     Py_hash_t x;
149     /*
150       We make the hash of the empty string be 0, rather than using
151       (prefix ^ suffix), since this slightly obfuscates the hash secret
152     */
153     if (len == 0) {
154         return 0;
155     }
156 
157 #ifdef Py_HASH_STATS
158     hashstats[(len <= Py_HASH_STATS_MAX) ? len : 0]++;
159 #endif
160 
161 #if Py_HASH_CUTOFF > 0
162     if (len < Py_HASH_CUTOFF) {
163         /* Optimize hashing of very small strings with inline DJBX33A. */
164         Py_uhash_t hash;
165         const unsigned char *p = src;
166         hash = 5381; /* DJBX33A starts with 5381 */
167 
168         switch(len) {
169             /* ((hash << 5) + hash) + *p == hash * 33 + *p */
170             case 7: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
171             case 6: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
172             case 5: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
173             case 4: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
174             case 3: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
175             case 2: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
176             case 1: hash = ((hash << 5) + hash) + *p++; break;
177             default:
178                 assert(0);
179         }
180         hash ^= len;
181         hash ^= (Py_uhash_t) _Py_HashSecret.djbx33a.suffix;
182         x = (Py_hash_t)hash;
183     }
184     else
185 #endif /* Py_HASH_CUTOFF */
186         x = PyHash_Func.hash(src, len);
187 
188     if (x == -1)
189         return -2;
190     return x;
191 }
192 
193 void
_PyHash_Fini(void)194 _PyHash_Fini(void)
195 {
196 #ifdef Py_HASH_STATS
197     int i;
198     Py_ssize_t total = 0;
199     char *fmt = "%2i %8" PY_FORMAT_SIZE_T "d %8" PY_FORMAT_SIZE_T "d\n";
200 
201     fprintf(stderr, "len   calls    total\n");
202     for (i = 1; i <= Py_HASH_STATS_MAX; i++) {
203         total += hashstats[i];
204         fprintf(stderr, fmt, i, hashstats[i], total);
205     }
206     total += hashstats[0];
207     fprintf(stderr, ">  %8" PY_FORMAT_SIZE_T "d %8" PY_FORMAT_SIZE_T "d\n",
208             hashstats[0], total);
209 #endif
210 }
211 
212 PyHash_FuncDef *
PyHash_GetFuncDef(void)213 PyHash_GetFuncDef(void)
214 {
215     return &PyHash_Func;
216 }
217 
218 /* Optimized memcpy() for Windows */
219 #ifdef _MSC_VER
220 #  if SIZEOF_PY_UHASH_T == 4
221 #    define PY_UHASH_CPY(dst, src) do {                                    \
222        dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
223        } while(0)
224 #  elif SIZEOF_PY_UHASH_T == 8
225 #    define PY_UHASH_CPY(dst, src) do {                                    \
226        dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
227        dst[4] = src[4]; dst[5] = src[5]; dst[6] = src[6]; dst[7] = src[7]; \
228        } while(0)
229 #  else
230 #    error SIZEOF_PY_UHASH_T must be 4 or 8
231 #  endif /* SIZEOF_PY_UHASH_T */
232 #else /* not Windows */
233 #  define PY_UHASH_CPY(dst, src) memcpy(dst, src, SIZEOF_PY_UHASH_T)
234 #endif /* _MSC_VER */
235 
236 
237 #if Py_HASH_ALGORITHM == Py_HASH_FNV
238 /* **************************************************************************
239  * Modified Fowler-Noll-Vo (FNV) hash function
240  */
241 static Py_hash_t
fnv(const void * src,Py_ssize_t len)242 fnv(const void *src, Py_ssize_t len)
243 {
244     const unsigned char *p = src;
245     Py_uhash_t x;
246     Py_ssize_t remainder, blocks;
247     union {
248         Py_uhash_t value;
249         unsigned char bytes[SIZEOF_PY_UHASH_T];
250     } block;
251 
252 #ifdef Py_DEBUG
253     assert(_Py_HashSecret_Initialized);
254 #endif
255     remainder = len % SIZEOF_PY_UHASH_T;
256     if (remainder == 0) {
257         /* Process at least one block byte by byte to reduce hash collisions
258          * for strings with common prefixes. */
259         remainder = SIZEOF_PY_UHASH_T;
260     }
261     blocks = (len - remainder) / SIZEOF_PY_UHASH_T;
262 
263     x = (Py_uhash_t) _Py_HashSecret.fnv.prefix;
264     x ^= (Py_uhash_t) *p << 7;
265     while (blocks--) {
266         PY_UHASH_CPY(block.bytes, p);
267         x = (_PyHASH_MULTIPLIER * x) ^ block.value;
268         p += SIZEOF_PY_UHASH_T;
269     }
270     /* add remainder */
271     for (; remainder > 0; remainder--)
272         x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++;
273     x ^= (Py_uhash_t) len;
274     x ^= (Py_uhash_t) _Py_HashSecret.fnv.suffix;
275     if (x == -1) {
276         x = -2;
277     }
278     return x;
279 }
280 
281 static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T,
282                                      16 * SIZEOF_PY_HASH_T};
283 
284 #endif /* Py_HASH_ALGORITHM == Py_HASH_FNV */
285 
286 
287 #if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
288 /* **************************************************************************
289  <MIT License>
290  Copyright (c) 2013  Marek Majkowski <marek@popcount.org>
291 
292  Permission is hereby granted, free of charge, to any person obtaining a copy
293  of this software and associated documentation files (the "Software"), to deal
294  in the Software without restriction, including without limitation the rights
295  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
296  copies of the Software, and to permit persons to whom the Software is
297  furnished to do so, subject to the following conditions:
298 
299  The above copyright notice and this permission notice shall be included in
300  all copies or substantial portions of the Software.
301 
302  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
303  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
304  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
305  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
306  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
307  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
308  THE SOFTWARE.
309  </MIT License>
310 
311  Original location:
312     https://github.com/majek/csiphash/
313 
314  Solution inspired by code from:
315     Samuel Neves (supercop/crypto_auth/siphash24/little)
316     djb (supercop/crypto_auth/siphash24/little2)
317     Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c)
318 
319  Modified for Python by Christian Heimes:
320     - C89 / MSVC compatibility
321     - _rotl64() on Windows
322     - letoh64() fallback
323 */
324 
325 /* byte swap little endian to host endian
326  * Endian conversion not only ensures that the hash function returns the same
327  * value on all platforms. It is also required to for a good dispersion of
328  * the hash values' least significant bits.
329  */
330 #if PY_LITTLE_ENDIAN
331 #  define _le64toh(x) ((uint64_t)(x))
332 #elif defined(__APPLE__)
333 #  define _le64toh(x) OSSwapLittleToHostInt64(x)
334 #elif defined(HAVE_LETOH64)
335 #  define _le64toh(x) le64toh(x)
336 #else
337 #  define _le64toh(x) (((uint64_t)(x) << 56) | \
338                       (((uint64_t)(x) << 40) & 0xff000000000000ULL) | \
339                       (((uint64_t)(x) << 24) & 0xff0000000000ULL) | \
340                       (((uint64_t)(x) << 8)  & 0xff00000000ULL) | \
341                       (((uint64_t)(x) >> 8)  & 0xff000000ULL) | \
342                       (((uint64_t)(x) >> 24) & 0xff0000ULL) | \
343                       (((uint64_t)(x) >> 40) & 0xff00ULL) | \
344                       ((uint64_t)(x)  >> 56))
345 #endif
346 
347 
348 #ifdef _MSC_VER
349 #  define ROTATE(x, b)  _rotl64(x, b)
350 #else
351 #  define ROTATE(x, b) (uint64_t)( ((x) << (b)) | ( (x) >> (64 - (b))) )
352 #endif
353 
354 #define HALF_ROUND(a,b,c,d,s,t)         \
355     a += b; c += d;             \
356     b = ROTATE(b, s) ^ a;           \
357     d = ROTATE(d, t) ^ c;           \
358     a = ROTATE(a, 32);
359 
360 #define DOUBLE_ROUND(v0,v1,v2,v3)       \
361     HALF_ROUND(v0,v1,v2,v3,13,16);      \
362     HALF_ROUND(v2,v1,v0,v3,17,21);      \
363     HALF_ROUND(v0,v1,v2,v3,13,16);      \
364     HALF_ROUND(v2,v1,v0,v3,17,21);
365 
366 
367 static Py_hash_t
siphash24(const void * src,Py_ssize_t src_sz)368 siphash24(const void *src, Py_ssize_t src_sz) {
369     uint64_t k0 = _le64toh(_Py_HashSecret.siphash.k0);
370     uint64_t k1 = _le64toh(_Py_HashSecret.siphash.k1);
371     uint64_t b = (uint64_t)src_sz << 56;
372     const uint64_t *in = (uint64_t*)src;
373 
374     uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
375     uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
376     uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
377     uint64_t v3 = k1 ^ 0x7465646279746573ULL;
378 
379     uint64_t t;
380     uint8_t *pt;
381     uint8_t *m;
382 
383     while (src_sz >= 8) {
384         uint64_t mi = _le64toh(*in);
385         in += 1;
386         src_sz -= 8;
387         v3 ^= mi;
388         DOUBLE_ROUND(v0,v1,v2,v3);
389         v0 ^= mi;
390     }
391 
392     t = 0;
393     pt = (uint8_t *)&t;
394     m = (uint8_t *)in;
395     switch (src_sz) {
396         case 7: pt[6] = m[6];
397         case 6: pt[5] = m[5];
398         case 5: pt[4] = m[4];
399         case 4: memcpy(pt, m, sizeof(uint32_t)); break;
400         case 3: pt[2] = m[2];
401         case 2: pt[1] = m[1];
402         case 1: pt[0] = m[0];
403     }
404     b |= _le64toh(t);
405 
406     v3 ^= b;
407     DOUBLE_ROUND(v0,v1,v2,v3);
408     v0 ^= b;
409     v2 ^= 0xff;
410     DOUBLE_ROUND(v0,v1,v2,v3);
411     DOUBLE_ROUND(v0,v1,v2,v3);
412 
413     /* modified */
414     t = (v0 ^ v1) ^ (v2 ^ v3);
415     return (Py_hash_t)t;
416 }
417 
418 static PyHash_FuncDef PyHash_Func = {siphash24, "siphash24", 64, 128};
419 
420 #endif /* Py_HASH_ALGORITHM == Py_HASH_SIPHASH24 */
421 
422 #ifdef __cplusplus
423 }
424 #endif
425