• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Derived from: https://github.com/ilanschnell/perfect-hash
2# Commit: 6b7dd80a525dbd4349ea2c69f04a9c96f3c2fd54
3
4# BSD 3-Clause License
5#
6# Copyright (c) 2019 - 2021, Ilan Schnell
7# All rights reserved.
8#
9# Redistribution and use in source and binary forms, with or without
10# modification, are permitted provided that the following conditions are met:
11#     * Redistributions of source code must retain the above copyright
12#       notice, this list of conditions and the following disclaimer.
13#     * Redistributions in binary form must reproduce the above copyright
14#       notice, this list of conditions and the following disclaimer in the
15#       documentation and/or other materials provided with the distribution.
16#     * Neither the name of the Ilan Schnell nor the
17#       names of its contributors may be used to endorse or promote products
18#       derived from this software without specific prior written permission.
19#
20# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
21# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23# DISCLAIMED. IN NO EVENT SHALL ILAN SCHNELL BE LIABLE FOR ANY
24# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
25# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
26# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
27# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
29# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31"""
32Generate a minimal perfect hash function for the keys in a file,
33desired hash values may be specified within this file as well.
34A given code template is filled with parameters, such that the
35output is code which implements the hash function.
36Templates can easily be constructed for any programming language.
37
38The code is based on an a program A.M. Kuchling wrote:
39http://www.amk.ca/python/code/perfect-hash
40
41The algorithm the program uses is described in the paper
42'Optimal algorithms for minimal perfect hashing',
43Z. J. Czech, G. Havas and B.S. Majewski.
44http://citeseer.ist.psu.edu/122364.html
45
46The algorithm works like this:
47
481.  You have K keys, that you want to perfectly hash against some
49    desired hash values.
50
512.  Choose a number N larger than K.  This is the number of
52    vertices in a graph G, and also the size of the resulting table G.
53
543.  Pick two random hash functions f1, f2, that return values from 0..N-1.
55
564.  Now, for all keys, you draw an edge between vertices f1(key) and f2(key)
57    of the graph G, and associate the desired hash value with that edge.
58
595.  If G is cyclic, go back to step 2.
60
616.  Assign values to each vertex such that, for each edge, you can add
62    the values for the two vertices and get the desired (hash) value
63    for that edge.  This task is easy, because the graph is acyclic.
64    This is done by picking a vertex, and assigning it a value of 0.
65    Then do a depth-first search, assigning values to new vertices so that
66    they sum up properly.
67
687.  f1, f2, and vertex values of G now make up a perfect hash function.
69
70
71For simplicity, the implementation of the algorithm combines steps 5 and 6.
72That is, we check for loops in G and assign the vertex values in one procedure.
73If this procedure succeeds, G is acyclic and the vertex values are assigned.
74If the procedure fails, G is cyclic, and we go back to step 2, replacing G
75with a new graph, and thereby discarding the vertex values from the failed
76attempt.
77"""
78from __future__ import absolute_import, division, print_function
79
80import sys
81import random
82import string
83import subprocess
84import shutil
85import tempfile
86from collections import defaultdict
87from os.path import join
88
89if sys.version_info[0] == 2:
90    from cStringIO import StringIO
91else:
92    from io import StringIO
93
94
95__version__ = '0.4.2'
96
97
98verbose = False
99trials = 150
100
101
102class Graph(object):
103    """
104    Implements a graph with 'N' vertices.  First, you connect the graph with
105    edges, which have a desired value associated.  Then the vertex values
106    are assigned, which will fail if the graph is cyclic.  The vertex values
107    are assigned such that the two values corresponding to an edge add up to
108    the desired edge value (mod N).
109    """
110    def __init__(self, N):
111        self.N = N                     # number of vertices
112
113        # maps a vertex number to the list of tuples (vertex, edge value)
114        # to which it is connected by edges.
115        self.adjacent = defaultdict(list)
116
117    def connect(self, vertex1, vertex2, edge_value):
118        """
119        Connect 'vertex1' and 'vertex2' with an edge, with associated
120        value 'value'
121        """
122        # Add vertices to each other's adjacent list
123        self.adjacent[vertex1].append((vertex2, edge_value))
124        self.adjacent[vertex2].append((vertex1, edge_value))
125
126    def assign_vertex_values(self):
127        """
128        Try to assign the vertex values, such that, for each edge, you can
129        add the values for the two vertices involved and get the desired
130        value for that edge, i.e. the desired hash key.
131        This will fail when the graph is cyclic.
132
133        This is done by a Depth-First Search of the graph.  If the search
134        finds a vertex that was visited before, there's a loop and False is
135        returned immediately, i.e. the assignment is terminated.
136        On success (when the graph is acyclic) True is returned.
137        """
138        self.vertex_values = self.N * [-1]  # -1 means unassigned
139
140        visited = self.N * [False]
141
142        # Loop over all vertices, taking unvisited ones as roots.
143        for root in range(self.N):
144            if visited[root]:
145                continue
146
147            # explore tree starting at 'root'
148            self.vertex_values[root] = 0    # set arbitrarily to zero
149
150            # Stack of vertices to visit, a list of tuples (parent, vertex)
151            tovisit = [(None, root)]
152            while tovisit:
153                parent, vertex = tovisit.pop()
154                visited[vertex] = True
155
156                # Loop over adjacent vertices, but skip the vertex we arrived
157                # here from the first time it is encountered.
158                skip = True
159                for neighbor, edge_value in self.adjacent[vertex]:
160                    if skip and neighbor == parent:
161                        skip = False
162                        continue
163
164                    if visited[neighbor]:
165                        # We visited here before, so the graph is cyclic.
166                        return False
167
168                    tovisit.append((vertex, neighbor))
169
170                    # Set new vertex's value to the desired edge value,
171                    # minus the value of the vertex we came here from.
172                    self.vertex_values[neighbor] = (
173                        edge_value - self.vertex_values[vertex]) % self.N
174
175        # check if all vertices have a valid value
176        for vertex in range(self.N):
177            assert self.vertex_values[vertex] >= 0
178
179        # We got though, so the graph is acyclic,
180        # and all values are now assigned.
181        return True
182
183
184class StrSaltHash(object):
185    """
186    Random hash function generator.
187    Simple byte level hashing: each byte is multiplied to another byte from
188    a random string of characters, summed up, and finally modulo NG is
189    taken.
190    """
191    chars = string.ascii_letters + string.digits
192
193    def __init__(self, N):
194        self.N = N
195        self.salt = ''
196
197    def __call__(self, key):
198        # XXX: xkbcommon modification: make the salt length a power of 2
199        #      so that the % operation in the hash is fast.
200        while len(self.salt) < max(len(key), 32): # add more salt as necessary
201            self.salt += random.choice(self.chars)
202
203        return sum(ord(self.salt[i]) * ord(c)
204                   for i, c in enumerate(key)) % self.N
205
206    template = """
207def hash_f(key, T):
208    return sum(ord(T[i % $NS]) * ord(c) for i, c in enumerate(key)) % $NG
209
210def perfect_hash(key):
211    return (G[hash_f(key, "$S1")] +
212            G[hash_f(key, "$S2")]) % $NG
213"""
214
215class IntSaltHash(object):
216    """
217    Random hash function generator.
218    Simple byte level hashing, each byte is multiplied in sequence to a table
219    containing random numbers, summed tp, and finally modulo NG is taken.
220    """
221    def __init__(self, N):
222        self.N = N
223        self.salt = []
224
225    def __call__(self, key):
226        while len(self.salt) < len(key): # add more salt as necessary
227            self.salt.append(random.randint(1, self.N - 1))
228
229        return sum(self.salt[i] * ord(c)
230                   for i, c in enumerate(key)) % self.N
231
232    template = """
233S1 = [$S1]
234S2 = [$S2]
235assert len(S1) == len(S2) == $NS
236
237def hash_f(key, T):
238    return sum(T[i % $NS] * ord(c) for i, c in enumerate(key)) % $NG
239
240def perfect_hash(key):
241    return (G[hash_f(key, S1)] + G[hash_f(key, S2)]) % $NG
242"""
243
244def builtin_template(Hash):
245    return """\
246# =======================================================================
247# ================= Python code for perfect hash function ===============
248# =======================================================================
249
250G = [$G]
251""" + Hash.template + """
252# ============================ Sanity check =============================
253
254K = [$K]
255assert len(K) == $NK
256
257for h, k in enumerate(K):
258    assert perfect_hash(k) == h
259"""
260
261
262class TooManyInterationsError(Exception):
263    pass
264
265
266def generate_hash(keys, Hash=StrSaltHash):
267    """
268    Return hash functions f1 and f2, and G for a perfect minimal hash.
269    Input is an iterable of 'keys', whos indicies are the desired hash values.
270    'Hash' is a random hash function generator, that means Hash(N) returns a
271    returns a random hash function which returns hash values from 0..N-1.
272    """
273    if not isinstance(keys, (list, tuple)):
274        raise TypeError("list or tuple expected")
275    NK = len(keys)
276    if NK != len(set(keys)):
277        raise ValueError("duplicate keys")
278    for key in keys:
279        if not isinstance(key, str):
280            raise TypeError("key a not string: %r" % key)
281    if NK > 10000 and Hash == StrSaltHash:
282        print("""\
283WARNING: You have %d keys.
284         Using --hft=1 is likely to fail for so many keys.
285         Please use --hft=2 instead.
286""" % NK)
287
288    # the number of vertices in the graph G
289    NG = NK + 1
290    if verbose:
291        print('NG = %d' % NG)
292
293    trial = 0  # Number of trial graphs so far
294    while True:
295        if (trial % trials) == 0:   # trials failures, increase NG slightly
296            if trial > 0:
297                NG = max(NG + 1, int(1.05 * NG))
298            if verbose:
299                sys.stdout.write('\nGenerating graphs NG = %d ' % NG)
300        trial += 1
301
302        if NG > 100 * (NK + 1):
303            raise TooManyInterationsError("%d keys" % NK)
304
305        if verbose:
306            sys.stdout.write('.')
307            sys.stdout.flush()
308
309        G = Graph(NG)   # Create graph with NG vertices
310        f1 = Hash(NG)   # Create 2 random hash functions
311        f2 = Hash(NG)
312
313        # Connect vertices given by the values of the two hash functions
314        # for each key.  Associate the desired hash value with each edge.
315        for hashval, key in enumerate(keys):
316            G.connect(f1(key), f2(key), hashval)
317
318        # Try to assign the vertex values.  This will fail when the graph
319        # is cyclic.  But when the graph is acyclic it will succeed and we
320        # break out, because we're done.
321        if G.assign_vertex_values():
322            break
323
324    if verbose:
325        print('\nAcyclic graph found after %d trials.' % trial)
326        print('NG = %d' % NG)
327
328    # Sanity check the result by actually verifying that all the keys
329    # hash to the right value.
330    for hashval, key in enumerate(keys):
331        assert hashval == (
332            G.vertex_values[f1(key)] + G.vertex_values[f2(key)]
333        ) % NG
334
335    if verbose:
336        print('OK')
337
338    return f1, f2, G.vertex_values
339
340
341class Format(object):
342
343    def __init__(self, width=76, indent=4, delimiter=', '):
344        self.width = width
345        self.indent = indent
346        self.delimiter = delimiter
347
348    def print_format(self):
349        print("Format options:")
350        for name in 'width', 'indent', 'delimiter':
351            print('  %s: %r' % (name, getattr(self, name)))
352
353    def __call__(self, data, quote=False):
354        if not isinstance(data, (list, tuple)):
355            return str(data)
356
357        lendel = len(self.delimiter)
358        aux = StringIO()
359        pos = 20
360        for i, elt in enumerate(data):
361            last = bool(i == len(data) - 1)
362
363            s = ('"%s"' if quote else '%s') % elt
364
365            if pos + len(s) + lendel > self.width:
366                aux.write('\n' + (self.indent * ' '))
367                pos = self.indent
368
369            aux.write(s)
370            pos += len(s)
371            if not last:
372                aux.write(self.delimiter)
373                pos += lendel
374
375        return '\n'.join(l.rstrip() for l in aux.getvalue().split('\n'))
376
377
378def generate_code(keys, Hash=StrSaltHash, template=None, options=None):
379    """
380    Takes a list of key value pairs and inserts the generated parameter
381    lists into the 'template' string.  'Hash' is the random hash function
382    generator, and the optional keywords are formating options.
383    The return value is the substituted code template.
384    """
385    f1, f2, G = generate_hash(keys, Hash)
386
387    assert f1.N == f2.N == len(G)
388    try:
389        salt_len = len(f1.salt)
390        assert salt_len == len(f2.salt)
391    except TypeError:
392        salt_len = None
393
394    if template is None:
395        template = builtin_template(Hash)
396
397    if options is None:
398        fmt = Format()
399    else:
400        fmt = Format(width=options.width, indent=options.indent,
401                     delimiter=options.delimiter)
402
403    if verbose:
404        fmt.print_format()
405
406    return string.Template(template).substitute(
407        NS = salt_len,
408        S1 = fmt(f1.salt),
409        S2 = fmt(f2.salt),
410        NG = len(G),
411        G  = fmt(G),
412        NK = len(keys),
413        K  = fmt(list(keys), quote=True))
414
415
416def read_table(filename, options):
417    """
418    Reads keys and desired hash value pairs from a file.  If no column
419    for the hash value is specified, a sequence of hash values is generated,
420    from 0 to N-1, where N is the number of rows found in the file.
421    """
422    if verbose:
423        print("Reading table from file `%s' to extract keys." % filename)
424    try:
425        fi = open(filename)
426    except IOError:
427        sys.exit("Error: Could not open `%s' for reading." % filename)
428
429    keys = []
430
431    if verbose:
432        print("Reader options:")
433        for name in 'comment', 'splitby', 'keycol':
434            print('  %s: %r' % (name, getattr(options, name)))
435
436    for n, line in enumerate(fi):
437        line = line.strip()
438        if not line or line.startswith(options.comment):
439            continue
440
441        if line.count(options.comment): # strip content after comment
442            line = line.split(options.comment)[0].strip()
443
444        row = [col.strip() for col in line.split(options.splitby)]
445
446        try:
447            key = row[options.keycol - 1]
448        except IndexError:
449            sys.exit("%s:%d: Error: Cannot read key, not enough columns." %
450                     (filename, n + 1))
451
452        keys.append(key)
453
454    fi.close()
455
456    if not keys:
457        exit("Error: no keys found in file `%s'." % filename)
458
459    return keys
460
461
462def read_template(filename):
463    if verbose:
464        print("Reading template from file `%s'" % filename)
465    try:
466        with open(filename, 'r') as fi:
467            return fi.read()
468    except IOError:
469        sys.exit("Error: Could not open `%s' for reading." % filename)
470
471
472def run_code(code):
473    tmpdir = tempfile.mkdtemp()
474    path = join(tmpdir, 't.py')
475    with open(path, 'w') as fo:
476        fo.write(code)
477    try:
478        subprocess.check_call([sys.executable, path])
479    except subprocess.CalledProcessError as e:
480        raise AssertionError(e)
481    finally:
482        shutil.rmtree(tmpdir)
483
484
485def main():
486    from optparse import OptionParser
487
488    usage = "usage: %prog [options] KEYS_FILE [TMPL_FILE]"
489
490    description = """\
491Generates code for perfect hash functions from
492a file with keywords and a code template.
493If no template file is provided, a small built-in Python template
494is processed and the output code is written to stdout.
495"""
496
497    parser = OptionParser(usage = usage,
498                          description = description,
499                          prog = sys.argv[0],
500                          version = "%prog: " + __version__)
501
502    parser.add_option("--delimiter",
503                      action  = "store",
504                      default = ", ",
505                      help    = "Delimiter for list items used in output, "
506                                "the default delimiter is '%default'",
507                      metavar = "STR")
508
509    parser.add_option("--indent",
510                      action  = "store",
511                      default = 4,
512                      type    = "int",
513                      help    = "Make INT spaces at the beginning of a "
514                                "new line when generated list is wrapped. "
515                                "Default is %default",
516                      metavar = "INT")
517
518    parser.add_option("--width",
519                      action  = "store",
520                      default = 76,
521                      type    = "int",
522                      help    = "Maximal width of generated list when "
523                                "wrapped.  Default width is %default",
524                      metavar = "INT")
525
526    parser.add_option("--comment",
527                      action  = "store",
528                      default = "#",
529                      help    = "STR is the character, or sequence of "
530                                "characters, which marks the beginning "
531                                "of a comment (which runs till "
532                                "the end of the line), in the input "
533                                "KEYS_FILE. "
534                                "Default is '%default'",
535                      metavar = "STR")
536
537    parser.add_option("--splitby",
538                      action  = "store",
539                      default = ",",
540                      help    = "STR is the character by which the columns "
541                                "in the input KEYS_FILE are split. "
542                                "Default is '%default'",
543                      metavar = "STR")
544
545    parser.add_option("--keycol",
546                      action  = "store",
547                      default = 1,
548                      type    = "int",
549                      help    = "Specifies the column INT in the input "
550                                "KEYS_FILE which contains the keys. "
551                                "Default is %default, i.e. the first column.",
552                      metavar = "INT")
553
554    parser.add_option("--trials",
555                      action  = "store",
556                      default = 5,
557                      type    = "int",
558                      help    = "Specifies the number of trials before "
559                                "NG is increased.  A small INT will give "
560                                "compute faster, but the array G will be "
561                                "large.  A large INT will take longer to "
562                                "compute but G will be smaller. "
563                                "Default is %default",
564                      metavar = "INT")
565
566    parser.add_option("--hft",
567                      action  = "store",
568                      default = 1,
569                      type    = "int",
570                      help    = "Hash function type INT.  Possible values "
571                                "are 1 (StrSaltHash) and 2 (IntSaltHash). "
572                                "The default is %default",
573                      metavar = "INT")
574
575    parser.add_option("-e", "--execute",
576                      action  = "store_true",
577                      help    = "Execute the generated code within "
578                                "the Python interpreter.")
579
580    parser.add_option("-o", "--output",
581                      action  = "store",
582                      help    = "Specify output FILE explicitly. "
583                                "`-o std' means standard output. "
584                                "`-o no' means no output. "
585                                "By default, the file name is obtained "
586                                "from the name of the template file by "
587                                "substituting `tmpl' to `code'.",
588                      metavar = "FILE")
589
590    parser.add_option("-v", "--verbose",
591                      action = "store_true",
592                      help = "verbosity")
593
594    options, args = parser.parse_args()
595
596    if options.trials <= 0:
597        parser.error("trials before increasing N has to be larger than zero")
598
599    global trials
600    trials = options.trials
601
602    global verbose
603    verbose = options.verbose
604
605    if len(args) not in (1, 2):
606        parser.error("incorrect number of arguments")
607
608    if len(args) == 2 and not args[1].count('tmpl'):
609        parser.error("template filename does not contain 'tmpl'")
610
611    if options.hft == 1:
612        Hash = StrSaltHash
613    elif options.hft == 2:
614        Hash = IntSaltHash
615    else:
616        parser.error("Hash function %s not implemented." % options.hft)
617
618    # --------------------- end parsing and checking --------------
619
620    keys_file = args[0]
621
622    if verbose:
623        print("keys_file = %r" % keys_file)
624
625    keys = read_table(keys_file, options)
626
627    if verbose:
628        print("Number os keys: %d" % len(keys))
629
630    tmpl_file = args[1] if len(args) == 2 else None
631
632    if verbose:
633        print("tmpl_file = %r" % tmpl_file)
634
635    template = read_template(tmpl_file) if tmpl_file else None
636
637    if options.output:
638        outname = options.output
639    else:
640        if tmpl_file:
641            if 'tmpl' not in tmpl_file:
642                sys.exit("Hmm, template filename does not contain 'tmpl'")
643            outname = tmpl_file.replace('tmpl', 'code')
644        else:
645            outname = 'std'
646
647    if verbose:
648        print("outname = %r\n" % outname)
649
650    if outname == 'std':
651        outstream = sys.stdout
652    elif outname == 'no':
653        outstream = None
654    else:
655        try:
656            outstream = open(outname, 'w')
657        except IOError:
658            sys.exit("Error: Could not open `%s' for writing." % outname)
659
660    code = generate_code(keys, Hash, template, options)
661
662    if options.execute or template == builtin_template(Hash):
663        if verbose:
664            print('Executing code...\n')
665        run_code(code)
666
667    if outstream:
668        outstream.write(code)
669        if not outname == 'std':
670            outstream.close()
671
672
673if __name__ == '__main__':
674    main()
675