1# Derived from: https://github.com/ilanschnell/perfect-hash 2# Commit: 6b7dd80a525dbd4349ea2c69f04a9c96f3c2fd54 3 4# BSD 3-Clause License 5# 6# Copyright (c) 2019 - 2021, Ilan Schnell 7# All rights reserved. 8# 9# Redistribution and use in source and binary forms, with or without 10# modification, are permitted provided that the following conditions are met: 11# * Redistributions of source code must retain the above copyright 12# notice, this list of conditions and the following disclaimer. 13# * Redistributions in binary form must reproduce the above copyright 14# notice, this list of conditions and the following disclaimer in the 15# documentation and/or other materials provided with the distribution. 16# * Neither the name of the Ilan Schnell nor the 17# names of its contributors may be used to endorse or promote products 18# derived from this software without specific prior written permission. 19# 20# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 21# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 22# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23# DISCLAIMED. IN NO EVENT SHALL ILAN SCHNELL BE LIABLE FOR ANY 24# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 25# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 26# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 27# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 29# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31""" 32Generate a minimal perfect hash function for the keys in a file, 33desired hash values may be specified within this file as well. 34A given code template is filled with parameters, such that the 35output is code which implements the hash function. 36Templates can easily be constructed for any programming language. 37 38The code is based on an a program A.M. Kuchling wrote: 39http://www.amk.ca/python/code/perfect-hash 40 41The algorithm the program uses is described in the paper 42'Optimal algorithms for minimal perfect hashing', 43Z. J. Czech, G. Havas and B.S. Majewski. 44http://citeseer.ist.psu.edu/122364.html 45 46The algorithm works like this: 47 481. You have K keys, that you want to perfectly hash against some 49 desired hash values. 50 512. Choose a number N larger than K. This is the number of 52 vertices in a graph G, and also the size of the resulting table G. 53 543. Pick two random hash functions f1, f2, that return values from 0..N-1. 55 564. Now, for all keys, you draw an edge between vertices f1(key) and f2(key) 57 of the graph G, and associate the desired hash value with that edge. 58 595. If G is cyclic, go back to step 2. 60 616. Assign values to each vertex such that, for each edge, you can add 62 the values for the two vertices and get the desired (hash) value 63 for that edge. This task is easy, because the graph is acyclic. 64 This is done by picking a vertex, and assigning it a value of 0. 65 Then do a depth-first search, assigning values to new vertices so that 66 they sum up properly. 67 687. f1, f2, and vertex values of G now make up a perfect hash function. 69 70 71For simplicity, the implementation of the algorithm combines steps 5 and 6. 72That is, we check for loops in G and assign the vertex values in one procedure. 73If this procedure succeeds, G is acyclic and the vertex values are assigned. 74If the procedure fails, G is cyclic, and we go back to step 2, replacing G 75with a new graph, and thereby discarding the vertex values from the failed 76attempt. 77""" 78from __future__ import absolute_import, division, print_function 79 80import sys 81import random 82import string 83import subprocess 84import shutil 85import tempfile 86from collections import defaultdict 87from os.path import join 88 89if sys.version_info[0] == 2: 90 from cStringIO import StringIO 91else: 92 from io import StringIO 93 94 95__version__ = '0.4.2' 96 97 98verbose = False 99trials = 150 100 101 102class Graph(object): 103 """ 104 Implements a graph with 'N' vertices. First, you connect the graph with 105 edges, which have a desired value associated. Then the vertex values 106 are assigned, which will fail if the graph is cyclic. The vertex values 107 are assigned such that the two values corresponding to an edge add up to 108 the desired edge value (mod N). 109 """ 110 def __init__(self, N): 111 self.N = N # number of vertices 112 113 # maps a vertex number to the list of tuples (vertex, edge value) 114 # to which it is connected by edges. 115 self.adjacent = defaultdict(list) 116 117 def connect(self, vertex1, vertex2, edge_value): 118 """ 119 Connect 'vertex1' and 'vertex2' with an edge, with associated 120 value 'value' 121 """ 122 # Add vertices to each other's adjacent list 123 self.adjacent[vertex1].append((vertex2, edge_value)) 124 self.adjacent[vertex2].append((vertex1, edge_value)) 125 126 def assign_vertex_values(self): 127 """ 128 Try to assign the vertex values, such that, for each edge, you can 129 add the values for the two vertices involved and get the desired 130 value for that edge, i.e. the desired hash key. 131 This will fail when the graph is cyclic. 132 133 This is done by a Depth-First Search of the graph. If the search 134 finds a vertex that was visited before, there's a loop and False is 135 returned immediately, i.e. the assignment is terminated. 136 On success (when the graph is acyclic) True is returned. 137 """ 138 self.vertex_values = self.N * [-1] # -1 means unassigned 139 140 visited = self.N * [False] 141 142 # Loop over all vertices, taking unvisited ones as roots. 143 for root in range(self.N): 144 if visited[root]: 145 continue 146 147 # explore tree starting at 'root' 148 self.vertex_values[root] = 0 # set arbitrarily to zero 149 150 # Stack of vertices to visit, a list of tuples (parent, vertex) 151 tovisit = [(None, root)] 152 while tovisit: 153 parent, vertex = tovisit.pop() 154 visited[vertex] = True 155 156 # Loop over adjacent vertices, but skip the vertex we arrived 157 # here from the first time it is encountered. 158 skip = True 159 for neighbor, edge_value in self.adjacent[vertex]: 160 if skip and neighbor == parent: 161 skip = False 162 continue 163 164 if visited[neighbor]: 165 # We visited here before, so the graph is cyclic. 166 return False 167 168 tovisit.append((vertex, neighbor)) 169 170 # Set new vertex's value to the desired edge value, 171 # minus the value of the vertex we came here from. 172 self.vertex_values[neighbor] = ( 173 edge_value - self.vertex_values[vertex]) % self.N 174 175 # check if all vertices have a valid value 176 for vertex in range(self.N): 177 assert self.vertex_values[vertex] >= 0 178 179 # We got though, so the graph is acyclic, 180 # and all values are now assigned. 181 return True 182 183 184class StrSaltHash(object): 185 """ 186 Random hash function generator. 187 Simple byte level hashing: each byte is multiplied to another byte from 188 a random string of characters, summed up, and finally modulo NG is 189 taken. 190 """ 191 chars = string.ascii_letters + string.digits 192 193 def __init__(self, N): 194 self.N = N 195 self.salt = '' 196 197 def __call__(self, key): 198 # XXX: xkbcommon modification: make the salt length a power of 2 199 # so that the % operation in the hash is fast. 200 while len(self.salt) < max(len(key), 32): # add more salt as necessary 201 self.salt += random.choice(self.chars) 202 203 return sum(ord(self.salt[i]) * ord(c) 204 for i, c in enumerate(key)) % self.N 205 206 template = """ 207def hash_f(key, T): 208 return sum(ord(T[i % $NS]) * ord(c) for i, c in enumerate(key)) % $NG 209 210def perfect_hash(key): 211 return (G[hash_f(key, "$S1")] + 212 G[hash_f(key, "$S2")]) % $NG 213""" 214 215class IntSaltHash(object): 216 """ 217 Random hash function generator. 218 Simple byte level hashing, each byte is multiplied in sequence to a table 219 containing random numbers, summed tp, and finally modulo NG is taken. 220 """ 221 def __init__(self, N): 222 self.N = N 223 self.salt = [] 224 225 def __call__(self, key): 226 while len(self.salt) < len(key): # add more salt as necessary 227 self.salt.append(random.randint(1, self.N - 1)) 228 229 return sum(self.salt[i] * ord(c) 230 for i, c in enumerate(key)) % self.N 231 232 template = """ 233S1 = [$S1] 234S2 = [$S2] 235assert len(S1) == len(S2) == $NS 236 237def hash_f(key, T): 238 return sum(T[i % $NS] * ord(c) for i, c in enumerate(key)) % $NG 239 240def perfect_hash(key): 241 return (G[hash_f(key, S1)] + G[hash_f(key, S2)]) % $NG 242""" 243 244def builtin_template(Hash): 245 return """\ 246# ======================================================================= 247# ================= Python code for perfect hash function =============== 248# ======================================================================= 249 250G = [$G] 251""" + Hash.template + """ 252# ============================ Sanity check ============================= 253 254K = [$K] 255assert len(K) == $NK 256 257for h, k in enumerate(K): 258 assert perfect_hash(k) == h 259""" 260 261 262class TooManyInterationsError(Exception): 263 pass 264 265 266def generate_hash(keys, Hash=StrSaltHash): 267 """ 268 Return hash functions f1 and f2, and G for a perfect minimal hash. 269 Input is an iterable of 'keys', whos indicies are the desired hash values. 270 'Hash' is a random hash function generator, that means Hash(N) returns a 271 returns a random hash function which returns hash values from 0..N-1. 272 """ 273 if not isinstance(keys, (list, tuple)): 274 raise TypeError("list or tuple expected") 275 NK = len(keys) 276 if NK != len(set(keys)): 277 raise ValueError("duplicate keys") 278 for key in keys: 279 if not isinstance(key, str): 280 raise TypeError("key a not string: %r" % key) 281 if NK > 10000 and Hash == StrSaltHash: 282 print("""\ 283WARNING: You have %d keys. 284 Using --hft=1 is likely to fail for so many keys. 285 Please use --hft=2 instead. 286""" % NK) 287 288 # the number of vertices in the graph G 289 NG = NK + 1 290 if verbose: 291 print('NG = %d' % NG) 292 293 trial = 0 # Number of trial graphs so far 294 while True: 295 if (trial % trials) == 0: # trials failures, increase NG slightly 296 if trial > 0: 297 NG = max(NG + 1, int(1.05 * NG)) 298 if verbose: 299 sys.stdout.write('\nGenerating graphs NG = %d ' % NG) 300 trial += 1 301 302 if NG > 100 * (NK + 1): 303 raise TooManyInterationsError("%d keys" % NK) 304 305 if verbose: 306 sys.stdout.write('.') 307 sys.stdout.flush() 308 309 G = Graph(NG) # Create graph with NG vertices 310 f1 = Hash(NG) # Create 2 random hash functions 311 f2 = Hash(NG) 312 313 # Connect vertices given by the values of the two hash functions 314 # for each key. Associate the desired hash value with each edge. 315 for hashval, key in enumerate(keys): 316 G.connect(f1(key), f2(key), hashval) 317 318 # Try to assign the vertex values. This will fail when the graph 319 # is cyclic. But when the graph is acyclic it will succeed and we 320 # break out, because we're done. 321 if G.assign_vertex_values(): 322 break 323 324 if verbose: 325 print('\nAcyclic graph found after %d trials.' % trial) 326 print('NG = %d' % NG) 327 328 # Sanity check the result by actually verifying that all the keys 329 # hash to the right value. 330 for hashval, key in enumerate(keys): 331 assert hashval == ( 332 G.vertex_values[f1(key)] + G.vertex_values[f2(key)] 333 ) % NG 334 335 if verbose: 336 print('OK') 337 338 return f1, f2, G.vertex_values 339 340 341class Format(object): 342 343 def __init__(self, width=76, indent=4, delimiter=', '): 344 self.width = width 345 self.indent = indent 346 self.delimiter = delimiter 347 348 def print_format(self): 349 print("Format options:") 350 for name in 'width', 'indent', 'delimiter': 351 print(' %s: %r' % (name, getattr(self, name))) 352 353 def __call__(self, data, quote=False): 354 if not isinstance(data, (list, tuple)): 355 return str(data) 356 357 lendel = len(self.delimiter) 358 aux = StringIO() 359 pos = 20 360 for i, elt in enumerate(data): 361 last = bool(i == len(data) - 1) 362 363 s = ('"%s"' if quote else '%s') % elt 364 365 if pos + len(s) + lendel > self.width: 366 aux.write('\n' + (self.indent * ' ')) 367 pos = self.indent 368 369 aux.write(s) 370 pos += len(s) 371 if not last: 372 aux.write(self.delimiter) 373 pos += lendel 374 375 return '\n'.join(l.rstrip() for l in aux.getvalue().split('\n')) 376 377 378def generate_code(keys, Hash=StrSaltHash, template=None, options=None): 379 """ 380 Takes a list of key value pairs and inserts the generated parameter 381 lists into the 'template' string. 'Hash' is the random hash function 382 generator, and the optional keywords are formating options. 383 The return value is the substituted code template. 384 """ 385 f1, f2, G = generate_hash(keys, Hash) 386 387 assert f1.N == f2.N == len(G) 388 try: 389 salt_len = len(f1.salt) 390 assert salt_len == len(f2.salt) 391 except TypeError: 392 salt_len = None 393 394 if template is None: 395 template = builtin_template(Hash) 396 397 if options is None: 398 fmt = Format() 399 else: 400 fmt = Format(width=options.width, indent=options.indent, 401 delimiter=options.delimiter) 402 403 if verbose: 404 fmt.print_format() 405 406 return string.Template(template).substitute( 407 NS = salt_len, 408 S1 = fmt(f1.salt), 409 S2 = fmt(f2.salt), 410 NG = len(G), 411 G = fmt(G), 412 NK = len(keys), 413 K = fmt(list(keys), quote=True)) 414 415 416def read_table(filename, options): 417 """ 418 Reads keys and desired hash value pairs from a file. If no column 419 for the hash value is specified, a sequence of hash values is generated, 420 from 0 to N-1, where N is the number of rows found in the file. 421 """ 422 if verbose: 423 print("Reading table from file `%s' to extract keys." % filename) 424 try: 425 fi = open(filename) 426 except IOError: 427 sys.exit("Error: Could not open `%s' for reading." % filename) 428 429 keys = [] 430 431 if verbose: 432 print("Reader options:") 433 for name in 'comment', 'splitby', 'keycol': 434 print(' %s: %r' % (name, getattr(options, name))) 435 436 for n, line in enumerate(fi): 437 line = line.strip() 438 if not line or line.startswith(options.comment): 439 continue 440 441 if line.count(options.comment): # strip content after comment 442 line = line.split(options.comment)[0].strip() 443 444 row = [col.strip() for col in line.split(options.splitby)] 445 446 try: 447 key = row[options.keycol - 1] 448 except IndexError: 449 sys.exit("%s:%d: Error: Cannot read key, not enough columns." % 450 (filename, n + 1)) 451 452 keys.append(key) 453 454 fi.close() 455 456 if not keys: 457 exit("Error: no keys found in file `%s'." % filename) 458 459 return keys 460 461 462def read_template(filename): 463 if verbose: 464 print("Reading template from file `%s'" % filename) 465 try: 466 with open(filename, 'r') as fi: 467 return fi.read() 468 except IOError: 469 sys.exit("Error: Could not open `%s' for reading." % filename) 470 471 472def run_code(code): 473 tmpdir = tempfile.mkdtemp() 474 path = join(tmpdir, 't.py') 475 with open(path, 'w') as fo: 476 fo.write(code) 477 try: 478 subprocess.check_call([sys.executable, path]) 479 except subprocess.CalledProcessError as e: 480 raise AssertionError(e) 481 finally: 482 shutil.rmtree(tmpdir) 483 484 485def main(): 486 from optparse import OptionParser 487 488 usage = "usage: %prog [options] KEYS_FILE [TMPL_FILE]" 489 490 description = """\ 491Generates code for perfect hash functions from 492a file with keywords and a code template. 493If no template file is provided, a small built-in Python template 494is processed and the output code is written to stdout. 495""" 496 497 parser = OptionParser(usage = usage, 498 description = description, 499 prog = sys.argv[0], 500 version = "%prog: " + __version__) 501 502 parser.add_option("--delimiter", 503 action = "store", 504 default = ", ", 505 help = "Delimiter for list items used in output, " 506 "the default delimiter is '%default'", 507 metavar = "STR") 508 509 parser.add_option("--indent", 510 action = "store", 511 default = 4, 512 type = "int", 513 help = "Make INT spaces at the beginning of a " 514 "new line when generated list is wrapped. " 515 "Default is %default", 516 metavar = "INT") 517 518 parser.add_option("--width", 519 action = "store", 520 default = 76, 521 type = "int", 522 help = "Maximal width of generated list when " 523 "wrapped. Default width is %default", 524 metavar = "INT") 525 526 parser.add_option("--comment", 527 action = "store", 528 default = "#", 529 help = "STR is the character, or sequence of " 530 "characters, which marks the beginning " 531 "of a comment (which runs till " 532 "the end of the line), in the input " 533 "KEYS_FILE. " 534 "Default is '%default'", 535 metavar = "STR") 536 537 parser.add_option("--splitby", 538 action = "store", 539 default = ",", 540 help = "STR is the character by which the columns " 541 "in the input KEYS_FILE are split. " 542 "Default is '%default'", 543 metavar = "STR") 544 545 parser.add_option("--keycol", 546 action = "store", 547 default = 1, 548 type = "int", 549 help = "Specifies the column INT in the input " 550 "KEYS_FILE which contains the keys. " 551 "Default is %default, i.e. the first column.", 552 metavar = "INT") 553 554 parser.add_option("--trials", 555 action = "store", 556 default = 5, 557 type = "int", 558 help = "Specifies the number of trials before " 559 "NG is increased. A small INT will give " 560 "compute faster, but the array G will be " 561 "large. A large INT will take longer to " 562 "compute but G will be smaller. " 563 "Default is %default", 564 metavar = "INT") 565 566 parser.add_option("--hft", 567 action = "store", 568 default = 1, 569 type = "int", 570 help = "Hash function type INT. Possible values " 571 "are 1 (StrSaltHash) and 2 (IntSaltHash). " 572 "The default is %default", 573 metavar = "INT") 574 575 parser.add_option("-e", "--execute", 576 action = "store_true", 577 help = "Execute the generated code within " 578 "the Python interpreter.") 579 580 parser.add_option("-o", "--output", 581 action = "store", 582 help = "Specify output FILE explicitly. " 583 "`-o std' means standard output. " 584 "`-o no' means no output. " 585 "By default, the file name is obtained " 586 "from the name of the template file by " 587 "substituting `tmpl' to `code'.", 588 metavar = "FILE") 589 590 parser.add_option("-v", "--verbose", 591 action = "store_true", 592 help = "verbosity") 593 594 options, args = parser.parse_args() 595 596 if options.trials <= 0: 597 parser.error("trials before increasing N has to be larger than zero") 598 599 global trials 600 trials = options.trials 601 602 global verbose 603 verbose = options.verbose 604 605 if len(args) not in (1, 2): 606 parser.error("incorrect number of arguments") 607 608 if len(args) == 2 and not args[1].count('tmpl'): 609 parser.error("template filename does not contain 'tmpl'") 610 611 if options.hft == 1: 612 Hash = StrSaltHash 613 elif options.hft == 2: 614 Hash = IntSaltHash 615 else: 616 parser.error("Hash function %s not implemented." % options.hft) 617 618 # --------------------- end parsing and checking -------------- 619 620 keys_file = args[0] 621 622 if verbose: 623 print("keys_file = %r" % keys_file) 624 625 keys = read_table(keys_file, options) 626 627 if verbose: 628 print("Number os keys: %d" % len(keys)) 629 630 tmpl_file = args[1] if len(args) == 2 else None 631 632 if verbose: 633 print("tmpl_file = %r" % tmpl_file) 634 635 template = read_template(tmpl_file) if tmpl_file else None 636 637 if options.output: 638 outname = options.output 639 else: 640 if tmpl_file: 641 if 'tmpl' not in tmpl_file: 642 sys.exit("Hmm, template filename does not contain 'tmpl'") 643 outname = tmpl_file.replace('tmpl', 'code') 644 else: 645 outname = 'std' 646 647 if verbose: 648 print("outname = %r\n" % outname) 649 650 if outname == 'std': 651 outstream = sys.stdout 652 elif outname == 'no': 653 outstream = None 654 else: 655 try: 656 outstream = open(outname, 'w') 657 except IOError: 658 sys.exit("Error: Could not open `%s' for writing." % outname) 659 660 code = generate_code(keys, Hash, template, options) 661 662 if options.execute or template == builtin_template(Hash): 663 if verbose: 664 print('Executing code...\n') 665 run_code(code) 666 667 if outstream: 668 outstream.write(code) 669 if not outname == 'std': 670 outstream.close() 671 672 673if __name__ == '__main__': 674 main() 675