#!/usr/bin/env python

# This does simple normalized frequency analysis on UTF-8 encoded text. The
# result of the analysis is translated to a ranked list, where every byte is
# assigned a rank. This list is printed to stdout as Rust source (redirect it
# into src/freqs.rs).
#
# Currently, the frequencies are generated from the following corpora:
#
#   * The CIA world fact book
#   * The source code of rustc
#   * Septuaginta

from __future__ import absolute_import, division, print_function

import argparse
from collections import Counter
import sys

preamble = '''
// NOTE: The following code was generated by "scripts/frequencies.py", do not
// edit directly
'''.lstrip()


def eprint(*args, **kwargs):
    """Print to stderr, accepting the same arguments as ``print``."""
    kwargs['file'] = sys.stderr
    print(*args, **kwargs)


def _read_corpus(paths):
    """Return the raw bytes of every file in ``paths``, in order."""
    corpus = []
    for fpath in paths:
        # Use a context manager so each handle is closed promptly instead of
        # being left for the garbage collector.
        with open(fpath, 'rb') as f:
            corpus.append(f.read())
    return corpus


def _byte_frequencies(corpus):
    """Sum the per-file normalized frequency of each byte value.

    Every file contributes ``1 / len(file)`` per occurrence of a byte, so
    each non-empty file carries the same total weight regardless of size.
    All 256 byte values are present in the result, even if never seen.
    """
    freqs = Counter()
    for i in range(0, 256):
        freqs[i] = 0

    for data in corpus:
        if not data:
            # An empty file contributes nothing (and would divide by zero).
            continue
        weight = 1.0 / float(len(data))
        # bytearray yields ints on both Python 2 and Python 3; iterating a
        # Python 2 ``str`` would yield 1-character strings instead.
        for byte in bytearray(data):
            freqs[byte] += weight
    return freqs


def _rank_bytes(freqs):
    """Translate frequencies into ranks. A lower rank => lower frequency."""
    rank = [0] * 256
    for i, (byte, _) in enumerate(freqs.most_common()):
        rank[byte] = 255 - i

    # Forcefully set the highest rank possible for bytes that start multi-byte
    # UTF-8 sequences. The idea here is that a continuation byte will be more
    # discerning in a homogeneous haystack.
    for byte in range(0xC0, 0xFF + 1):
        rank[byte] = 255
    return rank


def _render_rust(rank):
    """Format ``rank`` as the Rust ``BYTE_FREQUENCIES`` array definition."""
    olines = ['pub const BYTE_FREQUENCIES: [u8; 256] = [']
    for byte in range(256):
        olines.append(' %3d, // %r' % (rank[byte], chr(byte)))
    olines.append('];')
    return '\n'.join(olines)


def main(argv=None):
    """Read the corpus files and print the generated Rust table to stdout.

    ``argv`` is the list of command-line arguments (one or more corpus file
    paths); when None, argparse falls back to ``sys.argv[1:]``. Progress
    messages go to stderr so stdout contains only the generated code.
    """
    p = argparse.ArgumentParser()
    p.add_argument('corpus', metavar='FILE', nargs='+')
    args = p.parse_args(argv)

    eprint('reading entire corpus into memory')
    corpus = _read_corpus(args.corpus)

    eprint('computing byte frequencies')
    freqs = _byte_frequencies(corpus)

    eprint('writing Rust code')
    rank = _rank_bytes(freqs)

    print(preamble)
    print(_render_rust(rank))


if __name__ == '__main__':
    main()