• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #!/usr/bin/env python3
2 """ Utility for parsing HTML entity definitions available from:
3 
4       http://www.w3.org/ as e.g.
5       http://www.w3.org/TR/REC-html40/HTMLlat1.ent
6 
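    Input is read from the file named by the first command-line argument
    (stdin if omitted); output is written to the file named by the second
    argument (stdout if omitted) in form of a Python snippet defining a
    dictionary "entitydefs" mapping literal entity name to character or
    numeric entity.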
10 
11     Marc-Andre Lemburg, mal@lemburg.com, 1999.
12     Use as you like. NO WARRANTIES.
13 
14 """
15 import re,sys
16 
# Matches one SGML declaration of the form
#   <!ENTITY name CDATA "value" -- comment -->
# capturing the entity name, the quoted value and the (possibly multi-line)
# comment text.
entityRE = re.compile(r'<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')

def parse(text,pos=0,endpos=None):
    """Extract entity declarations from *text*.

    Scans text[pos:endpos] for declarations matching ``entityRE``.

    Parameters:
        text   -- string holding the entity definitions.
        pos    -- index at which scanning starts (default 0).
        endpos -- index at which scanning stops; None means len(text).

    Returns a dictionary mapping each entity name to a tuple
    (charcode, comment), both taken verbatim from the declaration.  If a
    name is declared more than once, the last declaration wins.
    """
    # NOTE: earlier versions reset pos to 0 here, which silently discarded
    # the caller's start offset.
    if endpos is None:
        endpos = len(text)
    # finditer() resumes right after the end of the previous match, which is
    # exactly what a manual search()/m.end() loop would do.
    return {
        m.group(1): (m.group(2), m.group(3))
        for m in entityRE.finditer(text, pos, endpos)
    }
33 
def writefile(f,defs):
    """Write *defs* to *f* as Python source defining ``entitydefs``.

    f    -- object with a write() method accepting strings.
    defs -- dictionary mapping entity name to (charcode, comment), as
            returned by parse().

    Entries are emitted sorted by name.  A numeric reference of the form
    "&#NNN;" whose code is below 256 is written as a one-character string
    literal using an octal escape; every other value is written as the
    repr() of the original text.  Whitespace runs in the comment are
    collapsed to single spaces so each entry stays on one line.
    """
    emit = f.write
    emit("entitydefs = {\n")
    for name in sorted(defs):
        value, remark = defs[name]
        literal = None
        if value[:2] == '&#':
            # Strip the leading "&#" and the trailing ";".
            number = int(value[2:-1])
            if number < 256:
                literal = "'\\%o'" % number
        if literal is None:
            literal = repr(value)
        remark = ' '.join(remark.split())
        emit("    '%s':\t%s,  \t# %s\n" % (name, literal, remark))
    emit('\n}\n')
50 
if __name__ == '__main__':
    # Usage: parseentities.py [infile [outfile]]
    # Without an input file the definitions are read from stdin.
    if len(sys.argv) <= 1:
        text = sys.stdin.read()
    else:
        with open(sys.argv[1]) as infile:
            text = infile.read()

    defs = parse(text)

    # Without an output file the generated snippet goes to stdout.
    if len(sys.argv) <= 2:
        writefile(sys.stdout, defs)
    else:
        with open(sys.argv[2],'w') as outfile:
            writefile(outfile, defs)
65