• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1import json
2
3import html5lib
4
def parse(path="html5ents.xml"):
    """Parse the entity table document at *path* into an lxml tree.

    Args:
        path: File name of the markup document to read; defaults to
            ``html5ents.xml`` in the current directory.

    Returns:
        Whatever ``html5lib.parse`` produces with the ``lxml`` tree builder.
    """
    # parse() reads the stream to completion before returning, so the handle
    # can be closed right away instead of lingering until garbage collection.
    with open(path) as source:
        return html5lib.parse(source, treebuilder="lxml")
7
def entity_table(tree):
    """Build a mapping of entity name to its character string.

    Every ``tr`` found under a ``tbody`` in the XHTML namespace contributes
    one entry: the concatenated text of its first child, passed through
    ``entity_name``, is the key, and the ``.text`` of its second child,
    passed through ``entity_characters``, is the value.
    """
    xhtml = {"h": "http://www.w3.org/1999/xhtml"}
    table = {}
    for row in tree.xpath("//h:tbody/h:tr", namespaces=xhtml):
        name_text = "".join(row[0].xpath(".//text()"))
        key = entity_name(name_text)
        table[key] = entity_characters(row[1].text)
    return table
13
def entity_name(inp):
    """Return *inp* with leading and trailing whitespace removed."""
    trimmed = inp.strip()
    return trimmed
16
def entity_characters(inp):
    """Turn whitespace-separated code point tokens into one string.

    Each token of *inp* is converted with ``codepoint_to_character`` and the
    results are concatenated in their original order.
    """
    # split() with no separator never yields empty tokens, so no extra
    # filtering of the pieces is required.
    pieces = [codepoint_to_character(token) for token in inp.split()]
    return "".join(pieces)
21
def codepoint_to_character(inp):
    """Convert a ``U+XXXX``-style code point token into its character.

    Args:
        inp: Text whose first two characters are a prefix to discard
            (e.g. ``U+``), followed by one to eight hexadecimal digits.

    Returns:
        The character for that code point as a text (unicode) string.
    """
    # A \U escape needs exactly eight hex digits; pad to that width rather
    # than assuming the input always carries five.
    escape = "\\U" + inp[2:].zfill(8)
    # Going through ASCII bytes keeps the unicode-escape decode valid on
    # both Python 2 (str) and Python 3 (bytes).
    return escape.encode("ascii").decode("unicode-escape")
24
def make_tests_json(entities):
    """Wrap the generated test cases in a ``{"tests": [...]}`` structure."""
    cases = []
    for case_args in make_test_list(entities):
        cases.append(make_test(*case_args))
    return {"tests": cases}
31
def make_test(name, characters, good):
    """Assemble one test-case dict with description, input and output."""
    case = {}
    case["description"] = test_description(name, good)
    # The input is the entity reference exactly as it would appear in markup.
    case["input"] = "&%s" % name
    case["output"] = test_expected(name, characters, good)
    return case
38
def test_description(name, good):
    """Return the human-readable description for one entity test case.

    Args:
        name: Entity name as it appears after ``&``, with or without a
            trailing ``;``.
        good: True for a valid entity reference, False for a bad one.

    Returns:
        ``"Named entity: ..."`` or ``"Bad named entity: ..."`` followed by
        the name and a note on whether it ends with a semi-colon.
    """
    if name.endswith(";"):
        semicolon_text = "with a semi-colon"
    else:
        semicolon_text = "without a semi-colon"
    prefix = "Named entity" if good else "Bad named entity"
    return "%s: %s %s" % (prefix, name, semicolon_text)


# A helper, not a test case: the ``test_`` prefix would otherwise make
# name-based collectors such as pytest try to run it.
test_description.__test__ = False
48
def test_expected(name, characters, good):
    """Return the expected output list for one entity test case.

    Args:
        name: Entity name as it appears after ``&``.
        characters: Text expected in the ``Character`` entry.
        good: True for a valid entity reference, False for a bad one.

    Returns:
        A list ending with ``["Character", characters]``, preceded by
        ``"ParseError"`` unless the entity is good and ends with ``;``.
    """
    expected = []
    # Only a valid, semicolon-terminated reference is error-free.
    if not (good and name.endswith(";")):
        expected.append("ParseError")
    expected.append(["Character", characters])
    return expected


# A helper, not a test case: the ``test_`` prefix would otherwise make
# name-based collectors such as pytest try to run it.
test_expected.__test__ = False
55
def make_test_list(entities):
    """Return the sorted ``(name, characters, good)`` tuples to test.

    Every entity yields a good case. An entity ending in ``;`` for which
    ``subentity_exists`` finds no shorter entity additionally yields a bad
    case: the name without its ``;``, expected to come back as the literal
    ``&`` plus that name.
    """
    cases = []
    for full_name, characters in entities.items():
        needs_bad_case = (full_name.endswith(";")
                          and not subentity_exists(full_name, entities))
        if needs_bad_case:
            bare = full_name[:-1]
            cases.append((bare, "&" + bare, False))
        cases.append((full_name, characters, True))
    cases.sort()
    return cases
63
def subentity_exists(entity_name, entities):
    """Tell whether a proper, non-empty prefix of *entity_name* is in *entities*."""
    # Walk prefix lengths from longest (all but the last character) down to 1.
    longest = len(entity_name) - 1
    return any(entity_name[:end] in entities
               for end in range(longest, 0, -1))
69
def make_entities_code(entities):
    """Render *entities* as Python source defining an ``entities`` dict.

    Args:
        entities: Mapping of entity name to its character string.

    Returns:
        Source text of the form ``entities = {...}`` with one
        ``"name": u"escaped",`` line per entity, sorted by name. Values are
        unicode-escaped so the generated text is pure ASCII.
    """
    lines = []
    for name in sorted(entities):
        # unicode-escape yields ASCII bytes; decode them back to text so the
        # quote escaping and %-formatting below work on Python 3 as well as
        # on Python 2.
        escaped = entities[name].encode("unicode-escape").decode("ascii")
        escaped = escaped.replace("\"", "\\\"")
        lines.append("    \"%s\": u\"%s\"," % (name, escaped))
    return "entities = {\n%s\n}" % "\n".join(lines)
78
def main():
    """Regenerate ``namedEntities.test`` and ``entities_constants.py``.

    Reads the entity table via ``parse()`` with its default path and writes
    both output files relative to the current working directory.
    """
    entities = entity_table(parse())
    tests_json = make_tests_json(entities)
    # Context managers make sure each output is flushed and closed, even if
    # writing fails part-way through.
    with open("namedEntities.test", "w") as tests_file:
        json.dump(tests_json, tests_file, indent=4)
    code = make_entities_code(entities)
    with open("entities_constants.py", "w") as code_file:
        code_file.write(code)
85
# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
88
89