"""Generate named-entity tokenizer tests and an entities constants module.

Reads an entity table from ``html5ents.xml`` and writes two files into the
current directory: ``namedEntities.test`` (JSON test cases) and
``entities_constants.py`` (a Python ``entities`` dict literal).
"""
import json

try:
    import html5lib
except ImportError:
    # Only parse() needs html5lib. Keeping the import failure out of module
    # load lets the pure helpers below be imported on their own; parse()
    # raises ImportError itself when the package is missing.
    html5lib = None

# Prefix map used by the XPath query that selects the table rows.
_XHTML_NAMESPACES = {"h": "http://www.w3.org/1999/xhtml"}


def parse(path="html5ents.xml"):
    """Parse the document at *path* with html5lib into an lxml tree.

    Raises:
        ImportError: if html5lib could not be imported.
    """
    if html5lib is None:
        raise ImportError("html5lib is required to parse %s" % path)
    # Binary mode hands html5lib the raw bytes (what Python 2's default mode
    # produced) instead of locale-decoded text. The ``with`` block closes the
    # handle, which the bare ``open(path)`` argument never did.
    with open(path, "rb") as source:
        return html5lib.parse(source, treebuilder="lxml")


def entity_table(tree):
    """Return a dict mapping entity names to their replacement characters.

    Every ``tbody/tr`` row in the XHTML namespace contributes one entry: the
    joined text of the row's first child is the name, and the ``.text`` of
    its second child is the whitespace-separated list of code points.
    """
    rows = tree.xpath("//h:tbody/h:tr", namespaces=_XHTML_NAMESPACES)
    return dict((entity_name("".join(row[0].xpath(".//text()"))),
                 entity_characters(row[1].text))
                for row in rows)


def entity_name(inp):
    """Return *inp* with surrounding whitespace removed."""
    return inp.strip()


def entity_characters(inp):
    """Convert whitespace-separated code points in *inp* into one string."""
    # A bare split() never yields empty items, so no extra filtering is needed.
    return "".join(codepoint_to_character(item) for item in inp.split())


def codepoint_to_character(inp):
    """Return the character for a code point written as a prefixed hex number.

    The first two characters of *inp* are dropped unchecked (presumably the
    ``U+`` marker - confirm against the input file) and the remainder is read
    as hexadecimal, so any number of hex digits is accepted.

    Raises:
        ValueError: if the remainder is not valid hexadecimal.
    """
    # Zero-pad to the eight digits a \U escape requires rather than assuming
    # exactly five digits follow the prefix. Going through unicode-escape
    # works the same on Python 2 and Python 3.
    escape = "\\U%08x" % int(inp[2:], 16)
    return escape.encode("ascii").decode("unicode-escape")


def make_tests_json(entities):
    """Return the ``{"tests": [...]}`` structure for *entities*."""
    return {"tests": [make_test(*item) for item in make_test_list(entities)]}


def make_test(name, characters, good):
    """Build one test case dict for the input ``&`` + *name*."""
    return {
        "description": test_description(name, good),
        "input": "&%s" % name,
        "output": test_expected(name, characters, good),
    }


def test_description(name, good):
    """Return the human-readable description of a test for *name*."""
    if name.endswith(";"):
        semicolon_text = "with a semi-colon"
    else:
        semicolon_text = "without a semi-colon"
    template = "Named entity: %s %s" if good else "Bad named entity: %s %s"
    return template % (name, semicolon_text)


def test_expected(name, characters, good):
    """Return the expected output list for a test.

    A ``"ParseError"`` entry precedes the character token when the entity is
    bad or when *name* lacks its trailing semicolon.
    """
    expected = []
    if not (good and name.endswith(";")):
        expected.append("ParseError")
    expected.append(["Character", characters])
    return expected


# Despite the ``test_`` prefix these two are builders, not test cases;
# ``__test__ = False`` keeps pytest/nose-style collectors from running them.
test_description.__test__ = False
test_expected.__test__ = False


def make_test_list(entities):
    """Return sorted ``(name, characters, good)`` tuples for *entities*.

    Every entity yields a good test. An entity ending in ``;`` that has no
    shorter prefix in *entities* also yields a bad test for the name without
    the semicolon, whose expected characters are the literal ``&`` + name.
    """
    tests = []
    # ``name`` rather than ``entity_name`` so the module-level function of
    # that name is not shadowed inside this loop.
    for name, characters in entities.items():
        if name.endswith(";") and not subentity_exists(name, entities):
            bare = name[:-1]
            tests.append((bare, "&" + bare, False))
        tests.append((name, characters, True))
    return sorted(tests)


def subentity_exists(entity_name, entities):
    """Return True if any proper, non-empty prefix of *entity_name* is a key
    of *entities*."""
    return any(entity_name[:-cut] in entities
               for cut in range(1, len(entity_name)))


def make_entities_code(entities):
    """Return Python source text defining ``entities`` as a dict literal."""
    lines = []
    for name in sorted(entities):
        # unicode-escape leaves double quotes alone, so they are escaped by
        # hand. Decoding back to text keeps replace() and %-formatting on one
        # string type under both Python 2 and Python 3.
        escaped = entities[name].encode("unicode-escape").decode("ascii")
        lines.append(" \"%s\": u\"%s\"," % (name,
                                            escaped.replace("\"", "\\\"")))
    return """entities = {
%s
}""" % "\n".join(lines)


def main():
    """Regenerate ``namedEntities.test`` and ``entities_constants.py``."""
    entities = entity_table(parse())
    tests_json = make_tests_json(entities)
    with open("namedEntities.test", "w") as test_file:
        json.dump(tests_json, test_file, indent=4)
    code = make_entities_code(entities)
    with open("entities_constants.py", "w") as code_file:
        code_file.write(code)


if __name__ == "__main__":
    main()
