#!/usr/bin/env python3
"""
Utility for parsing HTML5 entity definitions available from:

    https://html.spec.whatwg.org/entities.json
    https://html.spec.whatwg.org/multipage/named-characters.html

The page now contains the following note:

    "This list is static and will not be expanded or changed in the future."

Written by Ezio Melotti and Iuliia Proskurnia.
"""

import os
import sys
import json
from urllib.request import urlopen
from html.entities import html5

SCRIPT_NAME = 'Tools/build/parse_html5_entities.py'
PAGE_URL = 'https://html.spec.whatwg.org/multipage/named-characters.html'
ENTITIES_URL = 'https://html.spec.whatwg.org/entities.json'
ENTITIES_FILE = 'Lib/html/entities.py'
HTML5_SECTION_START = '# HTML5 named character references'


def get_json(url):
    """Download the JSON document at *url* and return the decoded object."""
    with urlopen(url) as response:
        payload = response.read().decode('utf-8')
    return json.loads(payload)


def create_dict(entities):
    """Create the html5 dict from the decoded JSON object.

    Each key of *entities* is an entity reference including its leading
    '&'; each value is a mapping that has a 'characters' item.  The
    returned dict maps the name without the '&' to those characters.
    """
    return {
        name.lstrip('&'): value['characters']
        for name, value in entities.items()
    }


def compare_dicts(old, new):
    """Print the entities added to, removed from or modified in *new*.

    Nothing is printed for a category that has no differences.
    """
    added = new.keys() - old.keys()
    if added:
        print('{} entitie(s) have been added:'.format(len(added)))
        for name in sorted(added):
            print('  {!r}: {!r}'.format(name, new[name]))
    removed = old.keys() - new.keys()
    if removed:
        print('{} entitie(s) have been removed:'.format(len(removed)))
        for name in sorted(removed):
            print('  {!r}: {!r}'.format(name, old[name]))
    changed = sorted(
        (name, old[name], new[name])
        for name in old.keys() & new.keys()
        if old[name] != new[name]
    )
    if changed:
        print('{} entitie(s) have been modified:'.format(len(changed)))
        for item in changed:
            print('  {!r}: {!r} -> {!r}'.format(*item))


def write_items(entities, file=sys.stdout):
    """Write the html5 section (header comment and dict) to *file*."""
    # The keys in the generated dictionary should be sorted
    # in a case-insensitive way, however, when two keys are equal,
    # the uppercase version should come first so that the result
    # looks like: ['Aacute', 'aacute', 'Aacute;', 'aacute;', ...]
    # To do this we first sort in a case-sensitive way (so all the
    # uppercase chars come first) and then sort with key=str.lower.
    # Since the sorting is stable the uppercase keys will eventually
    # be before their equivalent lowercase version.
    keys = sorted(entities.keys())
    keys = sorted(keys, key=str.lower)
    print(HTML5_SECTION_START, file=file)
    print(f'# Generated by {SCRIPT_NAME}\n'
          f'# from {ENTITIES_URL} and\n'
          f'# {PAGE_URL}.\n'
          f'# Map HTML5 named character references to the '
          f'equivalent Unicode character(s).', file=file)
    print('html5 = {', file=file)
    for name in keys:
        # !a keeps the generated source pure ASCII.
        print(f'    {name!r}: {entities[name]!a},', file=file)
    print('}', file=file)


def patch_file(fname, entities):
    """Replace the html5 section of the file *fname* with *entities*.

    The section runs from the line starting with HTML5_SECTION_START up
    to and including the next line starting with '}'.  The new content is
    written to a temporary file next to *fname*, which then replaces
    *fname* in a single step.

    Raise ValueError, leaving *fname* untouched, if the section start or
    its closing '}' cannot be found.
    """
    temp_fname = fname + '.temp'
    found = False
    skip = False
    replaced = False
    try:
        with open(fname, encoding='utf-8') as src, \
                open(temp_fname, 'w', encoding='utf-8') as dst:
            for line in src:
                if line.startswith(HTML5_SECTION_START):
                    write_items(entities, file=dst)
                    found = True
                    skip = True
                    continue
                if skip:
                    # skip the old items until the }
                    if line.startswith('}'):
                        skip = False
                    continue
                dst.write(line)
        if not found:
            raise ValueError(
                f'{HTML5_SECTION_START!r} not found in {fname!r}')
        if skip:
            # Replacing the file now would drop everything after the marker.
            raise ValueError(
                f'closing brace of the html5 dict not found in {fname!r}')
        os.replace(temp_fname, fname)
        replaced = True
    finally:
        # Never leave a stale temporary file behind on failure.
        if not replaced and os.path.exists(temp_fname):
            os.remove(temp_fname)


def main(argv=None):
    """Run the command line tool.

    *argv* is the list of command line arguments; it defaults to
    sys.argv[1:].

    without args print a diff between html.entities.html5 and new_html5
    with --create print the new html5 dict
    with --patch patch the Lib/html/entities.py file
    """
    args = sys.argv[1:] if argv is None else argv
    new_html5 = create_dict(get_json(ENTITIES_URL))
    if '--create' in args:
        write_items(new_html5)
    elif '--patch' in args:
        patch_file(ENTITIES_FILE, new_html5)
    elif html5 == new_html5:
        print('The current dictionary is updated.')
    else:
        compare_dicts(html5, new_html5)
        print(f'Run "./python {__file__} --patch" to update '
              f'{ENTITIES_FILE} '
              f'or "./python {__file__} --create" to see the generated '
              f'dictionary.')


if __name__ == '__main__':
    main()