#!/usr/bin/env python3
"""
Tools to parse data files from the Unicode Character Database.
"""


try:
    from urllib.request import urlopen
except ImportError:
    from urllib2 import urlopen
from contextlib import closing, contextmanager
import re
from codecs import iterdecode
import logging
import os
from io import open
from os.path import abspath, dirname, join as pjoin, pardir, sep


try:  # pragma: no cover
    unicode
except NameError:
    unicode = str


UNIDATA_URL = "https://unicode.org/Public/UNIDATA/"
UNIDATA_LICENSE_URL = "http://unicode.org/copyright.html#License"

# by default save output files to ../Lib/fontTools/unicodedata/
UNIDATA_PATH = pjoin(abspath(dirname(__file__)), pardir,
                     "Lib", "fontTools", "unicodedata") + sep

SRC_ENCODING = "# -*- coding: utf-8 -*-\n"

NOTICE = "# NOTE: This file was auto-generated with MetaTools/buildUCD.py.\n"

MAX_UNICODE = 0x10FFFF

log = logging.getLogger()


@contextmanager
def open_unidata_file(filename):
    """Open a text file from https://unicode.org/Public/UNIDATA/"""
    url = UNIDATA_URL + filename
    with closing(urlopen(url)) as response:
        yield iterdecode(response, encoding="utf-8")


def parse_unidata_header(infile):
    """Read the top header of data files, until the first line
    that does not start with '#'.
    """
    header = []
    line = next(infile)
    while line.startswith("#"):
        header.append(line)
        line = next(infile)
    return "".join(header)
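

# Illustrative sketch only (not used by the build): how the two helpers above
# combine to stream a UCD file and skip its comment header. "Blocks.txt" is
# just an example file name; calling this performs a network download.
def _example_read_remote_file():  # pragma: no cover
    with open_unidata_file("Blocks.txt") as infile:
        header = parse_unidata_header(infile)
        # parse_unidata_header() consumes lines up to and including the first
        # one that does not start with '#'; the remaining lines of 'infile'
        # can then be handed to one of the parsers below.
    return header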


def parse_range_properties(infile, default=None, is_set=False):
    """Parse a Unicode data file containing a column with one character or
    a range of characters, and another column containing a property value
    separated by a semicolon. Comments after '#' are ignored.

    If the ranges defined in the data file are not continuous, assign the
    'default' property to the unassigned codepoints.

    Return a list of (start, end, property_name) tuples.
    """
    ranges = []
    line_regex = re.compile(
        r"^"
        r"([0-9A-F]{4,6})"  # first character code
        r"(?:\.\.([0-9A-F]{4,6}))?"  # optional second character code
        r"\s*;\s*"
        r"([^#]+)")  # everything up to the potential comment
    for line in infile:
        match = line_regex.match(line)
        if not match:
            continue

        first, last, data = match.groups()
        if last is None:
            last = first

        first = int(first, 16)
        last = int(last, 16)
        data = str(data.rstrip())

        ranges.append((first, last, data))

    ranges.sort()

    if isinstance(default, unicode):
        default = str(default)

    # fill the gaps between explicitly defined ranges
    last_start, last_end = -1, -1
    full_ranges = []
    for start, end, value in ranges:
        assert last_end < start
        assert start <= end
        if start - last_end > 1:
            full_ranges.append((last_end+1, start-1, default))
        if is_set:
            value = set(value.split())
        full_ranges.append((start, end, value))
        last_start, last_end = start, end
    if last_end != MAX_UNICODE:
        full_ranges.append((last_end+1, MAX_UNICODE, default))

    # reduce total number of ranges by combining continuous ones
    last_start, last_end, last_value = full_ranges.pop(0)
    merged_ranges = []
    for start, end, value in full_ranges:
        if value == last_value:
            continue
        else:
            merged_ranges.append((last_start, start-1, last_value))
            last_start, last_end, last_value = start, end, value
    merged_ranges.append((last_start, MAX_UNICODE, last_value))

    # make sure that the ranges cover the full unicode repertoire
    assert merged_ranges[0][0] == 0
    for (cs, ce, cv), (ns, ne, nv) in zip(merged_ranges, merged_ranges[1:]):
        assert ce+1 == ns
    assert merged_ranges[-1][1] == MAX_UNICODE

    return merged_ranges


def parse_semicolon_separated_data(infile):
    """Parse a Unicode data file where each line contains a list of values
    separated by a semicolon (e.g. "PropertyValueAliases.txt").
    The number of values may differ from line to line.

    Returns a list of lists, each containing the values as strings.
    """
    data = []
    for line in infile:
        line = line.split('#', 1)[0].strip()  # remove the comment
        if not line:
            continue
        fields = [str(field.strip()) for field in line.split(';')]
        data.append(fields)
    return data


def _set_repr(value):
    return 'None' if value is None else "{{{}}}".format(
        ", ".join(repr(v) for v in sorted(value)))
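

# Illustrative sketch only (not used by the build): what the parsers above
# return for tiny in-memory inputs. The sample lines below are made up for
# demonstration purposes and are not real UCD data.
def _example_parse_in_memory():  # pragma: no cover
    sample_ranges = [
        "0041..005A ; Latin # A..Z\n",
        "0061..007A ; Latin\n",
    ]
    ranges = parse_range_properties(iter(sample_ranges), default="Unknown")
    # 'ranges' covers the full repertoire, with "Unknown" filling the gaps
    # and adjacent ranges sharing the same value merged together:
    #   [(0x0000, 0x0040, 'Unknown'),
    #    (0x0041, 0x005A, 'Latin'),
    #    (0x005B, 0x0060, 'Unknown'),
    #    (0x0061, 0x007A, 'Latin'),
    #    (0x007B, 0x10FFFF, 'Unknown')]

    sample_aliases = ["sc ; Latn      ; Latin\n"]
    fields = parse_semicolon_separated_data(iter(sample_aliases))
    # 'fields' is [['sc', 'Latn', 'Latin']]
    return ranges, fields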
170 """ 171 modname = os.path.splitext(filename)[0] + ".py" 172 if not output_path: 173 output_path = UNIDATA_PATH + modname 174 175 if local_ucd: 176 log.info("loading '%s' from local directory '%s'", filename, local_ucd) 177 cm = open(pjoin(local_ucd, filename), "r", encoding="utf-8") 178 else: 179 log.info("downloading '%s' from '%s'", filename, UNIDATA_URL) 180 cm = open_unidata_file(filename) 181 182 with cm as f: 183 header = parse_unidata_header(f) 184 ranges = parse_range_properties(f, default=default, is_set=is_set) 185 186 if aliases: 187 reversed_aliases = {normalize(v[0]): k for k, v in aliases.items()} 188 max_value_length = 6 # 4-letter tags plus two quotes for repr 189 else: 190 max_value_length = min(56, max(len(repr(v)) for _, _, v in ranges)) 191 192 with open(output_path, "w", encoding="utf-8") as f: 193 f.write(SRC_ENCODING) 194 f.write("#\n") 195 f.write(NOTICE) 196 f.write("# Source: {}{}\n".format(UNIDATA_URL, filename)) 197 f.write("# License: {}\n".format(UNIDATA_LICENSE_URL)) 198 f.write("#\n") 199 f.write(header+"\n\n") 200 201 f.write("RANGES = [\n") 202 for first, last, value in ranges: 203 f.write(" 0x{:0>4X}, # .. 0x{:0>4X} ; {}\n".format( 204 first, last, _set_repr(value) if is_set else value)) 205 f.write("]\n") 206 207 f.write("\n") 208 f.write("VALUES = [\n") 209 for first, last, value in ranges: 210 comment = "# {:0>4X}..{:0>4X}".format(first, last) 211 if is_set: 212 value_repr = "{},".format(_set_repr(value)) 213 else: 214 if aliases: 215 # append long name to comment and use the short code 216 comment += " ; {}".format(value) 217 value = reversed_aliases[normalize(value)] 218 value_repr = "{!r},".format(value) 219 f.write(" {} {}\n".format( 220 value_repr.ljust(max_value_length+1), comment)) 221 f.write("]\n") 222 223 if aliases: 224 f.write("\n") 225 f.write("NAMES = {\n") 226 for value, names in sorted(aliases.items()): 227 # we only write the first preferred alias 228 f.write(" {!r}: {!r},\n".format(value, names[0])) 229 f.write("}\n") 230 231 log.info("saved new file: '%s'", os.path.normpath(output_path)) 232 233 234_normalize_re = re.compile(r"[-_ ]+") 235 236def normalize(string): 237 """Remove case, strip space, '-' and '_' for loose matching.""" 238 return _normalize_re.sub("", string).lower() 239 240 241def parse_property_value_aliases(property_tag, local_ucd=None): 242 """Fetch the current 'PropertyValueAliases.txt' from the Unicode website, 243 parse the values for the specified 'property_tag' and return a dictionary 244 of name aliases (list of strings) keyed by short value codes (strings). 245 246 To load the data file from a local directory, you can use the 247 'local_ucd' argument. 
248 """ 249 filename = "PropertyValueAliases.txt" 250 if local_ucd: 251 log.info("loading '%s' from local directory '%s'", filename, local_ucd) 252 cm = open(pjoin(local_ucd, filename), "r", encoding="utf-8") 253 else: 254 log.info("downloading '%s' from '%s'", filename, UNIDATA_URL) 255 cm = open_unidata_file(filename) 256 257 with cm as f: 258 header = parse_unidata_header(f) 259 data = parse_semicolon_separated_data(f) 260 261 aliases = {item[1]: item[2:] for item in data 262 if item[0] == property_tag} 263 264 return aliases 265 266 267def main(): 268 import argparse 269 270 parser = argparse.ArgumentParser( 271 description="Generate fontTools.unicodedata from UCD data files") 272 parser.add_argument( 273 '--ucd-path', help="Path to local folder containing UCD data files") 274 parser.add_argument('-q', '--quiet', action="store_true") 275 options = parser.parse_args() 276 277 level = "WARNING" if options.quiet else "INFO" 278 logging.basicConfig(level=level, format="%(message)s") 279 280 build_ranges("Blocks.txt", local_ucd=options.ucd_path, default="No_Block") 281 282 script_aliases = parse_property_value_aliases("sc", options.ucd_path) 283 build_ranges("Scripts.txt", local_ucd=options.ucd_path, default="Unknown", 284 aliases=script_aliases) 285 build_ranges("ScriptExtensions.txt", local_ucd=options.ucd_path, 286 is_set=True) 287 288 289if __name__ == "__main__": 290 import sys 291 sys.exit(main()) 292