1#!/usr/bin/env python 2""" 3Tools to parse data files from the Unicode Character Database. 4""" 5 6from __future__ import print_function, absolute_import, division 7from __future__ import unicode_literals 8 9try: 10 from urllib.request import urlopen 11except ImportError: 12 from urllib2 import urlopen 13from contextlib import closing, contextmanager 14import re 15from codecs import iterdecode 16import logging 17import os 18from io import open 19from os.path import abspath, dirname, join as pjoin, pardir, sep 20 21 22try: # pragma: no cover 23 unicode 24except NameError: 25 unicode = str 26 27 28UNIDATA_URL = "https://unicode.org/Public/UNIDATA/" 29UNIDATA_LICENSE_URL = "http://unicode.org/copyright.html#License" 30 31# by default save output files to ../Lib/fontTools/unicodedata/ 32UNIDATA_PATH = pjoin(abspath(dirname(__file__)), pardir, 33 "Lib", "fontTools", "unicodedata") + sep 34 35SRC_ENCODING = "# -*- coding: utf-8 -*-\n" 36 37NOTICE = "# NOTE: This file was auto-generated with MetaTools/buildUCD.py.\n" 38 39MAX_UNICODE = 0x10FFFF 40 41log = logging.getLogger() 42 43 44@contextmanager 45def open_unidata_file(filename): 46 """Open a text file from https://unicode.org/Public/UNIDATA/""" 47 url = UNIDATA_URL + filename 48 with closing(urlopen(url)) as response: 49 yield iterdecode(response, encoding="utf-8") 50 51 52def parse_unidata_header(infile): 53 """Read the top header of data files, until the first line 54 that does not start with '#'. 55 """ 56 header = [] 57 line = next(infile) 58 while line.startswith("#"): 59 header.append(line) 60 line = next(infile) 61 return "".join(header) 62 63 64def parse_range_properties(infile, default=None, is_set=False): 65 """Parse a Unicode data file containing a column with one character or 66 a range of characters, and another column containing a property value 67 separated by a semicolon. Comments after '#' are ignored. 68 69 If the ranges defined in the data file are not continuous, assign the 70 'default' property to the unassigned codepoints. 71 72 Return a list of (start, end, property_name) tuples. 73 """ 74 ranges = [] 75 line_regex = re.compile( 76 r"^" 77 r"([0-9A-F]{4,6})" # first character code 78 r"(?:\.\.([0-9A-F]{4,6}))?" # optional second character code 79 r"\s*;\s*" 80 r"([^#]+)") # everything up to the potential comment 81 for line in infile: 82 match = line_regex.match(line) 83 if not match: 84 continue 85 86 first, last, data = match.groups() 87 if last is None: 88 last = first 89 90 first = int(first, 16) 91 last = int(last, 16) 92 data = str(data.rstrip()) 93 94 ranges.append((first, last, data)) 95 96 ranges.sort() 97 98 if isinstance(default, unicode): 99 default = str(default) 100 101 # fill the gaps between explicitly defined ranges 102 last_start, last_end = -1, -1 103 full_ranges = [] 104 for start, end, value in ranges: 105 assert last_end < start 106 assert start <= end 107 if start - last_end > 1: 108 full_ranges.append((last_end+1, start-1, default)) 109 if is_set: 110 value = set(value.split()) 111 full_ranges.append((start, end, value)) 112 last_start, last_end = start, end 113 if last_end != MAX_UNICODE: 114 full_ranges.append((last_end+1, MAX_UNICODE, default)) 115 116 # reduce total number of ranges by combining continuous ones 117 last_start, last_end, last_value = full_ranges.pop(0) 118 merged_ranges = [] 119 for start, end, value in full_ranges: 120 if value == last_value: 121 continue 122 else: 123 merged_ranges.append((last_start, start-1, last_value)) 124 last_start, line_end, last_value = start, end, value 125 merged_ranges.append((last_start, MAX_UNICODE, last_value)) 126 127 # make sure that the ranges cover the full unicode repertoire 128 assert merged_ranges[0][0] == 0 129 for (cs, ce, cv), (ns, ne, nv) in zip(merged_ranges, merged_ranges[1:]): 130 assert ce+1 == ns 131 assert merged_ranges[-1][1] == MAX_UNICODE 132 133 return merged_ranges 134 135 136def parse_semicolon_separated_data(infile): 137 """Parse a Unicode data file where each line contains a lists of values 138 separated by a semicolon (e.g. "PropertyValueAliases.txt"). 139 The number of the values on different lines may be different. 140 141 Returns a list of lists each containing the values as strings. 142 """ 143 data = [] 144 for line in infile: 145 line = line.split('#', 1)[0].strip() # remove the comment 146 if not line: 147 continue 148 fields = [str(field.strip()) for field in line.split(';')] 149 data.append(fields) 150 return data 151 152 153def _set_repr(value): 154 return 'None' if value is None else "{{{}}}".format( 155 ", ".join(repr(v) for v in sorted(value))) 156 157 158def build_ranges(filename, local_ucd=None, output_path=None, 159 default=None, is_set=False, aliases=None): 160 """Fetch 'filename' UCD data file from Unicode official website, parse 161 the property ranges and values and write them as two Python lists 162 to 'fontTools.unicodedata.<filename>.py'. 163 164 'aliases' is an optional mapping of property codes (short names) to long 165 name aliases (list of strings, with the first item being the preferred 166 alias). When this is provided, the property values are written using the 167 short notation, and an additional 'NAMES' dict with the aliases is 168 written to the output module. 169 170 To load the data file from a local directory, you can use the 171 'local_ucd' argument. 172 """ 173 modname = os.path.splitext(filename)[0] + ".py" 174 if not output_path: 175 output_path = UNIDATA_PATH + modname 176 177 if local_ucd: 178 log.info("loading '%s' from local directory '%s'", filename, local_ucd) 179 cm = open(pjoin(local_ucd, filename), "r", encoding="utf-8") 180 else: 181 log.info("downloading '%s' from '%s'", filename, UNIDATA_URL) 182 cm = open_unidata_file(filename) 183 184 with cm as f: 185 header = parse_unidata_header(f) 186 ranges = parse_range_properties(f, default=default, is_set=is_set) 187 188 if aliases: 189 reversed_aliases = {normalize(v[0]): k for k, v in aliases.items()} 190 max_value_length = 6 # 4-letter tags plus two quotes for repr 191 else: 192 max_value_length = min(56, max(len(repr(v)) for _, _, v in ranges)) 193 194 with open(output_path, "w", encoding="utf-8") as f: 195 f.write(SRC_ENCODING) 196 f.write("#\n") 197 f.write(NOTICE) 198 f.write("# Source: {}{}\n".format(UNIDATA_URL, filename)) 199 f.write("# License: {}\n".format(UNIDATA_LICENSE_URL)) 200 f.write("#\n") 201 f.write(header+"\n\n") 202 203 f.write("RANGES = [\n") 204 for first, last, value in ranges: 205 f.write(" 0x{:0>4X}, # .. 0x{:0>4X} ; {}\n".format( 206 first, last, _set_repr(value) if is_set else value)) 207 f.write("]\n") 208 209 f.write("\n") 210 f.write("VALUES = [\n") 211 for first, last, value in ranges: 212 comment = "# {:0>4X}..{:0>4X}".format(first, last) 213 if is_set: 214 value_repr = "{},".format(_set_repr(value)) 215 else: 216 if aliases: 217 # append long name to comment and use the short code 218 comment += " ; {}".format(value) 219 value = reversed_aliases[normalize(value)] 220 value_repr = "{!r},".format(value) 221 f.write(" {} {}\n".format( 222 value_repr.ljust(max_value_length+1), comment)) 223 f.write("]\n") 224 225 if aliases: 226 f.write("\n") 227 f.write("NAMES = {\n") 228 for value, names in sorted(aliases.items()): 229 # we only write the first preferred alias 230 f.write(" {!r}: {!r},\n".format(value, names[0])) 231 f.write("}\n") 232 233 log.info("saved new file: '%s'", os.path.normpath(output_path)) 234 235 236_normalize_re = re.compile(r"[-_ ]+") 237 238def normalize(string): 239 """Remove case, strip space, '-' and '_' for loose matching.""" 240 return _normalize_re.sub("", string).lower() 241 242 243def parse_property_value_aliases(property_tag, local_ucd=None): 244 """Fetch the current 'PropertyValueAliases.txt' from the Unicode website, 245 parse the values for the specified 'property_tag' and return a dictionary 246 of name aliases (list of strings) keyed by short value codes (strings). 247 248 To load the data file from a local directory, you can use the 249 'local_ucd' argument. 250 """ 251 filename = "PropertyValueAliases.txt" 252 if local_ucd: 253 log.info("loading '%s' from local directory '%s'", filename, local_ucd) 254 cm = open(pjoin(local_ucd, filename), "r", encoding="utf-8") 255 else: 256 log.info("downloading '%s' from '%s'", filename, UNIDATA_URL) 257 cm = open_unidata_file(filename) 258 259 with cm as f: 260 header = parse_unidata_header(f) 261 data = parse_semicolon_separated_data(f) 262 263 aliases = {item[1]: item[2:] for item in data 264 if item[0] == property_tag} 265 266 return aliases 267 268 269def main(): 270 import argparse 271 272 parser = argparse.ArgumentParser( 273 description="Generate fontTools.unicodedata from UCD data files") 274 parser.add_argument( 275 '--ucd-path', help="Path to local folder containing UCD data files") 276 parser.add_argument('-q', '--quiet', action="store_true") 277 options = parser.parse_args() 278 279 level = "WARNING" if options.quiet else "INFO" 280 logging.basicConfig(level=level, format="%(message)s") 281 282 build_ranges("Blocks.txt", local_ucd=options.ucd_path, default="No_Block") 283 284 script_aliases = parse_property_value_aliases("sc", options.ucd_path) 285 build_ranges("Scripts.txt", local_ucd=options.ucd_path, default="Unknown", 286 aliases=script_aliases) 287 build_ranges("ScriptExtensions.txt", local_ucd=options.ucd_path, 288 is_set=True) 289 290 291if __name__ == "__main__": 292 import sys 293 sys.exit(main()) 294