• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python3
2"""
3Tools to parse data files from the Unicode Character Database.
4"""
5
6
7try:
8    from urllib.request import urlopen
9except ImportError:
10    from urllib2 import urlopen
11from contextlib import closing, contextmanager
12import re
13from codecs import iterdecode
14import logging
15import os
16from io import open
17from os.path import abspath, dirname, join as pjoin, pardir, sep
18
19
# Python 2/3 compatibility: Python 3 has no built-in 'unicode' name, so
# alias it to 'str' there.
try:  # pragma: no cover
    unicode
except NameError:
    unicode = str


# Base URL that data file names are appended to when downloading.
UNIDATA_URL = "https://unicode.org/Public/UNIDATA/"
# License URL written into the comment header of each generated module.
UNIDATA_LICENSE_URL = "http://unicode.org/copyright.html#License"

# by default save output files to ../Lib/fontTools/unicodedata/
UNIDATA_PATH = pjoin(abspath(dirname(__file__)), pardir,
                     "Lib", "fontTools", "unicodedata") + sep

# Source encoding declaration, written as the first line of generated modules.
SRC_ENCODING = "# -*- coding: utf-8 -*-\n"

# Comment marking generated modules as auto-generated.
NOTICE = "# NOTE: This file was auto-generated with MetaTools/buildUCD.py.\n"

# Upper bound (inclusive) of the codepoint ranges produced by the parsers.
MAX_UNICODE = 0x10FFFF

# Root logger; main() configures it through logging.basicConfig().
log = logging.getLogger()
40
41
@contextmanager
def open_unidata_file(filename):
    """Context manager that downloads 'filename' from UNIDATA_URL and
    yields an iterator decoding the response as UTF-8.

    The HTTP response is closed when the 'with' block exits, whether
    normally or because of an exception.
    """
    response = urlopen(UNIDATA_URL + filename)
    try:
        yield iterdecode(response, encoding="utf-8")
    finally:
        response.close()
48
49
def parse_unidata_header(infile):
    """Read the top header of data files, until the first line
    that does not start with '#'.

    'infile' is an iterator of text lines (e.g. an open text file). The
    header lines are consumed from it, together with the first line that
    does not start with '#', which is discarded. If the input is empty or
    ends while still inside the header, the header collected so far is
    returned instead of letting StopIteration escape.

    Return the header lines joined into a single string.
    """
    header = []
    for line in infile:
        if not line.startswith("#"):
            # first non-header line: consumed but not part of the header
            break
        header.append(line)
    return "".join(header)
60
61
def parse_range_properties(infile, default=None, is_set=False):
    """Parse a Unicode data file containing a column with one character or
    a range of characters, and another column containing a property value
    separated by a semicolon. Comments after '#' are ignored, and so are
    lines that do not start with a character code.

    If the ranges defined in the data file are not continuous, assign the
    'default' property to the unassigned codepoints.

    If 'is_set' is true, each property value is split on whitespace and
    stored as a set of strings; the 'default' value is used as is.

    Return a list of (start, end, property_name) tuples, sorted by 'start',
    in which adjacent ranges with equal values are merged and which
    together cover every codepoint from 0 to MAX_UNICODE.

    Overlapping or inverted ranges in the input trigger an AssertionError.
    """
    line_regex = re.compile(
        r"^"
        r"([0-9A-F]{4,6})"  # first character code
        r"(?:\.\.([0-9A-F]{4,6}))?"  # optional second character code
        r"\s*;\s*"
        r"([^#]+)")  # everything up to the potential comment

    ranges = []
    for line in infile:
        match = line_regex.match(line)
        if not match:
            continue

        first, last, data = match.groups()
        if last is None:
            # a single codepoint is a range of length one
            last = first

        ranges.append((int(first, 16), int(last, 16), str(data.rstrip())))

    ranges.sort()

    if isinstance(default, unicode):
        default = str(default)

    # fill the gaps between explicitly defined ranges
    last_end = -1
    full_ranges = []
    for start, end, value in ranges:
        assert last_end < start
        assert start <= end
        if start - last_end > 1:
            full_ranges.append((last_end+1, start-1, default))
        if is_set:
            value = set(value.split())
        full_ranges.append((start, end, value))
        last_end = end
    if last_end != MAX_UNICODE:
        full_ranges.append((last_end+1, MAX_UNICODE, default))

    # reduce total number of ranges by combining continuous ones;
    # full_ranges has no gaps, so a run ends right before the start of the
    # next range carrying a different value
    run_start, _, run_value = full_ranges[0]
    merged_ranges = []
    for start, _, value in full_ranges[1:]:
        if value != run_value:
            merged_ranges.append((run_start, start-1, run_value))
            run_start, run_value = start, value
    merged_ranges.append((run_start, MAX_UNICODE, run_value))

    # make sure that the ranges cover the full unicode repertoire
    assert merged_ranges[0][0] == 0
    for (cs, ce, cv), (ns, ne, nv) in zip(merged_ranges, merged_ranges[1:]):
        assert ce+1 == ns
    assert merged_ranges[-1][1] == MAX_UNICODE

    return merged_ranges
132
133
def parse_semicolon_separated_data(infile):
    """Parse a Unicode data file in which every line holds a list of
    values separated by semicolons (e.g. "PropertyValueAliases.txt").
    Lines may contain different numbers of values.

    Anything following a '#' is treated as a comment and dropped; lines
    that are empty once the comment is removed are skipped.

    Returns a list of lists, each containing the stripped values of one
    line as strings.
    """
    rows = []
    for raw_line in infile:
        # keep only the part before the first '#'
        content = raw_line.partition('#')[0].strip()
        if content:
            rows.append([str(cell.strip()) for cell in content.split(';')])
    return rows
149
150
151def _set_repr(value):
152    return 'None' if value is None else "{{{}}}".format(
153        ", ".join(repr(v) for v in sorted(value)))
154
155
def build_ranges(filename, local_ucd=None, output_path=None,
                 default=None, is_set=False, aliases=None):
    """Load the 'filename' UCD data file, parse its property ranges and
    values, and write them out as a Python module containing two parallel
    lists: 'RANGES' (start codepoint of each range) and 'VALUES' (the
    property value of each range).

    The data file is downloaded from UNIDATA_URL unless 'local_ucd' is
    given, in which case it is read from that local directory.

    The module is written to 'output_path'; when that is not provided, it
    is saved in UNIDATA_PATH under the name of 'filename' with its
    extension replaced by '.py'.

    'default' and 'is_set' are passed on to parse_range_properties().

    'aliases' is an optional mapping of property codes (short names) to
    long name aliases (list of strings, with the first item being the
    preferred alias). When this is provided, the property values are
    written using the short notation, and an additional 'NAMES' dict
    mapping each short code to its preferred alias is written to the
    output module.
    """
    module_file = os.path.splitext(filename)[0] + ".py"
    output_path = output_path or (UNIDATA_PATH + module_file)

    if local_ucd:
        log.info("loading '%s' from local directory '%s'", filename, local_ucd)
        source = open(pjoin(local_ucd, filename), "r", encoding="utf-8")
    else:
        log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
        source = open_unidata_file(filename)

    with source as infile:
        header = parse_unidata_header(infile)
        ranges = parse_range_properties(
            infile, default=default, is_set=is_set)

    if aliases:
        # look up the short code from the normalized preferred long name
        short_codes = {}
        for code, names in aliases.items():
            short_codes[normalize(names[0])] = code
        column_width = 6  # 4-letter tags plus two quotes for repr
    else:
        column_width = min(56, max(len(repr(v)) for _, _, v in ranges))

    with open(output_path, "w", encoding="utf-8") as outfile:
        write = outfile.write

        # comment header of the generated module
        write(SRC_ENCODING)
        write("#\n")
        write(NOTICE)
        write("# Source: {}{}\n".format(UNIDATA_URL, filename))
        write("# License: {}\n".format(UNIDATA_LICENSE_URL))
        write("#\n")
        write(header+"\n\n")

        write("RANGES = [\n")
        for first, last, value in ranges:
            shown = _set_repr(value) if is_set else value
            write("    0x{:0>4X},  # .. 0x{:0>4X} ; {}\n".format(
                first, last, shown))
        write("]\n")

        write("\n")
        write("VALUES = [\n")
        for first, last, value in ranges:
            comment = "# {:0>4X}..{:0>4X}".format(first, last)
            if is_set:
                literal = _set_repr(value) + ","
            elif aliases:
                # append long name to comment and use the short code
                comment += " ; {}".format(value)
                literal = "{!r},".format(short_codes[normalize(value)])
            else:
                literal = "{!r},".format(value)
            write("    {}  {}\n".format(
                literal.ljust(column_width+1), comment))
        write("]\n")

        if aliases:
            write("\n")
            write("NAMES = {\n")
            for code, names in sorted(aliases.items()):
                # we only write the first preferred alias
                write("    {!r}: {!r},\n".format(code, names[0]))
            write("}\n")

    log.info("saved new file: '%s'", os.path.normpath(output_path))
232
233
# Matches runs of the characters ignored by normalize(): '-', '_' and space.
_normalize_re = re.compile(r"[-_ ]+")


def normalize(string):
    """Return 'string' lowercased and with every space, '-' and '_'
    removed, for loose matching of names.
    """
    without_separators = _normalize_re.sub("", string)
    return without_separators.lower()
239
240
def parse_property_value_aliases(property_tag, local_ucd=None):
    """Load 'PropertyValueAliases.txt', keep the entries whose first field
    equals 'property_tag', and return a dictionary of name aliases (list
    of strings) keyed by short value codes (strings).

    The file is downloaded from the Unicode website unless 'local_ucd' is
    given, in which case it is read from that local directory.
    """
    filename = "PropertyValueAliases.txt"
    if local_ucd:
        log.info("loading '%s' from local directory '%s'", filename, local_ucd)
        source = open(pjoin(local_ucd, filename), "r", encoding="utf-8")
    else:
        log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
        source = open_unidata_file(filename)

    with source as infile:
        # the header text itself is not needed, only skipped over
        parse_unidata_header(infile)
        rows = parse_semicolon_separated_data(infile)

    aliases = {}
    for row in rows:
        if row[0] == property_tag:
            # fields: property tag; short code; one or more long names
            aliases[row[1]] = row[2:]
    return aliases
265
266
def main():
    """Command-line entry point: parse the options, configure logging and
    regenerate the modules for Blocks.txt, Scripts.txt and
    ScriptExtensions.txt.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Generate fontTools.unicodedata from UCD data files")
    parser.add_argument(
        '--ucd-path', help="Path to local folder containing UCD data files")
    parser.add_argument('-q', '--quiet', action="store_true")
    options = parser.parse_args()
    ucd_path = options.ucd_path

    if options.quiet:
        level = "WARNING"
    else:
        level = "INFO"
    logging.basicConfig(level=level, format="%(message)s")

    build_ranges("Blocks.txt", local_ucd=ucd_path, default="No_Block")

    # script values are written as short codes, so load their aliases first
    script_aliases = parse_property_value_aliases("sc", ucd_path)
    build_ranges("Scripts.txt", local_ucd=ucd_path, default="Unknown",
                 aliases=script_aliases)

    build_ranges("ScriptExtensions.txt", local_ucd=ucd_path, is_set=True)
287
288
# Run main() only when this file is executed as a script; whatever main()
# returns is handed to sys.exit().
if __name__ == "__main__":
    import sys
    sys.exit(main())
292