#!/usr/bin/python -B
# -*- coding: utf-8 -*-
# © 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2009-2016 International Business Machines
# Corporation and others. All Rights Reserved.
#
#   file name:  preparseucd.py
#   encoding:   US-ASCII
#   tab size:   8 (not used)
#   indentation:4
#
#   created on: 2011nov03 (forked from ucdcopy.py)
#   created by: Markus W. Scherer
#
# Copies Unicode Character Database (UCD) files from a tree
# of files downloaded from (for example) ftp://www.unicode.org/Public/6.1.0/
# to ICU's source/data/unidata/ and source/test/testdata/
# and modifies some of the files to make them more compact.
# Parses them and writes unidata/ppucd.txt (PreParsed UCD) with simple syntax.
#
# Invoke with two command-line parameters:
# 1. source folder with UCD & idna files
# 2. ICU source root folder (ICU 59+ combined trunk with icu4c, icu4j, tools)
#
# Sample invocation:
#   ~/svn.icu/tools/trunk/src/unicode$ py/preparseucd.py ~/uni61/20120118 ~/svn.icu/trunk/src

import array
import bisect
import codecs
import os
import os.path
import re
import shutil
import sys

# Unicode version ---------------------------------------------------------- ***

_ucd_version = "?"

# ISO 15924 script codes --------------------------------------------------- ***

# Script codes from ISO 15924 http://www.unicode.org/iso15924/codechanges.html
# that are not yet in the UCD.
_scripts_only_in_iso15924 = (
    "Afak", "Blis", "Cirt", "Cyrs",
    "Egyd", "Egyh", "Geok",
    "Hanb", "Hans", "Hant",
    "Inds", "Jamo", "Jpan", "Jurc", "Kore", "Kpel", "Latf", "Latg", "Loma",
    "Maya", "Moon", "Nkgb", "Phlv", "Roro",
    "Sara", "Syre", "Syrj", "Syrn",
    "Teng", "Visp", "Wole", "Zmth", "Zsye", "Zsym", "Zxxx"
)

# Properties --------------------------------------------------------------- ***

# Properties that we do not want to store in ppucd.txt.
# Not a frozenset so that we can add aliases for simpler subsequent testing.
_ignored_properties = set((
  # Other_Xyz only contribute to Xyz, store only the latter.
  "OAlpha",
  "ODI",
  "OGr_Ext",
  "OIDC",
  "OIDS",
  "OLower",
  "OMath",
  "OUpper",
  # Further properties that just contribute to others.
  "CE",  # Composition_Exclusion just contributes to Full_Composition_Exclusion.
  "JSN",
  # These properties just don't seem useful.
  # They are deprecated since Unicode 6.0.
  "XO_NFC",
  "XO_NFD",
  "XO_NFKC",
  "XO_NFKD",
  # ICU does not use Unihan properties.
  "cjkAccountingNumeric",
  "cjkOtherNumeric",
  "cjkPrimaryNumeric",
  "cjkCompatibilityVariant",
  "cjkIICore",
  "cjkIRG_GSource",
  "cjkIRG_HSource",
  "cjkIRG_JSource",
  "cjkIRG_KPSource",
  "cjkIRG_KSource",
  "cjkIRG_MSource",
  "cjkIRG_SSource",
  "cjkIRG_TSource",
  "cjkIRG_UKSource",
  "cjkIRG_USource",
  "cjkIRG_VSource",
  "cjkRSUnicode"
))

# These properties (short names) map code points to
# strings or other unusual values (property types String or Miscellaneous)
# that cannot be block-compressed (or would be confusing).
_uncompressible_props = frozenset((
  "bmg", "bpb", "cf", "Conditional_Case_Mappings", "dm", "FC_NFKC",
  "isc", "lc", "na", "na1", "Name_Alias", "NFKC_CF",
  # scx is block-compressible.
  "scf", "slc", "stc", "suc", "tc", "Turkic_Case_Folding", "uc"
))

# Dictionary of properties.
# Keyed by normalized property names and aliases.
# Each value is a tuple with
# 0: Type of property (binary, enum, ...)
# 1: List of aliases; short & long name followed by other aliases.
#    The short name is "" if it is listed as "n/a" in PropertyValueAliases.txt.
# 2: Set of short property value names.
# 3: Dictionary of property values.
#    For Catalog & Enumerated properties,
#    maps each value name to a list of aliases.
#    Empty for other types of properties.
_properties = {}
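
# Illustrative sketch of one _properties entry (hypothetical, abridged),
# as it might look after PropertyAliases.txt and PropertyValueAliases.txt
# have been parsed:
#   _properties["gc"] == ("Enumerated", ["gc", "General_Category"],
#                         {"Lu", "Ll", ...},
#                         {"Lu": ["Lu", "Uppercase_Letter"], ...})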

# Dictionary of binary-property values which we store as False/True.
# Same as the values dictionary of one of the binary properties.
_binary_values = {}

# Dictionary of null values.
# Keyed by short property names.
# These are type-specific values for properties that occur in the data.
# They are overridden by _defaults, block and code point properties.
_null_values = {}

# Property value names for null values.
# We do not store these in _defaults.
_null_names = frozenset(("<none>", "NaN"))

# Dictionary of explicit default property values.
# Keyed by short property names.
_defaults = {"gc": "Cn"}

# _null_values overridden by explicit _defaults.
# Initialized after parsing is done.
_null_or_defaults = {}

# List of properties with an ICU UProperty enum.
# Each item is an (enum, pname, values) tuple.
# - enum: the ICU enum UProperty constant string
# - pname: the UCD short property name
# - values: list of (enum, vname) pairs per property value
#   - enum: the ICU property value's enum constant string
#   - vname: the UCD short property value name
_icu_properties = []
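
# Illustrative sketch of one item (abridged; a hypothetical example modeled
# on real ICU constants):
#   ("UCHAR_EAST_ASIAN_WIDTH", "ea",
#    [("U_EA_NEUTRAL", "N"), ("U_EA_WIDE", "W"), ...])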

# Dictionary of short property names mapped to _icu_properties items.
_pname_to_icu_prop = {}

_non_alnum_re = re.compile("[^a-zA-Z0-9]")

def NormPropName(pname):
  """Returns a normalized form of pname.
  Removes non-ASCII-alphanumeric characters and lowercases letters."""
  return _non_alnum_re.sub("", pname).lower()


def GetProperty(pname):
  """Returns the _properties value for the pname.
  Returns None if the property is ignored.
  Caches alternate spellings of the property name."""
  # Try the input name.
  prop = _properties.get(pname)
  if prop != None: return prop
  if pname in _ignored_properties: return None
  # Try the normalized input name.
  norm_name = NormPropName(pname)
  prop = _properties.get(norm_name)
  if prop != None:
    _properties[pname] = prop  # Cache prop under this new name spelling.
    return prop
  elif norm_name in _ignored_properties:
    _ignored_properties.add(pname)  # Remember to ignore this new name spelling.
    return None
  else:
    raise NameError("unknown property %s\n" % pname)


def GetShortPropertyName(pname):
  if pname in _null_values: return pname  # pname is already the short name.
  prop = GetProperty(pname)
  if not prop: return ""  # For ignored properties.
  return prop[1][0] or prop[1][1]  # Long name if no short name.


def GetShortPropertyValueName(prop, vname):
  if vname in prop[2]: return vname
  values = prop[3]
  aliases = values.get(vname)
  if aliases == None:
    norm_name = NormPropName(vname)
    aliases = values.get(norm_name)
    if aliases == None:
      raise NameError("unknown value name %s for property %s\n" %
                      (vname, prop[1][0]))
    values[vname] = aliases
  return aliases[0] or aliases[1]  # Long name if no short name.


def NormalizePropertyValue(prop, vname):
  if prop[2]:  # Binary/Catalog/Enumerated property.
    value = GetShortPropertyValueName(prop, vname)
    if prop[0] == "Binary":
      value = value == "Y"
    if prop[1][0].endswith("ccc"):
      value = int(value)
  else:
    value = vname
  return value
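
# For example, assuming the usual PropertyAliases.txt and
# PropertyValueAliases.txt data has been parsed:
#   NormalizePropertyValue(_properties["Alpha"], "Yes") -> True
#   NormalizePropertyValue(_properties["ccc"], "Above") -> 230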

# Character data ----------------------------------------------------------- ***

# Lists of NamesList h1 and h2 headings.
# Each h1 value is a (start, end, comment) tuple.
# Each h2 value is a (cp, comment) tuple.
_h1 = []
_h2 = []

# List of Unicode blocks.
# Each item is a tuple of start & end code point integers
# and a dictionary of default property values.
_blocks = []

# List of ranges with algorithmic names.
# Each value is a list of [start, end, type, prefix]
# where prefix is optional.
_alg_names_ranges = []

# List of Unicode character ranges and their properties,
# stored as an inversion map with range_start & props dictionary.
# Starts with one range for all of Unicode without any properties.
# Setting values subdivides ranges.
_starts = array.array('l', [0, 0x110000])  # array of int32_t
_props = [{}, {}]  # props for 0 and 110000
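
# Illustrative example of how setting values subdivides ranges:
# starting from the initial two entries above, a call like
#   SetProps(0x41, 0x5A, {"gc": "Lu"})
# (defined below) would leave
#   _starts == array.array('l', [0, 0x41, 0x5B, 0x110000])
#   _props  == [{}, {"gc": "Lu"}, {}, {}]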

def FindRange(x):
  """Binary search for x in the inversion map.
  Returns the index of the range that contains x,
  i.e., the largest i such that _starts[i] <= x."""
  return bisect.bisect(_starts, x) - 1
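
# For example, with _starts == array.array('l', [0, 0x41, 0x5B, 0x110000]):
#   FindRange(0x40) == 0; FindRange(0x41) == 1; FindRange(0x5A) == 1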


def GetProps(c):
  i = FindRange(c)
  return _props[i]


def UpdateProps(start, end, update):
  assert 0 <= start <= end <= 0x10ffff
  (need_to_update, do_update, u) = (update[0], update[1], update[2])
  # Find the index i of the range in _starts that contains start.
  i = FindRange(start)
  limit = end + 1
  # Intersect [start, limit[ with ranges in _starts.
  c_start = _starts[i]
  c_limit = _starts[i + 1]
  c_props = _props[i]
  # c_start <= start < c_limit
  if c_start < start:
    update_limit = c_limit if c_limit <= limit else limit
    if need_to_update(u, start, update_limit - 1, c_props):
      # Split off [c_start, start[ with a copy of c_props.
      i += 1
      c_props = c_props.copy()
      _starts.insert(i, start)
      _props.insert(i, c_props)
      c_start = start
  # Modify all ranges that are fully inside [start, limit[.
  while c_limit <= limit:
    # start <= c_start < c_limit <= limit
    if need_to_update(u, c_start, c_limit - 1, c_props):
      do_update(u, c_start, c_limit - 1, c_props)
    if c_limit == 0x110000: return
    i += 1
    c_start = c_limit
    c_limit = _starts[i + 1]
    c_props = _props[i]
  if c_start < limit and need_to_update(u, c_start, limit - 1, c_props):
    # Split off [limit, c_limit[ with a copy of c_props.
    _starts.insert(i + 1, limit)
    _props.insert(i + 1, c_props.copy())
    # Modify [c_start, limit[ c_props.
    do_update(u, c_start, limit - 1, c_props)


def NeedToSetProps(props, start, end, c_props):
  """Returns True if props is not a sub-dict of c_props."""
  for (pname, value) in props.items():
    if pname not in c_props or value != c_props[pname]: return True
  return False


def DoSetProps(props, start, end, c_props):
  c_props.update(props)


def SetProps(start, end, props):
  UpdateProps(start, end, (NeedToSetProps, DoSetProps, props))


def NeedToSetAlways(nv, start, end, c_props):
  return True


# For restoring boundaries after merging adjacent same-props ranges.
def AddBoundary(x):
  """Ensure that there is a range start/limit at x."""
  assert 0 <= x <= 0x10ffff
  i = FindRange(x)
  if _starts[i] == x: return
  # Split the range at x.
  c_start = _starts[i]
  c_limit = _starts[i + 1]
  c_props = _props[i]
  # c_start < x < c_limit
  i += 1
  _starts.insert(i, x)
  _props.insert(i, c_props.copy())


def SetDefaultValue(pname, value):
  """Sets the property's default value. Ignores null values."""
  prop = GetProperty(pname)
  if prop and value not in _null_names:
    value = NormalizePropertyValue(prop, value)
    if value != _null_values[prop[1][0]]:
      _defaults[prop[1][0]] = value
      SetProps(0, 0x10ffff, {prop[1][0]: value})


def SetBinaryPropertyToTrue(pname, start, end):
  prop = GetProperty(pname)
  if prop:
    assert prop[0] == "Binary"
    SetProps(start, end, {prop[1][0]: True})


def SetPropValue(prop, vname, start, end):
  value = NormalizePropertyValue(prop, vname)
  SetProps(start, end, {prop[1][0]: value})


def SetPropertyValue(pname, vname, start, end):
  prop = GetProperty(pname)
  if prop: SetPropValue(prop, vname, start, end)

# Parsing ------------------------------------------------------------------ ***

_stripped_cp_re = re.compile(r"([0-9a-fA-F]+)$")
_stripped_range_re = re.compile(r"([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)$")
_missing_re = re.compile(r"# *@missing: *0000\.\.10FFFF *; *(.+)$")

def ReadUCDLines(in_file, want_ranges=True, want_other=False,
                 want_comments=False, want_missing=False):
  """Parses lines from a semicolon-delimited UCD text file.
  Strips comments, ignores empty and all-comment lines.
  Yields tuples (type, line, ...).
  """
  for line in in_file:
    line = line.strip()
    if not line: continue
    if line.startswith("#"):  # whole-line comment
      if want_missing:
        match = _missing_re.match(line)
        if match:
          fields = match.group(1).split(";")
          for i in range(len(fields)): fields[i] = fields[i].strip()
          yield ("missing", line, fields)
          continue
      if want_comments: yield ("comment", line)
      continue
    comment_start = line.find("#")  # inline comment
    if comment_start >= 0:
      line = line[:comment_start].rstrip()
      if not line: continue
    fields = line.split(";")
    for i in range(len(fields)): fields[i] = fields[i].strip()
    if want_ranges:
      first = fields[0]
      match = _stripped_range_re.match(first)
      if match:
        start = int(match.group(1), 16)
        end = int(match.group(2), 16)
        yield ("range", line, start, end, fields)
        continue
      match = _stripped_cp_re.match(first)
      if match:
        c = int(match.group(1), 16)
        yield ("range", line, c, c, fields)
        continue
    if want_other:
      yield ("other", line, fields)
    else:
      raise SyntaxError("unable to parse line\n  %s\n" % line)
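
# For example (illustrative data), a line such as
#   "0061..007A    ; Alphabetic # comment"
# yields
#   ("range", "0061..007A    ; Alphabetic", 0x61, 0x7A,
#    ["0061..007A", "Alphabetic"])
# and, with want_missing=True, an @missing comment line yields
#   ("missing", line, fields).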


def AddBinaryProperty(short_name, long_name):
  _null_values[short_name] = False
  bin_prop = _properties["Math"]
  prop = ("Binary", [short_name, long_name], bin_prop[2], bin_prop[3])
  _properties[short_name] = prop
  _properties[long_name] = prop
  _properties[NormPropName(short_name)] = prop
  _properties[NormPropName(long_name)] = prop


def AddPOSIXBinaryProperty(name):
  # We only define a long name for ICU-specific (non-UCD) POSIX properties.
  _null_values[name] = False
  bin_prop = _properties["Math"]
  prop = ("Binary", ["", name], bin_prop[2], bin_prop[3])
  _properties[name] = prop
  _properties[NormPropName(name)] = prop
  # This is to match UProperty UCHAR_POSIX_ALNUM etc.
  _properties["posix" + NormPropName(name)] = prop


# Match a comment line like
# PropertyAliases-6.1.0.txt
# and extract the Unicode version.
_ucd_version_re = re.compile("# *PropertyAliases" +
                             "-([0-9]+(?:\\.[0-9]+)*)(?:d[0-9]+)?" +
                             "\\.txt")

def ParsePropertyAliases(in_file):
  global _ucd_version
  prop_type_nulls = {
    "Binary": False,
    "Catalog": "??",  # Must be specified, e.g., in @missing line.
    "Enumerated": "??",  # Must be specified.
    "Numeric": "NaN",
    "String": "",
    "Miscellaneous": ""
  }
  for data in ReadUCDLines(in_file, want_ranges=False,
                           want_other=True, want_comments=True):
    if data[0] == "comment":
      line = data[1]
      match = _ucd_version_re.match(line)
      if match:
        _ucd_version = match.group(1)
      else:
        words = line[1:].lstrip().split()
        if len(words) == 2 and words[1] == "Properties":
          prop_type = words[0]
          null_value = prop_type_nulls[prop_type]
    else:
      # type == "other"
      aliases = data[2]
      name = aliases[0]
      if name in _ignored_properties:
        for alias in aliases:
          _ignored_properties.add(alias)
          _ignored_properties.add(NormPropName(alias))
      else:
        if name.endswith("ccc"):
          _null_values[name] = 0
        else:
          _null_values[name] = null_value
        prop = (prop_type, aliases, set(), {})
        for alias in aliases:
          _properties[alias] = prop
          _properties[NormPropName(alias)] = prop
  # Add provisional and ICU-specific properties we need.
  # We add some in support of runtime API, even if we do not write
  # data for them to ppucd.txt (e.g., lccc & tccc).
  # We add others just to represent UCD data that contributes to
  # some functionality, although Unicode has not "blessed" them
  # as separate properties (e.g., Turkic_Case_Folding).

  # Turkic_Case_Folding: The 'T' mappings in CaseFolding.txt.
  name = "Turkic_Case_Folding"
  _null_values[name] = ""
  prop = ("String", [name, name], set(), {})
  _properties[name] = prop
  _properties[NormPropName(name)] = prop
  # Conditional_Case_Mappings: SpecialCasing.txt lines with conditions.
  name = "Conditional_Case_Mappings"
  _null_values[name] = ""
  prop = ("Miscellaneous", [name, name], set(), {})
  _properties[name] = prop
  _properties[NormPropName(name)] = prop
  # lccc = ccc of first cp in canonical decomposition.
  _null_values["lccc"] = 0
  ccc_prop = list(_properties["ccc"])
  ccc_prop[1] = ["lccc", "Lead_Canonical_Combining_Class"]
  prop = tuple(ccc_prop)
  _properties["lccc"] = prop
  _properties["Lead_Canonical_Combining_Class"] = prop
  _properties["leadcanonicalcombiningclass"] = prop
  # tccc = ccc of last cp in canonical decomposition.
  _null_values["tccc"] = 0
  ccc_prop[1] = ["tccc", "Trail_Canonical_Combining_Class"]
  prop = tuple(ccc_prop)
  _properties["tccc"] = prop
  _properties["Trail_Canonical_Combining_Class"] = prop
  _properties["trailcanonicalcombiningclass"] = prop
  # Script_Extensions
  if "scx" not in _properties:
    _null_values["scx"] = ""
    prop = ("Miscellaneous", ["scx", "Script_Extensions"], set(), {})
    _properties["scx"] = prop
    _properties["Script_Extensions"] = prop
    _properties["scriptextensions"] = prop
  # General Category as a bit mask.
  _null_values["gcm"] = "??"
  gc_prop = _properties["gc"]
  prop = ("Bitmask", ["gcm", "General_Category_Mask"], gc_prop[2], gc_prop[3])
  _properties["gcm"] = prop
  _properties["General_Category_Mask"] = prop
  _properties["generalcategorymask"] = prop
  # Various binary properties.
  AddBinaryProperty("Sensitive", "Case_Sensitive")
  AddBinaryProperty("nfdinert", "NFD_Inert")
  AddBinaryProperty("nfkdinert", "NFKD_Inert")
  AddBinaryProperty("nfcinert", "NFC_Inert")
  AddBinaryProperty("nfkcinert", "NFKC_Inert")
  AddBinaryProperty("segstart", "Segment_Starter")
  # http://www.unicode.org/reports/tr51/#Emoji_Properties
  AddBinaryProperty("Emoji", "Emoji")
  AddBinaryProperty("EPres", "Emoji_Presentation")
  AddBinaryProperty("EMod", "Emoji_Modifier")
  AddBinaryProperty("EBase", "Emoji_Modifier_Base")
  AddBinaryProperty("EComp", "Emoji_Component")
  AddBinaryProperty("ExtPict", "Extended_Pictographic")
  # C/POSIX character classes that do not have Unicode property [value] aliases.
  # See uchar.h.
  AddPOSIXBinaryProperty("alnum")
  AddPOSIXBinaryProperty("blank")
  AddPOSIXBinaryProperty("graph")
  AddPOSIXBinaryProperty("print")
  AddPOSIXBinaryProperty("xdigit")


def ParsePropertyValueAliases(in_file):
  global _binary_values
  for data in ReadUCDLines(in_file, want_ranges=False,
                           want_other=True, want_missing=True):
    if data[0] == "missing":
      SetDefaultValue(data[2][0], data[2][1])
    else:
      # type == "other"
      fields = data[2]
      pname = fields[0]
      prop = GetProperty(pname)
      if prop:
        del fields[0]  # Only the list of aliases remains.
        short_name = fields[0]
        if short_name == "n/a":  # no short name
          fields[0] = ""
          short_name = fields[1]
        prop[2].add(short_name)
        values = prop[3]
        for alias in fields:
          if alias:
            values[alias] = fields
            values[NormPropName(alias)] = fields
        if prop[0] == "Binary" and not _binary_values:
          _binary_values = values
  # Some of the @missing lines with non-null default property values
  # are in files that we do not parse,
  # either because the data for that property is easily derived
  # from files that we do parse
  # (i.e., the @missing line would be the only reason to parse such a file)
  # or because we compute the property at runtime,
  # such as the Hangul_Syllable_Type.
  if "dt" not in _defaults:  # DerivedDecompositionType.txt
    _defaults["dt"] = "None"
  if "nt" not in _defaults:  # DerivedNumericType.txt
    _defaults["nt"] = "None"
  if "hst" not in _defaults:  # HangulSyllableType.txt
    _defaults["hst"] = "NA"
  if "gc" not in _defaults:  # No @missing line in any .txt file?
    _defaults["gc"] = "Cn"
  # Copy the gc default value to gcm.
  _defaults["gcm"] = _defaults["gc"]
  # Add ISO 15924-only script codes.
  # Only for the ICU script code API, not necessary for parsing the UCD.
  script_prop = _properties["sc"]
  short_script_names = script_prop[2]  # set
  script_values = script_prop[3]  # dict
  remove_scripts = []
  for script in _scripts_only_in_iso15924:
    if script in short_script_names:
      remove_scripts.append(script)
    else:
      short_script_names.add(script)
      # Do not invent a Unicode long script name before the UCD adds the script.
      script_list = [script, script]  # [short, long]
      script_values[script] = script_list
      # Probably not necessary because
      # we will not parse these scripts from the UCD:
      script_values[NormPropName(script)] = script_list
  if remove_scripts:
    raise ValueError(
        "remove %s from _scripts_only_in_iso15924" % remove_scripts)


def ParseBlocks(in_file):
  for data in ReadUCDLines(in_file, want_missing=True):
    if data[0] == "missing":
      SetDefaultValue("blk", data[2][0])
    else:
      # type == "range"
      (start, end, name) = (data[2], data[3], data[4][1])
      _blocks.append((start, end, {"blk": name}))
      SetPropertyValue("blk", name, start, end)
  _blocks.sort()
  # Check for overlapping blocks.
  prev_end = -1
  for b in _blocks:
    start = b[0]
    end = b[1]
    if prev_end >= start:
      raise ValueError(
          ("block %04lX..%04lX %s overlaps with another "
           "ending at %04lX") %
          (start, end, b[2]["blk"], prev_end))
    prev_end = end


def ParseUnicodeData(in_file):
  dt_prop = GetProperty("dt")
  range_first_line = ""
  range_first = -1
  for data in ReadUCDLines(in_file, want_missing=True):
    # type == "range"
    (line, c, end, fields) = (data[1], data[2], data[3], data[4])
    assert c == end
    name = fields[1]
    if name.startswith("<"):
      if name.endswith(", First>"):
        if range_first >= 0:
          raise SyntaxError(
              "error: unterminated range started at\n  %s\n" %
              range_first_line)
        range_first = c
        range_first_line = line
        continue
      elif name.endswith(", Last>"):
        if range_first < 0:
          raise SyntaxError(
              "error: range end without start at\n  %s\n" %
              line)
        elif range_first > c:
          raise SyntaxError(
              "error: range start/end out of order at\n  %s\n  %s\n" %
              (range_first_line, line))
        first_name = range_first_line.split(";")[1][1:-8]
        name = name[1:-7]
        if first_name != name:
          raise SyntaxError(
              "error: range start/end name mismatch at\n  %s\n  %s\n" %
              (range_first_line, line))
        end = c
        c = range_first
        range_first = -1
        # Remember algorithmic name ranges.
        if "Ideograph" in name:
          prefix = "CJK UNIFIED IDEOGRAPH-"
          if c == 0x17000 or c == 0x18D00: prefix = "TANGUT IDEOGRAPH-"
          _alg_names_ranges.append([c, end, "han", prefix])
        elif name == "Hangul Syllable":
          _alg_names_ranges.append([c, end, "hangul"])
        name = ""
      else:
        # Ignore non-names like <control>.
        name = ""
    props = {}
    if name: props["na"] = name
    props["gc"] = fields[2]
    ccc = int(fields[3])
    if ccc: props["ccc"] = ccc
    props["bc"] = fields[4]
    # Decomposition type & mapping.
    dm = fields[5]
    if dm:
      if dm.startswith("<"):
        dt_limit = dm.index(">")
        dt = NormalizePropertyValue(dt_prop, dm[1:dt_limit])
        dm = dm[dt_limit + 1:].lstrip()
      else:
        dt = "Can"
      props["dt"] = dt
      props["dm"] = dm
    # Numeric type & value.
    decimal = fields[6]
    digit = fields[7]
    nv = fields[8]
    if (decimal and decimal != nv) or (digit and digit != nv):
      raise SyntaxError("error: numeric values differ at\n  %s\n" % line)
    if nv:
      # Map improper fractions to proper ones.
      # U+109F7 MEROITIC CURSIVE FRACTION TWO TWELFTHS
      # .. U+109FF MEROITIC CURSIVE FRACTION TEN TWELFTHS
      if nv == "2/12":
        nv = "1/6"
      elif nv == "3/12":
        nv = "1/4"
      elif nv == "4/12":
        nv = "1/3"
      elif nv == "6/12":
        nv = "1/2"
      elif nv == "8/12":
        nv = "2/3"
      elif nv == "9/12":
        nv = "3/4"
      elif nv == "10/12":
        nv = "5/6"
      props["nv"] = nv
      props["nt"] = "De" if decimal else "Di" if digit else "Nu"
    if fields[9] == "Y": props["Bidi_M"] = True
    # ICU 49 and above does not support Unicode_1_Name any more.
    # See ticket #9013.
    # na1 = fields[10]
    # if na1: props["na1"] = na1
    # ISO_Comment is deprecated and has no values.
    # isc = fields[11]
    # if isc: props["isc"] = isc
    # Simple case mappings.
    suc = fields[12]
    slc = fields[13]
    stc = fields[14]
    if suc: props["suc"] = suc
    if slc: props["slc"] = slc
    if stc: props["stc"] = stc
    SetProps(c, end, props)
  if range_first >= 0:
    raise SyntaxError(
        "error: unterminated range started at\n  %s\n" %
        range_first_line)
  # Hangul syllables have canonical decompositions which are not listed in
  # UnicodeData.txt.
  SetPropertyValue("dt", "Can", 0xac00, 0xd7a3)
  _alg_names_ranges.sort()


_names_h1_re = re.compile("@@\t([0-9a-fA-F]+)\t(.+?)\t([0-9a-fA-F]+)$")
_names_h2_re = re.compile("@\t\t(.+)")
_names_char_re = re.compile("([0-9a-fA-F]+)\t.+")

def ParseNamesList(in_file):
  pending_h2 = ""
  for line in in_file:
    line = line.strip()
    if not line: continue
    match = _names_h1_re.match(line)
    if match:
      pending_h2 = ""  # Drop a pending h2 when we get to an h1.
      start = int(match.group(1), 16)
      end = int(match.group(3), 16)
      comment = match.group(2).replace(u"\xa0", " ")
      _h1.append((start, end, comment))
      continue
    match = _names_h2_re.match(line)
    if match:
      pending_h2 = match.group(1).replace(u"\xa0", " ")
      continue
    if pending_h2:
      match = _names_char_re.match(line)
      if match:
        c = int(match.group(1), 16)
        _h2.append((c, pending_h2))
        pending_h2 = ""
  _h1.sort()
  _h2.sort()


def ParseNamedProperties(in_file):
  """Parses a .txt file where the first column is a code point range
  and the second column is a property name.
  Sets binary properties to True,
  and other properties to the values in the third column."""
  for data in ReadUCDLines(in_file, want_missing=True):
    if data[0] == "missing":
      SetDefaultValue(data[2][0], data[2][1])
    else:
      # type == "range"
      if len(data[4]) == 2:
        SetBinaryPropertyToTrue(data[4][1], data[2], data[3])
      else:
        SetPropertyValue(data[4][1], data[4][2], data[2], data[3])


def ParseOneProperty(in_file, pname):
  """Parses a .txt file where the first column is a code point range
  and the second column is the value of a known property."""
  prop = GetProperty(pname)
  for data in ReadUCDLines(in_file, want_missing=True):
    if data[0] == "missing":
      SetDefaultValue(pname, data[2][0])
    else:
      # type == "range"
      SetPropValue(prop, data[4][1], data[2], data[3])


def ParseBidiMirroring(in_file): ParseOneProperty(in_file, "bmg")
def ParseDerivedAge(in_file): ParseOneProperty(in_file, "age")
def ParseDerivedBidiClass(in_file): ParseOneProperty(in_file, "bc")
def ParseDerivedJoiningGroup(in_file): ParseOneProperty(in_file, "jg")
def ParseDerivedJoiningType(in_file): ParseOneProperty(in_file, "jt")
def ParseEastAsianWidth(in_file): ParseOneProperty(in_file, "ea")
def ParseGraphemeBreakProperty(in_file): ParseOneProperty(in_file, "GCB")
def ParseIndicPositionalCategory(in_file): ParseOneProperty(in_file, "InPC")
def ParseIndicSyllabicCategory(in_file): ParseOneProperty(in_file, "InSC")
def ParseLineBreak(in_file): ParseOneProperty(in_file, "lb")
def ParseScripts(in_file): ParseOneProperty(in_file, "sc")
def ParseScriptExtensions(in_file): ParseOneProperty(in_file, "scx")
def ParseSentenceBreak(in_file): ParseOneProperty(in_file, "SB")
def ParseVerticalOrientation(in_file): ParseOneProperty(in_file, "vo")
def ParseWordBreak(in_file): ParseOneProperty(in_file, "WB")


def DoSetNameAlias(alias, start, end, c_props):
  if "Name_Alias" in c_props:
    c_props["Name_Alias"] += ',' + alias
  else:
    c_props["Name_Alias"] = alias


def ParseNameAliases(in_file):
  """Parses Name_Alias from NameAliases.txt.
  A character can have multiple aliases.

  In Unicode 6.0, there are two columns,
  with a name correction in the second column.

  In Unicode 6.1, there are three columns.
  The second contains an alias, the third its type.
  The documented types are:
    correction, control, alternate, figment, abbreviation

  This function does not sort the types, assuming they appear in this order."""
  for data in ReadUCDLines(in_file):
    start = data[2]
    end = data[3]
    if start != end:
      raise ValueError("NameAliases.txt has an alias for a range %04lX..%04lX" %
                       (start, end))
    fields = data[4]
    if len(fields) == 2:
      alias = "correction=" + fields[1]
    else:
      alias = fields[2] + '=' + fields[1]
    update = (NeedToSetAlways, DoSetNameAlias, alias)
    UpdateProps(start, end, update)


def NeedToSetNumericValue(nv, start, end, c_props):
  c_nv = c_props.get("nv")
  if c_nv == None:
    # DerivedNumericValues.txt adds a Numeric_Value.
    assert "nt" not in c_props
    return True
  if nv != c_nv:
    raise ValueError(("UnicodeData.txt has nv=%s for %04lX..%04lX " +
                     "but DerivedNumericValues.txt has nv=%s") %
                     (c_nv, start, end, nv))
  return False


def DoSetNumericValue(nv, start, end, c_props):
  c_props.update({"nt": "Nu", "nv": nv})


def ParseDerivedNumericValues(in_file):
  """Parses DerivedNumericValues.txt.
  For most characters, the numeric type & value were parsed previously
  from UnicodeData.txt but that does not show the values for Han characters.
  Here we check that values match those from UnicodeData.txt
  and add new ones."""
  # Ignore the @missing line which has an incorrect number of fields,
  # and the "NaN" in the wrong field (at least in Unicode 5.1..6.1).
  # Also, "NaN" is just the Numeric null value anyway.
  for data in ReadUCDLines(in_file):
    # Conditional update to the numeric value in the 4th field.
    update = (NeedToSetNumericValue, DoSetNumericValue, data[4][3])
    UpdateProps(data[2], data[3], update)


def ParseCaseFolding(in_file):
  for data in ReadUCDLines(in_file, want_missing=True):
    if data[0] == "missing":
      assert data[2][0] == "C"  # common to scf & cf
      SetDefaultValue("scf", data[2][1])
      SetDefaultValue("cf", data[2][1])
    else:
      # type == "range"
      start = data[2]
      end = data[3]
      status = data[4][1]
      mapping = data[4][2]
      assert status in "CSFT"
      if status == "C":
        SetProps(start, end, {"scf": mapping, "cf": mapping})
      elif status == "S":
        SetPropertyValue("scf", mapping, start, end)
      elif status == "F":
        SetPropertyValue("cf", mapping, start, end)
      else:  # status == "T"
        SetPropertyValue("Turkic_Case_Folding", mapping, start, end)


def DoSetConditionalCaseMappings(ccm, start, end, c_props):
  if "Conditional_Case_Mappings" in c_props:
    c_props["Conditional_Case_Mappings"] += ',' + ccm
  else:
    c_props["Conditional_Case_Mappings"] = ccm


def ParseSpecialCasing(in_file):
  for data in ReadUCDLines(in_file, want_missing=True):
    if data[0] == "missing":
      SetDefaultValue("lc", data[2][0])
      SetDefaultValue("tc", data[2][1])
      SetDefaultValue("uc", data[2][2])
    else:
      # type == "range"
      start = data[2]
      end = data[3]
      fields = data[4]
      if len(fields) < 5 or not fields[4]:
        # Unconditional mappings.
        SetProps(start, end, {"lc": fields[1], "tc": fields[2], "uc": fields[3]})
      else:
        # Conditional_Case_Mappings
        ccm = (fields[4] + ":lc=" + fields[1] +
               "&tc=" + fields[2] + "&uc=" + fields[3])
        update = (NeedToSetAlways, DoSetConditionalCaseMappings, ccm)
        UpdateProps(start, end, update)


def ParseBidiBrackets(in_file):
  for data in ReadUCDLines(in_file, want_missing=True):
    if data[0] == "missing":
      SetDefaultValue("bpt", data[2][1])
    else:
      # type == "range"
      start = data[2]
      end = data[3]
      assert start == end
      mapping = data[4][1]
      bracket_type = data[4][2]
      SetProps(start, end, {"bpb": mapping, "bpt": bracket_type})

# Postprocessing ----------------------------------------------------------- ***

def PrintedSize(pname, value):
  if isinstance(value, bool):
    if value:
      return len(pname) + 1  # ";pname"
    else:
      return len(pname) + 2  # ";-pname"
  else:
    return len(pname) + len(str(value)) + 2  # ";pname=value"
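
# For example, PrintedSize("gc", "Lu") == 6 for ";gc=Lu",
# and PrintedSize("Upper", True) == 6 for ";Upper".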


def CompactBlock(b, i):
  assert b[0] == _starts[i]
  b_props = b[2]  # Normally just blk from Blocks.txt.
  # b_props["blk"] has not been canonicalized yet.
  b_props["blk"] = _props[i]["blk"]
  orig_i = i
  # Count the number of occurrences of each property's value in this block.
  # To minimize the output, count the number of assigned ranges,
  # not the number of code points.
  num_ranges = 0
  prop_counters = {}
  if "gc" in b_props:
    b_is_unassigned = b_props["gc"] == "Cn"  # Unreachable with normal data.
  else:
    b_is_unassigned = _defaults["gc"] == "Cn"  # This is expected to be true.
  while True:
    start = _starts[i]
    if start > b[1]: break
    props = _props[i]
    if "gc" in props:
      is_unassigned = props["gc"] == "Cn"
    else:
      is_unassigned = b_is_unassigned
    if is_unassigned:
      # Compact an unassigned range inside the block and
      # mark it to be written with "unassigned".
      # It falls back to default properties, not block properties,
      # except for the blk=Block property.
      assert props["blk"] == b_props["blk"]
      del props["blk"]
      for pname in list(props.keys()):  # .keys() is a copy so we can del props[pname].
        if props[pname] == _null_or_defaults[pname]: del props[pname]
      # What remains are unusual default values for unassigned code points.
      # For example, bc=R or lb=ID.
      # See http://www.unicode.org/reports/tr44/#Default_Values_Table
      props["unassigned"] = True
    else:
      for (pname, value) in props.items():
        if pname in prop_counters:
          counter = prop_counters[pname]
        else:
          counter = {_null_or_defaults[pname]: num_ranges}
          prop_counters[pname] = counter
        if value in counter:
          counter[value] += 1
        else:
          counter[value] = 1
      # Also count default values for properties that do not occur in a range.
      for pname in prop_counters:
        if pname not in props:
          counter = prop_counters[pname]
          value = _null_or_defaults[pname]
          counter[value] += 1
      num_ranges += 1
      # Invariant: For each counter, the sum of counts must equal num_ranges.
    i += 1
  # For each property that occurs within this block,
  # set the value that reduces the file size the most as a block property value.
  # This is usually the most common value.
  for (pname, counter) in prop_counters.items():
    default_value = _null_or_defaults[pname]
    default_size = PrintedSize(pname, default_value) * counter[default_value]
    max_value = None
    max_count = 0
    max_savings = 0
    for (value, count) in counter.items():
      if value != default_value and count > 1:
        # Does the file get smaller by setting the block default?
        # We save writing the block value as often as it occurs,
        # minus once for writing it for the block,
        # minus writing the default value instead.
        savings = PrintedSize(pname, value) * (count - 1) - default_size
        if savings > max_savings:
          max_value = value
          max_count = count
          max_savings = savings
    # Do not compress uncompressible properties,
    # with an exception for many empty-string values in a block
    # (NFKC_CF='' for tags and variation selectors).
    if (max_savings > 0 and
        ((pname not in _uncompressible_props) or
          (max_value == '' and max_count >= 12))):
      b_props[pname] = max_value
  # For each range and property, remove the default+block value
  # but set the default value if that property was not set
  # (i.e., it used to inherit the default value).
  b_defaults = _null_or_defaults.copy()
  b_defaults.update(b_props)
  i = orig_i
  while True:
    start = _starts[i]
    if start > b[1]: break
    props = _props[i]
    if "unassigned" not in props:
      # Compact an assigned range inside the block.
      for pname in prop_counters:
        if pname in props:
          if props[pname] == b_defaults[pname]: del props[pname]
        elif pname in b_props:
          # b_props only has non-default values.
          # Set the default value if it used to be inherited.
          props[pname] = _null_or_defaults[pname]
      # If there is only one assigned range, then move all of its properties
      # to the block.
      if num_ranges == 1:
        b_props.update(props)
        props.clear()
    i += 1
  # Return the _starts index of the first range after this block.
  return i


def CompactNonBlock(limit, i):
  """Remove default property values from between-block ranges."""
  default_is_unassigned = _defaults["gc"] == "Cn"  # This is expected to be true.
  while True:
    start = _starts[i]
    if start >= limit: break
    props = _props[i]
    if "gc" in props:
      is_unassigned = props["gc"] == "Cn"
    else:
      is_unassigned = default_is_unassigned
    for pname in list(props.keys()):  # .keys() is a copy so we can del props[pname].
      if props[pname] == _null_or_defaults[pname]: del props[pname]
    assert "blk" not in props
    # If there are no props left, then nothing will be printed.
    # Otherwise, add "unassigned" for more obvious output.
    if props and is_unassigned:
      props["unassigned"] = True
    i += 1
  # Return the _starts index of the first range after this block.
  return i


def CompactBlocks():
  """Optimizes block properties.
  Sets properties on blocks to the most commonly used values,
  and removes default+block values from code point properties."""
  # Ensure that there is a boundary in _starts for each block
  # so that the simple mixing method below works.
  for b in _blocks:
    AddBoundary(b[0])
    limit = b[1] + 1
    if limit <= 0x10ffff: AddBoundary(limit)
  # Walk through ranges and blocks together.
  i = 0
  for b in _blocks:
    b_start = b[0]
    if _starts[i] < b_start:
      i = CompactNonBlock(b_start, i)
    i = CompactBlock(b, i)
  CompactNonBlock(0x110000, i)

# Output ------------------------------------------------------------------- ***

def AppendRange(fields, start, end):
  if start == end:
    fields.append("%04lX" % start)
  else:
    fields.append("%04lX..%04lX" % (start, end))


def AppendProps(fields, props):
  # Sort property names (props keys) by their normalized forms
  # and output properties in that order.
  for pname in sorted(props, key=NormPropName):
    value = props[pname]
    if isinstance(value, bool):
      if not value: pname = "-" + pname
      fields.append(pname)
    else:
      fields.append("%s=%s" % (pname, value))


def WriteFieldsRangeProps(fields, start, end, props, out_file):
  AppendRange(fields, start, end)
  AppendProps(fields, props)
  out_file.write(";".join(fields))
  out_file.write("\n")
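
# For example (illustrative values),
#   WriteFieldsRangeProps(["cp"], 0x41, 0x5A, {"gc": "Lu", "Upper": True}, f)
# writes the line "cp;0041..005A;gc=Lu;Upper".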


def EscapeNonASCII(s):
  i = 0
  while i < len(s):
    c = ord(s[i])
    if c <= 0x7f:
      i = i + 1
    else:
      if c <= 0xffff:
        esc = u"\\u%04X" % c
      else:
        esc = u"\\U%08X" % c
      s = s[:i] + esc + s[i+1:]
      i = i + len(esc)
  return s
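
# For example, EscapeNonASCII(u"café") returns u"caf\\u00E9" (a literal
# backslash-u escape); code points above U+FFFF become \\U escapes.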


def WritePreparsedUCD(out_file):
  out_file.write("""# Preparsed UCD generated by ICU preparseucd.py
# Copyright (C) 1991 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
""");
  out_file.write("ucd;%s\n\n" % _ucd_version)
  # Sort property names (props keys) by their normalized forms
  # and output properties in that order.
  pnames = sorted(_null_values, key=NormPropName)
  for pname in pnames:
    prop = _properties[pname]
    out_file.write(";".join(["property", prop[0]] + prop[1]))
    out_file.write("\n")
  out_file.write("\n")
  out_file.write(";".join(["binary"] + _binary_values["N"]))
  out_file.write("\n")
  out_file.write(";".join(["binary"] + _binary_values["Y"]))
  out_file.write("\n")
  for pname in pnames:
    prop = _properties[pname]
    short_names = prop[2]
    if short_names and prop[0] != "Binary":
      for name in sorted(short_names):
        out_file.write(";".join(["value", prop[1][0]] + prop[3][name]))
        out_file.write("\n")
  out_file.write("\n")
  # Ensure that there is a boundary in _starts for each
  # range of data we mix into the output,
  # so that the simple mixing method below works.
  for b in _blocks: AddBoundary(b[0])
  for r in _alg_names_ranges: AddBoundary(r[0])
  for h in _h1: AddBoundary(h[0])
  for h in _h2: AddBoundary(h[0])
  # Write the preparsed data. ppucd.txt = preparsed UCD
  # Syntax: http://site.icu-project.org/design/props/ppucd
  WriteFieldsRangeProps(["defaults"], 0, 0x10ffff, _defaults, out_file)
  i_blocks = 0
  i_alg = 0
  i_h1 = 0
  i_h2 = 0
  b_end = -1
  for i in range(len(_starts) - 1):
    start = _starts[i]
    end = _starts[i + 1] - 1
    # Block with default properties.
    if i_blocks < len(_blocks) and start == _blocks[i_blocks][0]:
      b = _blocks[i_blocks]
      b_end = b[1]
      WriteFieldsRangeProps(["\nblock"], b[0], b_end, b[2], out_file)
      i_blocks += 1
    # NamesList h1 heading (for [most of] a block).
    if i_h1 < len(_h1) and start == _h1[i_h1][0]:
      h = _h1[i_h1]
      out_file.write("# %04lX..%04lX %s\n" % (h[0], h[1], EscapeNonASCII(h[2])))
      i_h1 += 1
    # Algorithmic-names range.
    if i_alg < len(_alg_names_ranges) and start == _alg_names_ranges[i_alg][0]:
      r = _alg_names_ranges[i_alg]
      fields = ["algnamesrange"]
      AppendRange(fields, r[0], r[1])
      fields.extend(r[2:])
      out_file.write(";".join(fields))
      out_file.write("\n")
      i_alg += 1
    # NamesList h2 heading.
    if i_h2 < len(_h2) and start == _h2[i_h2][0]:
      out_file.write("# %s\n" % EscapeNonASCII(_h2[i_h2][1]))
      i_h2 += 1
    # Code point/range data.
    props = _props[i]
    # Omit ranges with only default+block properties.
    if props:
      if start > b_end and b_end >= 0:
        # First range with values after the last block.
        # Separate it visually from the block lines.
        out_file.write("\n# No block\n")
        b_end = -1
      if "unassigned" in props:
        # Do not output "unassigned" as a property.
        del props["unassigned"]
        line_type = "unassigned"
      else:
        line_type = "cp"
      WriteFieldsRangeProps([line_type], start, end, props, out_file)

# Write Normalizer2 input files -------------------------------------------- ***
# Ported from gennorm/store.c.

def WriteAllCC(out_file):
  out_file.write("# Canonical_Combining_Class (ccc) values\n");
  prev_start = 0
  prev_cc = 0
  for i in range(len(_starts)):
    start = _starts[i]
    props = _props[i]
    cc = props.get("ccc")
    if not cc: cc = 0
    if prev_cc != cc:
      if prev_cc != 0:
        last_code_point = start - 1
        if prev_start == last_code_point:
          out_file.write("%04X:%d\n" % (last_code_point, prev_cc))
        else:
          out_file.write("%04X..%04X:%d\n" %
                         (prev_start, last_code_point, prev_cc))
      prev_start = start
      prev_cc = cc


def HasMapping(c):
  props = GetProps(c)
  dt = props.get("dt")
  return dt and dt != "None"


def HasOneWayMapping(c):
  while True:
    props = GetProps(c)
    dt = props.get("dt")
    if not dt or dt == "None":
      return False  # no mapping
    elif dt == "Can":
      # The canonical decomposition is a one-way mapping if
      # - it does not map to exactly two code points
      # - c has ccc!=0
      # - c has the Composition_Exclusion property
      # - its starter has a one-way mapping (loop for this)
      # - its non-starter decomposes
      nfd = props["dm"].split()
      if (len(nfd) != 2 or
          props.get("ccc") or
          props.get("Comp_Ex") or
          HasMapping(int(nfd[1], 16))):
        return True
      c = int(nfd[0], 16)  # continue
    else:
      # c has a compatibility mapping.
      return True


_data_file_copyright = """# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (C) 1999-2016, International Business Machines
# Corporation and others.  All Rights Reserved.
#
"""

def WriteNorm2NFCTextFile(path):
  global _data_file_copyright
  with open(os.path.join(path, "nfc.txt"), "w") as out_file:
    out_file.write(
        _data_file_copyright + """# file name: nfc.txt
#
# machine-generated by ICU preparseucd.py
#
# Complete data for Unicode NFC normalization.

* Unicode """ + _ucd_version + """

""")
    WriteAllCC(out_file)
    out_file.write("\n# Canonical decomposition mappings\n")
    for i in range(len(_starts) - 1):
      start = _starts[i]
      end = _starts[i + 1] - 1
      props = _props[i]
      dm = props.get("dm")
      if dm and dm[0] != '<' and props["dt"] == "Can":
        assert start == end
        # The Comp_Ex=Full_Composition_Exclusion property tells us
        # whether the canonical decomposition round-trips.
        separator = '>' if props.get("Comp_Ex") else '='
        out_file.write("%04X%s%s\n" % (start, separator, dm))


def WriteNorm2NFKCTextFile(path):
  global _data_file_copyright
  with open(os.path.join(path, "nfkc.txt"), "w") as out_file:
    out_file.write(
        _data_file_copyright + """# file name: nfkc.txt
#
# machine-generated by ICU preparseucd.py
#
# Data for Unicode NFKC normalization.
# This file contains only compatibility decomposition mappings,
# plus those canonical decompositions that change from NFC round-trip mappings
# to NFKC one-way mappings.
# Use this file as the second gennorm2 input file after nfc.txt.

* Unicode """ + _ucd_version + """

""")
    for i in range(len(_starts) - 1):
      start = _starts[i]
      end = _starts[i + 1] - 1
      props = _props[i]
      dm = props.get("dm")
      if dm and dm[0] != '<':
        assert start == end
        if props["dt"] != "Can":
          # Compatibility decomposition.
          out_file.write("%04X>%s\n" % (start, dm))
        elif not props.get("Comp_Ex") and HasOneWayMapping(start):
          # NFC round-trip mapping turns into NFKC one-way mapping.
          out_file.write("%04X>%s  # NFC round-trip, NFKC one-way\n" %
                         (start, dm))


def WriteNorm2NFKC_CFTextFile(path):
  global _data_file_copyright
  with open(os.path.join(path, "nfkc_cf.txt"), "w") as out_file:
    out_file.write(
        _data_file_copyright + """# file name: nfkc_cf.txt
#
# machine-generated by ICU preparseucd.py
#
# This file contains the Unicode NFKC_CF mappings,
# extracted from the UCD file DerivedNormalizationProps.txt,
# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.

""")
    out_file.write("* Unicode " + _ucd_version + "\n\n")
    prev_start = 0
    prev_end = 0
    prev_nfkc_cf = None
    for i in range(len(_starts) - 1):
      start = _starts[i]
      end = _starts[i + 1] - 1
      props = _props[i]
      nfkc_cf = props.get("NFKC_CF")
      # Merge with the previous range if possible,
      # or remember this range for merging.
      if nfkc_cf == prev_nfkc_cf and (prev_end + 1) == start:
        prev_end = end
      else:
        if prev_nfkc_cf != None and (not prev_nfkc_cf or prev_nfkc_cf[0] != '<'):
          if prev_start == prev_end:
            out_file.write("%04X>%s\n" % (prev_start, prev_nfkc_cf))
          else:
            out_file.write("%04X..%04X>%s\n" %
                           (prev_start, prev_end, prev_nfkc_cf))
        prev_start = start
        prev_end = end
        prev_nfkc_cf = nfkc_cf


def WriteNorm2(path):
  WriteNorm2NFCTextFile(path)
  WriteNorm2NFKCTextFile(path)
  WriteNorm2NFKC_CFTextFile(path)

# UTS #46 Normalizer2 input file ------------------------------------------- ***

_idna_replacements = [
  # Several versions of avoiding circular FFFD>FFFD mappings,
  # depending on the version of the input file.
  (re.compile(r"FFFD          ; disallowed"), "# FFFD (avoid circular mapping)"),
  (re.compile(r"\.\.FFFD"), "..FFFC"),
  (re.compile(r"(FFF[^E])\.\.FFFF"), r"\1..FFFC"),
  # Since we switch between checking and not checking for STD3 character
  # restrictions at runtime, checking the non-LDH ASCII characters in code,
  # we treat these values here like their regular siblings.
  (re.compile(r"^([^;]+)  ; disallowed_STD3_valid"), r"# \1disallowed_STD3_valid"),
  (re.compile(r"; disallowed_STD3_mapped +; "), ">"),
  # For UTS #46, we do not care about "not valid in IDNA2008".
  (re.compile(r"; *; NV8 +"), ""),
  # ICU 63+ normalization no longer allows mappings for surrogate code points,
  # and the UTS #46 code handles them instead.
  (re.compile(r"^D800..DFFF    ; disallowed"), r"# D800..DFFF disallowed in code"),
  # Normal transformations.
  (re.compile(r"; disallowed"), ">FFFD"),
  (re.compile(r"; ignored"), ">"),
  (re.compile(r"^([^;]+)  ; valid"), r"# \1valid"),
  (re.compile(r"; mapped +; "), ">"),
  (re.compile(r"^([^;]+)  ; deviation +; "), r"# \1deviation >")
]
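
# For example (illustrative), an IdnaMappingTable.txt line like
#   "0041          ; mapped       ; 0061"
# becomes the gennorm2 mapping
#   "0041          >0061"
# while "disallowed" lines become mappings to FFFD and "valid" lines are
# commented out.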

def IdnaToUTS46TextFile(s, t):
  """Turn Unicode IdnaMappingTable.txt into ICU gennorm2 source file format."""
  # Different input/output file names.
  dest_path = os.path.dirname(t)
  t = os.path.join(dest_path, "uts46.txt")
  # TODO: With Python 2.7+, combine the two with statements into one.
  with open(s, "r") as in_file:
    with open(t, "w") as out_file:
      out_file.write("# Original file:\n")
      for line in in_file:
        orig_line = line
        if line.startswith("# For documentation"):
          out_file.write(line)
          out_file.write(r"""
# ================================================
# This file has been reformatted into syntax for the
# gennorm2 Normalizer2 data generator tool.
#
# "valid", "disallowed_STD3_valid" and "deviation" lines are commented out.
# "mapped" and "disallowed_STD3_mapped" are changed to use the ">" mapping syntax.
# "disallowed" lines map to U+FFFD.
# "ignored" lines map to an empty string.
#
# Characters disallowed under STD3 rules are treated as valid or mapped;
# they are handled in code.
# Deviation characters are also handled in code.
#
# Use this file as the second gennorm2 input file after nfc.txt.
# ================================================
""")
          continue
        if line[0] in "#\r\n":
          out_file.write(line)
          continue
        for rep in _idna_replacements: line = rep[0].sub(rep[1], line)
        # Align inline comments at column 40.
        comment_pos = line.find("#", 1)
        if comment_pos < 40:
          line = (line[:comment_pos] + ((40 - comment_pos) * ' ') +
                  line[comment_pos:])
        elif comment_pos > 40:
          space_pos = comment_pos
          while space_pos > 0 and line[space_pos - 1] == ' ':
            space_pos = space_pos - 1
          if space_pos < 40:
            # Fewer than 40 characters before the comment:
            # Align comments at column 40.
            line = line[:40] + line[comment_pos:]
          else:
            # 40 or more characters before the comment:
            # Keep one space between contents and comment.
            line = line[:space_pos] + " " + line[comment_pos:]
        # Write the modified line.
        out_file.write(line)
        if "..FFFF" in orig_line and "..FFFC" in line:
          out_file.write("FFFE..FFFF    >FFFD\n");
  return t

# Preprocessing ------------------------------------------------------------ ***

1497_strip_re = re.compile("([0-9a-fA-F]+.+?) *#.*")
1498_code_point_re = re.compile("\s*([0-9a-fA-F]+)\s*;")

def CopyAndStripWithOptionalMerge(s, t, do_merge):
  # TODO: We do not seem to need the do_merge argument and logic any more.
  with open(s, "r") as in_file, open(t, "w") as out_file:
    first = -1  # First code point with first_data.
    last = -1  # Last code point with first_data.
    first_data = ""  # Common data for code points [first..last].
    for line in in_file:
      match = _strip_re.match(line)
      if match:
        line = match.group(1)
      else:
        line = line.rstrip()
      if do_merge:
        match = _code_point_re.match(line)
        if match:
          c = int(match.group(1), 16)
          data = line[match.end() - 1:]
        else:
          c = -1
          data = ""
        if last >= 0 and (c != (last + 1) or data != first_data):
          # Output the current range.
          if first == last:
            out_file.write("%04X%s\n" % (first, first_data))
          else:
            out_file.write("%04X..%04X%s\n" % (first, last, first_data))
          first = -1
          last = -1
          first_data = ""
        if c < 0:
          # No data on this line, output as is.
          out_file.write(line)
          out_file.write("\n")
        else:
          # Data on this line, store for possible range compaction.
          if last < 0:
            # Set as the first line in a possible range.
            first = c
            last = c
            first_data = data
          else:
            # Must be c == (last + 1) and data == first_data
            # because of the previous conditions;
            # continue with the current range.
            last = c
      else:
        # Only strip, don't merge: just output the stripped line.
        out_file.write(line)
        out_file.write("\n")
    if do_merge and last >= 0:
      # Output the last range in the file.
      if first == last:
        out_file.write("%04X%s\n" % (first, first_data))
      else:
        out_file.write("%04X..%04X%s\n" % (first, last, first_data))
      first = -1
      last = -1
      first_data = ""
    out_file.flush()
  return t


def CopyAndStrip(s, t):
  """Copies a file and removes comments behind data lines but not in others."""
  return CopyAndStripWithOptionalMerge(s, t, False)


def CopyAndStripAndMerge(s, t):
  """Copies and strips a file and merges lines.

  Copies a file, removes comments, and
  merges lines with adjacent code point ranges and identical per-code point
  data lines into one line with range syntax.
  """
  return CopyAndStripWithOptionalMerge(s, t, True)
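
# For example (illustrative), CopyAndStripAndMerge() turns input lines
#   0041;data  # comment
#   0042;data  # comment
#   0043;data  # comment
# into the single stripped line
#   0041..0043;data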


def CopyOnly(s, t):
  """Copies a file without further processing."""
  shutil.copy(s, t)
  return t


def DontCopy(s, t):
  """Does not copy the file; the source file is parsed in place."""
  return s


# Each _files value is a
# (preprocessor, dest_folder, parser, order) tuple
# where all fields except the preprocessor are optional.
# After the initial preprocessing (copy/strip/merge),
# if a parser is specified, then a tuple is added to _files_to_parse
# at index "order" (default order 9).
# An explicit order number is set only for files that must be parsed
# before others.
_files = {
  "BidiBrackets.txt": (DontCopy, ParseBidiBrackets),
  "BidiMirroring.txt": (DontCopy, ParseBidiMirroring),
  "BidiTest.txt": (CopyOnly, "testdata"),
  "Blocks.txt": (DontCopy, ParseBlocks),
  "CaseFolding.txt": (CopyOnly, ParseCaseFolding),
  "DerivedAge.txt": (DontCopy, ParseDerivedAge),
  "DerivedBidiClass.txt": (DontCopy, ParseDerivedBidiClass),
  "DerivedCoreProperties.txt": (CopyAndStrip, ParseNamedProperties),
  "DerivedJoiningGroup.txt": (DontCopy, ParseDerivedJoiningGroup),
  "DerivedJoiningType.txt": (DontCopy, ParseDerivedJoiningType),
  "DerivedNormalizationProps.txt": (CopyAndStrip, ParseNamedProperties),
  "DerivedNumericValues.txt": (DontCopy, ParseDerivedNumericValues),
  "EastAsianWidth.txt": (DontCopy, ParseEastAsianWidth),
  "emoji-data.txt": (DontCopy, ParseNamedProperties),
  "GraphemeBreakProperty.txt": (DontCopy, ParseGraphemeBreakProperty),
  "GraphemeBreakTest-cldr.txt": (CopyOnly, "testdata"),
  "IdnaTestV2.txt": (CopyOnly, "testdata"),
  "IndicPositionalCategory.txt": (DontCopy, ParseIndicPositionalCategory),
  "IndicSyllabicCategory.txt": (DontCopy, ParseIndicSyllabicCategory),
  "LineBreak.txt": (DontCopy, ParseLineBreak),
  "LineBreakTest.txt": (CopyOnly, "testdata"),
  "NameAliases.txt": (DontCopy, ParseNameAliases),
  "NamesList.txt": (DontCopy, ParseNamesList),
  "NormalizationCorrections.txt": (CopyOnly,),  # Only used in gensprep.
  "NormalizationTest.txt": (CopyAndStrip,),
  "PropertyAliases.txt": (DontCopy, ParsePropertyAliases, 0),
  "PropertyValueAliases.txt": (DontCopy, ParsePropertyValueAliases, 1),
  "PropList.txt": (DontCopy, ParseNamedProperties),
  "SentenceBreakProperty.txt": (DontCopy, ParseSentenceBreak),
  "SentenceBreakTest.txt": (CopyOnly, "testdata"),
  "Scripts.txt": (DontCopy, ParseScripts),
  "ScriptExtensions.txt": (DontCopy, ParseScriptExtensions),
  "SpecialCasing.txt": (CopyOnly, ParseSpecialCasing),
  "UnicodeData.txt": (CopyOnly, ParseUnicodeData, 2),
  "VerticalOrientation.txt": (DontCopy, ParseVerticalOrientation),
  "WordBreakProperty.txt": (DontCopy, ParseWordBreak),
  "WordBreakTest.txt": (CopyOnly, "testdata"),
  # From www.unicode.org/Public/idna/<version>/
  "IdnaMappingTable.txt": (IdnaToUTS46TextFile, "norm2")
}

# List of lists of files to be parsed in order.
# Inner lists contain (basename, path, parser) tuples.
_files_to_parse = [[], [], [], [], [], [], [], [], [], []]

# Get the standard basename from a versioned filename.
# For example, match "UnicodeData-6.1.0d8.txt"
# so we can turn it into "UnicodeData.txt".
_file_version_re = re.compile(
    r"([a-zA-Z0-9_-]+)-[0-9]+(?:\.[0-9]+)*(?:d[0-9]+)?(\.[a-z]+)$")
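
# For example (illustrative):
#   _file_version_re.match("UnicodeData-6.1.0d8.txt").group(1, 2)
# returns ("UnicodeData", ".txt"), which PreprocessFiles() joins back
# into the standard basename "UnicodeData.txt".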

def PreprocessFiles(source_files, icu4c_src_root):
  unidata_path = os.path.join(icu4c_src_root, "source", "data", "unidata")
  norm2_path = os.path.join(unidata_path, "norm2")
  testdata_path = os.path.join(icu4c_src_root, "source", "test", "testdata")
  folder_to_path = {
    "unidata": unidata_path,
    "norm2": norm2_path,
    "testdata": testdata_path
  }
  files_processed = set()
  for source_file in source_files:
    (folder, basename) = os.path.split(source_file)
    match = _file_version_re.match(basename)
    if match:
      new_basename = match.group(1) + match.group(2)
      if new_basename != basename:
        print("Removing version suffix from " + source_file)
        # ... so that we can easily compare UCD files.
        new_source_file = os.path.join(folder, new_basename)
        shutil.move(source_file, new_source_file)
        basename = new_basename
        source_file = new_source_file
    if basename in _files:
      print("Preprocessing %s" % basename)
      if basename in files_processed:
        raise Exception("duplicate file basename %s!" % basename)
      files_processed.add(basename)
      value = _files[basename]
      preprocessor = value[0]
      if len(value) >= 2 and isinstance(value[1], str):
        # The value was (preprocessor, dest_folder, ...); keep the rest.
        dest_folder = value[1]
        value = value[2:]
      else:
        # The value was (preprocessor, ...); keep the rest.
        dest_folder = "unidata"
        value = value[1:]
      dest_path = folder_to_path[dest_folder]
      if not os.path.exists(dest_path): os.makedirs(dest_path)
      dest_basename = basename
      # Source GraphemeBreakTest-cldr.txt --> destination GraphemeBreakTest.txt.
      if basename.endswith("-cldr.txt"):
        dest_basename = basename[:-9] + basename[-4:]
      dest_file = os.path.join(dest_path, dest_basename)
      parse_file = preprocessor(source_file, dest_file)
      if value:
        order = 9 if len(value) < 2 else value[1]
        _files_to_parse[order].append((basename, parse_file, value[0]))

# Character names ---------------------------------------------------------- ***

# TODO: Turn this script into a module that
# a) gives access to the parsed data
# b) has a PreparseUCD(ucd_root, icu4c_src_root) function
# c) has a ParsePreparsedUCD(filename) function
# d) has a WritePreparsedUCD(filename) function
# and then use it from a new script for names.
# Some more API:
# - generator GetRangesAndProps() -> (start, end, props)*

def IncCounter(counters, key, inc=1):
  if key in counters:
    counters[key] += inc
  else:
    counters[key] = inc


_endings = (
  # List PHASE- before LETTER for BAMUM LETTER PHASE-xyz.
  "PHASE-",
  "LETTER ", "LIGATURE ", "CHARACTER ", "SYLLABLE ",
  "CHOSEONG ", "JUNGSEONG ", "JONGSEONG ",
  "SYLLABICS ", "IDEOGRAPH ", "IDEOGRAPH-", "IDEOGRAM ", "MONOGRAM ",
  "ACROPHONIC ", "HIEROGLYPH ",
  "DIGIT ", "NUMBER ", "NUMERAL ", "FRACTION ",
  "PUNCTUATION ", "SIGN ", "SYMBOL ",
  "TILE ", "CARD ", "FACE ",
  "ACCENT ", "POINT ",
  # List SIGN before VOWEL to catch "vowel sign".
  "VOWEL ", "TONE ", "RADICAL ",
  # For names of math symbols,
  # e.g., MATHEMATICAL BOLD ITALIC CAPITAL A
  "SCRIPT ", "FRAKTUR ", "MONOSPACE ",
  "ITALIC ", "BOLD ", "DOUBLE-STRUCK ", "SANS-SERIF ",
  "INITIAL ", "TAILED ", "STRETCHED ", "LOOPED ",
  # BRAILLE PATTERN DOTS-xyz
  "DOTS-",
  "SELECTOR ", "SELECTOR-"
)

def SplitName(name, tokens):
  start = 0
  for e in _endings:
    i = name.find(e)
    if i >= 0:
      start = i + len(e)
      token = name[:start]
      IncCounter(tokens, token)
      break
  for i in range(start, len(name)):
    c = name[i]
    if c == ' ' or c == '-':
      token = name[start:i + 1]
      IncCounter(tokens, token)
      start = i + 1
  IncCounter(tokens, name[start:])
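
# For example (illustrative), SplitName("LATIN SMALL LETTER A", counters)
# counts the tokens "LATIN SMALL LETTER " and "A":
# the ending "LETTER " (found first in _endings) terminates the first token,
# and the remainder is split after each space or hyphen.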


def PrintNameStats():
  # TODO: This name analysis code is out of date.
  # It needs to consider the multi-type Name_Alias values.
  name_pnames = ("na", "na1", "Name_Alias")
  counts = {}
  for pname in name_pnames:
    counts[pname] = 0
  total_lengths = counts.copy()
  max_length = 0
  max_per_cp = 0
  name_chars = set()
  num_digits = 0
  token_counters = {}
  char_counters = {}
  for i in range(len(_starts) - 1):
    start = _starts[i]
    # end = _starts[i + 1] - 1
    props = _props[i]
    per_cp = 0
    for pname in name_pnames:
      if pname in props:
        counts[pname] += 1
        name = props[pname]
        total_lengths[pname] += len(name)
        name_chars |= set(name)
        if len(name) > max_length: max_length = len(name)
        per_cp += len(name) + 1
        if per_cp > max_per_cp: max_per_cp = per_cp
        SplitName(name, token_counters)
        for c in name:
          if c in "0123456789": num_digits += 1
          IncCounter(char_counters, c)
  print()
  for pname in name_pnames:
    print("'%s' character names: %d / %d bytes" %
          (pname, counts[pname], total_lengths[pname]))
  print("%d total bytes in character names" % sum(total_lengths.values()))
  print("%d name-characters: %s" %
        (len(name_chars), "".join(sorted(name_chars))))
  print("%d digits 0-9" % num_digits)
  count_chars = [(count, c) for (c, count) in char_counters.items()]
  count_chars.sort(reverse=True)
  for cc in count_chars:
    print("name-chars: %6d * '%s'" % cc)
  print("max. name length: %d" % max_length)
  print("max. length of all (names+NUL) per cp: %d" % max_per_cp)

  token_lengths = sum([len(t) + 1 for t in token_counters])
  print("%d total tokens, %d bytes with NUL" %
        (len(token_counters), token_lengths))

  counts_tokens = []
  for (token, count) in token_counters.items():
    # If we encode a token with a 1-byte code, then we save len(token)-1
    # bytes each time, but have to store the token string itself with a
    # length or terminator byte, plus a 2-byte entry in a token index table.
    # For example, a 7-character token that occurs 1000 times
    # saves 1000*6 - (7+1+2) = 5990 bytes.
    savings = count * (len(token) - 1) - (len(token) + 1 + 2)
    if savings > 0:
      counts_tokens.append((savings, count, token))
  counts_tokens.sort(reverse=True)
  print("%d tokens might save space with 1-byte codes" % len(counts_tokens))

  # Codes are bytes: 40 byte values are reserved for the name_chars themselves.
  # That leaves 216 units for 1-byte tokens or lead bytes of 2-byte tokens.
  # Make each 2-byte token code the token string index itself, rather than
  # an index into a string index table.
  # More lead bytes, but also more savings.
  num_units = 256
  max_lead = (token_lengths + 255) // 256  # Ceiling of token_lengths / 256.
  max_token_units = num_units - len(name_chars)
  results = []
  for num_lead in range(min(max_lead, max_token_units) + 1):
    max1 = max_token_units - num_lead
    ct = counts_tokens[:max1]
    tokens1 = set([t for (s, c, t) in ct])
    for (token, count) in token_counters.items():
      if token in tokens1: continue
      # If we encode a token with a 2-byte code, then we save len(token)-2
      # bytes each time, but have to store the token string itself with a
      # length or terminator byte.
      savings = count * (len(token) - 2) - (len(token) + 1)
      if savings > 0:
        ct.append((savings, count, token))
    ct.sort(reverse=True)
    # A 2-byte-code-token index cannot be limit_t_lengths or higher.
    limit_t_lengths = num_lead * 256
    token2_index = 0
    for i in range(max1, len(ct)):
      if token2_index >= limit_t_lengths:
        del ct[i:]
        break
      token2_index += len(ct[i][2]) + 1
    cumul_savings = sum([s for (s, c, t) in ct])
    # print("%2d 1-byte codes: %4d tokens might save %6d bytes" %
    #       (max1, len(ct), cumul_savings))
    results.append((cumul_savings, max1, ct))
  best = max(results)  # (cumul_savings, max1, ct)

  max1 = best[1]
  print("maximum savings: %d bytes with %d 1-byte codes & %d lead bytes" %
        (best[0], max1, max_token_units - max1))
  counts_tokens = best[2]
  cumul_savings = 0
  for i in range(len(counts_tokens)):
    n = 1 if i < max1 else 2
    i1 = i + 1
    t = counts_tokens[i]
    cumul_savings += t[0]
    if i1 <= 250 or (i1 % 100) == 0 or i1 == len(counts_tokens):
      print(("%04d. cumul. %6d bytes save %6d bytes from " +
             "%5d * %d-byte token for %2d='%s'") %
            (i1, cumul_savings, t[0], t[1], n, len(t[2]), t[2]))

# ICU API ------------------------------------------------------------------ ***

# Sample line to match:
#    UCHAR_UNIFIED_IDEOGRAPH=29,
_uchar_re = re.compile(
    " *(UCHAR_[0-9A-Z_]+) *= *(?:[0-9]+|0x[0-9a-fA-F]+),")

# Sample line to match:
#    /** Zs @stable ICU 2.0 */
_gc_comment_re = re.compile(r" */\*\* *([A-Z][a-z]) ")

# Sample line to match:
#    U_SPACE_SEPARATOR         = 12,
_gc_re = re.compile(" *(U_[A-Z_]+) *= *[0-9]+,")

# Sample line to match:
#    /** L @stable ICU 2.0 */
_bc_comment_re = re.compile(r" */\*\* *([A-Z]{1,3}) ")

# Sample line to match:
#    U_LEFT_TO_RIGHT               = 0,
_bc_re = re.compile(" *(U_[A-Z_]+) *= *[0-9]+,")

# Sample line to match:
#    UBLOCK_CYRILLIC =9,
_ublock_re = re.compile(" *(UBLOCK_[0-9A-Z_]+) *= *[0-9]+,")

# Sample line to match:
#    U_EA_AMBIGUOUS,
_prop_and_value_re = re.compile(
    " *(U_(BPT|DT|EA|GCB|HST|INPC|INSC|LB|JG|JT|NT|SB|VO|WB)_([0-9A-Z_]+))")

# Sample line to match if it has matched _prop_and_value_re
# (we want to exclude aliases):
#    U_JG_HAMZA_ON_HEH_GOAL=U_JG_TEH_MARBUTA_GOAL,
_prop_and_alias_re = re.compile(" *U_[0-9A-Z_]+ *= *U")

def ParseUCharHeader(icu4c_src_root):
  uchar_path = os.path.join(icu4c_src_root, "source",
                            "common", "unicode", "uchar.h")
  with open(uchar_path, "r") as uchar_file:
    mode = ""  # Mode string (=pname) during context-sensitive parsing.
    comment_value = ""  # Property value from a comment preceding an enum.
    # Note: The enum UProperty is first in uchar.h, before the enums for values.
    for line in uchar_file:
      # Parse some enums via context-sensitive "modes".
      # Necessary because the enum constant names do not contain
      # enough information.
      if "enum UCharCategory" in line:
        mode = "gc"
        comment_value = ""
        continue
      if mode == "gc":
        if line.startswith("}"):
          mode = ""
          continue
        match = _gc_comment_re.match(line)
        if match:
          comment_value = match.group(1)
          continue
        match = _gc_re.match(line)
        if match and comment_value:
          gc_enum = match.group(1)
          prop = _properties["gc"]
          vname = GetShortPropertyValueName(prop, comment_value)
          icu_values = _pname_to_icu_prop["gc"][2]
          icu_values.append((gc_enum, vname))
        comment_value = ""
        continue
      if "enum UCharDirection {" in line:
        mode = "bc"
        comment_value = ""
        continue
      if mode == "bc":
        if line.startswith("}"):
          mode = ""
          continue
        match = _bc_comment_re.match(line)
        if match:
          comment_value = match.group(1)
          continue
        match = _bc_re.match(line)
        if match and comment_value:
          bc_enum = match.group(1)
          prop = _properties["bc"]
          vname = GetShortPropertyValueName(prop, comment_value)
          icu_values = _pname_to_icu_prop["bc"][2]
          icu_values.append((bc_enum, vname))
        comment_value = ""
        continue
      # No mode: parse enum constants whose names contain
      # enough information to parse without requiring context.
      match = _uchar_re.match(line)
      if match:
        prop_enum = match.group(1)
        if prop_enum.endswith("_LIMIT"):
          # Ignore "UCHAR_BINARY_LIMIT=57," etc.
          continue
        pname = GetShortPropertyName(prop_enum[6:])
        icu_prop = (prop_enum, pname, [])
        _icu_properties.append(icu_prop)
        _pname_to_icu_prop[pname] = icu_prop
        continue
      match = _ublock_re.match(line)
      if match:
        prop_enum = match.group(1)
        if prop_enum == "UBLOCK_COUNT":
          continue
        prop = _properties["blk"]
        vname = GetShortPropertyValueName(prop, prop_enum[7:])
        icu_values = _pname_to_icu_prop["blk"][2]
        icu_values.append((prop_enum, vname))
        continue
      match = _prop_and_value_re.match(line)
      if match:
        (prop_enum, vname) = match.group(1, 3)
        if vname == "COUNT" or _prop_and_alias_re.match(line):
          continue
        pname = GetShortPropertyName(match.group(2))
        prop = _properties[pname]
        vname = GetShortPropertyValueName(prop, vname)
        icu_values = _pname_to_icu_prop[pname][2]
        icu_values.append((prop_enum, vname))
  # ccc, lccc, tccc use their numeric values as "enum" values.
  # In the UCD data, these numeric values are the first value names,
  # followed by the short & long value names.
  # List the ccc values in numeric order.
  prop = _properties["ccc"]
  icu_values = _pname_to_icu_prop["ccc"][2]
  for ccc in sorted([int(name) for name in prop[2]]):
    icu_values.append((ccc, str(ccc)))
  _pname_to_icu_prop["lccc"][2].extend(icu_values)  # Copy ccc -> lccc.
  _pname_to_icu_prop["tccc"][2].extend(icu_values)  # Copy ccc -> tccc.

  # No need to parse the predictable General_Category_Mask enum constants.
  # Just define them in ASCII order.
  prop = _properties["gcm"]
  icu_values = _pname_to_icu_prop["gcm"][2]
  for vname in sorted(prop[2]):
    icu_values.append(("U_GC_" + vname.upper() + "_MASK", vname))
  # Hardcode known values for the normalization quick check properties;
  # see unorm2.h for the UNormalizationCheckResult enum.
  icu_values = _pname_to_icu_prop["NFC_QC"][2]
  icu_values.append(("UNORM_NO", "N"))
  icu_values.append(("UNORM_YES", "Y"))
  icu_values.append(("UNORM_MAYBE", "M"))
  _pname_to_icu_prop["NFKC_QC"][2].extend(icu_values)  # Copy NFC -> NFKC.
  # No "maybe" values for NF[K]D.
  icu_values = _pname_to_icu_prop["NFD_QC"][2]
  icu_values.append(("UNORM_NO", "N"))
  icu_values.append(("UNORM_YES", "Y"))
  _pname_to_icu_prop["NFKD_QC"][2].extend(icu_values)  # Copy NFD -> NFKD.

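# After parsing, _pname_to_icu_prop maps each short property name to its
# (enum name, pname, values) tuple; for example (illustrative):
#   _pname_to_icu_prop["gc"] ==
#       ("UCHAR_GENERAL_CATEGORY", "gc", [("U_UPPERCASE_LETTER", "Lu"), ...])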

# Sample line to match:
#    USCRIPT_LOMA   = 139,/* Loma */
_uscript_re = re.compile(
    r" *(USCRIPT_[A-Z_]+) *= *[0-9]+ *, */\* *([A-Z][a-z]{3}) *\*/")

def ParseUScriptHeader(icu4c_src_root):
  uscript_path = os.path.join(icu4c_src_root, "source",
                              "common", "unicode", "uscript.h")
  icu_values = _pname_to_icu_prop["sc"][2]
  with open(uscript_path, "r") as uscript_file:
    for line in uscript_file:
      match = _uscript_re.match(line)
      if match:
        (script_enum, script_code) = match.group(1, 2)
        icu_values.append((script_enum, script_code))


def CheckPNamesData():
  """Checks that every ICU property has a full set of value enum constants,
  and that the _icu_properties value names map back to the UCD."""
  missing_enums = []
  for (p_enum, pname, values) in _icu_properties:
    prop = _properties[pname]
    vnames = set(prop[2])  # Modifiable copy of the set of short value names.
    for (v_enum, vname) in values:
      if vname not in vnames:
        raise ValueError("%s = %s (uchar.h %s) not in the UCD\n" %
                         (pname, vname, v_enum))
      vnames.remove(vname)
    # Exceptions to the all-values check:
    # - ICU does not have specific enum values for binary No/Yes.
    # - ICU represents Age values via UVersionInfo rather than enum constants.
    # - gc: ICU enum UCharCategory only has the single-category values.
    #       (ICU's gcm property has all of the UCD gc property values.)
    if vnames and not (prop[0] == "Binary" or pname in ("age", "gc")):
      missing_enums.append((pname, vnames))
  if missing_enums:
    raise ValueError(
        "missing uchar.h enum constants for some property values: %s" %
        missing_enums)


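# The generated pnames_data.h contains arrays like the following (illustrative):
#   static const Value VALUES_binprop[2] = {
#       Value(0, "N No F False"),
#       Value(1, "Y Yes T True"),
#   };
#   static const Property PROPERTIES[...] = {
#       Property(UCHAR_ALPHABETIC, "Alpha Alphabetic"),
#       ...
#   };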
def WritePNamesDataHeader(out_path):
  with open(out_path, "w") as out_file:
    out_file.write("""// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/**
 * Copyright (C) 2002-2016, International Business Machines Corporation and
 * others. All Rights Reserved.
 *
 * machine-generated by: icu/tools/unicode/py/preparseucd.py
 */

""")

    # Note: The uchar.h & uscript.h parsers store the ICU Unicode properties
    # and values in the order of their definition,
    # and this function writes them in that order.
    # Since the ICU API constants are stable and new values are only
    # appended at the end
    # (new properties are added at the end of each binary/enum/... range),
    # the output is stable as well.
    # When a property or value constant is renamed,
    # it only changes the name itself in the output;
    # it does not move in the output since there is no sorting.
    # This minimizes diffs and assists with reviewing and evaluating updates.

    version = _ucd_version.split('.')
    while len(version) < 4: version.append("0")
    out_file.write("#define UNICODE_VERSION { %s }\n\n" % ", ".join(version))

    # Count the maximum number of aliases for any property or value.
    # We write the final value at the end.
    max_aliases = max(len(_binary_values["N"]), len(_binary_values["Y"]))

    # Write an array of "binprop" Value object initializers
    # with the value aliases shared among all binary properties.
    out_file.write("static const Value VALUES_binprop[2] = {\n")
    out_file.write('    Value(0, "%s"),\n' % " ".join(_binary_values["N"]))
    out_file.write('    Value(1, "%s"),\n' % " ".join(_binary_values["Y"]))
    out_file.write("};\n\n")

    # For each property with named values, write an array of
    # Value object initializers with the value enum and the aliases.
    for (p_enum, pname, values) in _icu_properties:
      prop = _properties[pname]
      aliases = prop[1]
      if len(aliases) > max_aliases: max_aliases = len(aliases)
      if not values: continue
      out_file.write("static const Value VALUES_%s[%d] = {\n" %
                     (pname, len(values)))
      for (v_enum, vname) in values:
        aliases = _properties[pname][3][vname]
        # ccc, lccc, tccc: Omit the numeric strings from the aliases.
        # (See the comment about ccc in the PropertyValueAliases.txt header.)
        if pname.endswith("ccc"): aliases = aliases[1:]
        if len(aliases) > max_aliases: max_aliases = len(aliases)
        cast = "(int32_t)" if pname == "gcm" else ""
        out_file.write('    Value(%s%s, "%s"),\n' %
                       (cast, v_enum, " ".join(aliases)))
      out_file.write("};\n\n")

    # For each property, write a Property object initializer
    # with the property enum, its aliases, and a reference to its values.
    out_file.write("static const Property PROPERTIES[%d] = {\n" %
                   len(_icu_properties))
    for (enum, pname, values) in _icu_properties:
      prop = _properties[pname]
      aliases = " ".join(prop[1])
      if prop[0] == "Binary":
        out_file.write('    Property(%s, "%s"),\n' % (enum, aliases))
      elif values:  # Property with named values.
        out_file.write('    Property(%s, "%s", VALUES_%s, %d),\n' %
                       (enum, aliases, pname, len(values)))
      else:
        out_file.write('    Property(%s, "%s"),\n' % (enum, aliases))
    out_file.write("};\n\n")

    out_file.write("const int32_t MAX_ALIASES = %d;\n" % max_aliases)

# main() ------------------------------------------------------------------- ***

def main():
  global _null_or_defaults
  only_ppucd = False
  if len(sys.argv) == 3:
    (ucd_root, icu_src_root) = sys.argv[1:3]
    ppucd_path = None
  elif len(sys.argv) == 4 and sys.argv[2] == "--only_ppucd":
    # For debugging:
    # preparseucd.py  path/to/UCD/root  --only_ppucd  path/to/ppucd/outputfile
    ucd_root = sys.argv[1]
    ppucd_path = sys.argv[3]
    only_ppucd = True
    icu_src_root = "/tmp/ppucd"
  else:
    print("Usage: %s  path/to/UCD/root  path/to/ICU/src/root" % sys.argv[0])
    return
  icu4c_src_root = os.path.join(icu_src_root, "icu4c")
  icu_tools_root = os.path.join(icu_src_root, "tools")
  source_files = []
  for root, dirs, files in os.walk(ucd_root):
    for file in files:
      source_files.append(os.path.join(root, file))
  PreprocessFiles(source_files, icu4c_src_root)
  # Parse the preprocessed files in a particular order.
  for files in _files_to_parse:
    for (basename, path, parser) in files:
      print("Parsing %s" % basename)
      value = _files[basename]
      # Unicode data files are in UTF-8.
      charset = "UTF-8"
      if basename == "NamesList.txt":
        # NamesList.txt was in Latin-1 before Unicode 6.2.
        numeric_ucd_version = [int(field) for field in _ucd_version.split('.')]
        if numeric_ucd_version < [6, 2]: charset = "ISO-8859-1"
      with codecs.open(path, "r", charset) as in_file:
        parser(in_file)
  _null_or_defaults = _null_values.copy()
  _null_or_defaults.update(_defaults)
  # Every Catalog and Enumerated property must have a default value,
  # from a @missing line. "nv" = "null value".
  pnv = [pname for (pname, nv) in _null_or_defaults.items() if nv == "??"]
  if pnv:
    raise Exception("no default values (@missing lines) for " +
                    "some Catalog or Enumerated properties: %s" % pnv)
  unidata_path = os.path.join(icu4c_src_root, "source", "data", "unidata")
  if not only_ppucd:
    # Write Normalizer2 input text files.
    # Do this before compacting the data so that we need not handle fallbacks.
    norm2_path = os.path.join(unidata_path, "norm2")
    if not os.path.exists(norm2_path): os.makedirs(norm2_path)
    WriteNorm2(norm2_path)
  # Optimize block vs. cp properties.
  CompactBlocks()
  # Write the ppucd.txt output file.
  # Use US-ASCII so that ICU tests can parse it in the platform charset,
  # which may be EBCDIC.
  # Fix up non-ASCII data (NamesList.txt headings) to fit.
  if not ppucd_path:
    ppucd_path = os.path.join(unidata_path, "ppucd.txt")
  with codecs.open(ppucd_path, "w", "US-ASCII") as out_file:
    WritePreparsedUCD(out_file)
    out_file.flush()

  # TODO: PrintNameStats()

  if only_ppucd: return

  # ICU data for the property & value names API.
  ParseUCharHeader(icu4c_src_root)
  ParseUScriptHeader(icu4c_src_root)
  CheckPNamesData()
  genprops_path = os.path.join(icu_tools_root, "unicode", "c", "genprops")
  if not os.path.exists(genprops_path): os.makedirs(genprops_path)
  out_path = os.path.join(genprops_path, "pnames_data.h")
  WritePNamesDataHeader(out_path)


if __name__ == "__main__":
  main()