• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/python3 -B
2# -*- coding: utf-8 -*-
3# © 2016 and later: Unicode, Inc. and others.
4# License & terms of use: http://www.unicode.org/copyright.html
5# Copyright (c) 2009-2016 International Business Machines
6# Corporation and others. All Rights Reserved.
7#
8#   file name:  preparseucd.py
9#   encoding:   US-ASCII
10#   tab size:   8 (not used)
11#   indentation:4
12#
13#   created on: 2011nov03 (forked from ucdcopy.py)
14#   created by: Markus W. Scherer
15#
16# Copies Unicode Character Database (UCD) files from a tree
17# of files downloaded from (for example) ftp://www.unicode.org/Public/6.1.0/
18# to ICU's source/data/unidata/ and source/test/testdata/
19# and modifies some of the files to make them more compact.
20# Parses them and writes unidata/ppucd.txt (PreParsed UCD) with simple syntax.
21#
22# Invoke with two command-line parameters:
23# 1. source folder with UCD & idna files
24# 2. ICU source root folder (ICU 59+ combined trunk with icu4c, icu4j, tools)
25#
26# Sample invocation:
27#   ~/svn.icu/tools/trunk/src/unicode$ py/preparseucd.py ~/uni61/20120118 ~/svn.icu/trunk/src
28
29import array
30import bisect
31import codecs
32import os
33import os.path
34import re
35import shutil
36import sys
37
38# Unicode version ---------------------------------------------------------- ***
39
40_ucd_version = "?"
41
42# ISO 15924 script codes --------------------------------------------------- ***
43
44# Script codes from ISO 15924 http://www.unicode.org/iso15924/codechanges.html
45# that are not yet in the UCD.
46_scripts_only_in_iso15924 = (
47    "Afak", "Blis", "Cirt", "Cyrs",
48    "Egyd", "Egyh", "Geok",
49    "Hanb", "Hans", "Hant",
50    "Inds", "Jamo", "Jpan", "Jurc", "Kore", "Kpel", "Latf", "Latg", "Loma",
51    "Maya", "Moon", "Nkgb", "Phlv", "Roro",
52    "Sara", "Syre", "Syrj", "Syrn",
53    "Teng", "Visp", "Wole", "Zmth", "Zsye", "Zsym", "Zxxx"
54)
55
56# Properties --------------------------------------------------------------- ***
57
58# Properties that we do not want to store in ppucd.txt.
59# Not a frozenset so that we can add aliases for simpler subsequent testing.
60_ignored_properties = set((
61  # Other_Xyz only contribute to Xyz, store only the latter.
62  "OAlpha",
63  "ODI",
64  "OGr_Ext",
65  "OIDC",
66  "OIDS",
67  "OLower",
68  "OMath",
69  "OUpper",
70  # Further properties that just contribute to others.
71  "CE",  # Composition_Exclusion just contributes to Full_Composition_Exclusion.
72  "JSN",
73  # These properties just don't seem useful.
74  # They are deprecated since Unicode 6.0.
75  "XO_NFC",
76  "XO_NFD",
77  "XO_NFKC",
78  "XO_NFKD",
79  # ICU does not use Unihan properties.
80  "cjkAccountingNumeric",
81  "cjkOtherNumeric",
82  "cjkPrimaryNumeric",
83  "cjkCompatibilityVariant",
84  "cjkIICore",
85  "cjkIRG_GSource",
86  "cjkIRG_HSource",
87  "cjkIRG_JSource",
88  "cjkIRG_KPSource",
89  "cjkIRG_KSource",
90  "cjkIRG_MSource",
91  "cjkIRG_SSource",
92  "cjkIRG_TSource",
93  "cjkIRG_UKSource",
94  "cjkIRG_USource",
95  "cjkIRG_VSource",
96  "cjkRSUnicode"
97))
98
99# These properties (short names) map code points to
100# strings or other unusual values (property types String or Miscellaneous)
101# that cannot be block-compressed (or would be confusing).
102_uncompressible_props = frozenset((
103  "bmg", "bpb", "cf", "Conditional_Case_Mappings", "dm", "FC_NFKC",
104  "isc", "lc", "na", "na1", "Name_Alias", "NFKC_CF",
105  # scx is block-compressible.
106  "scf", "slc", "stc", "suc", "tc", "Turkic_Case_Folding", "uc"
107))
108
109# Dictionary of properties.
110# Keyed by normalized property names and aliases.
111# Each value is a tuple with
112# 0: Type of property (binary, enum, ...)
113# 1: List of aliases; short & long name followed by other aliases.
114#    The short name is "" if it is listed as "n/a" in PropertyValueAliases.txt.
115# 2: Set of short property value names.
116# 3: Dictionary of property values.
117#    For Catalog & Enumerated properties,
118#    maps each value name to a list of aliases.
119#    Empty for other types of properties.
120_properties = {}
121
122# Dictionary of binary-property values which we store as False/True.
123# Same as the values dictionary of one of the binary properties.
124_binary_values = {}
125
126# Dictionary of null values.
127# Keyed by short property names.
128# These are type-specific values for properties that occur in the data.
129# They are overridden by _defaults, block and code point properties.
130_null_values = {}
131
132# Property value names for null values.
133# We do not store these in _defaults.
134_null_names = frozenset(("<none>", "NaN"))
135
136# Dictionary of explicit default property values.
137# Keyed by short property names.
138_defaults = {"gc": "Cn"}
139
140# _null_values overridden by explicit _defaults.
141# Initialized after parsing is done.
142_null_or_defaults = {}
143
144# List of properties with an ICU UProperty enum.
145# Each item is an (enum, pname, values) tuple.
146# - enum: the ICU enum UProperty constant string
147# - pname: the UCD short property name
148# - values: list of (enum, vname) pairs per property value
149#   - enum: the ICU property value's enum constant string
150#   - vname: the UCD short property value name
151_icu_properties = []
152
153# Dictionary of short property names mapped to _icu_properties items.
154_pname_to_icu_prop = {}
155
156_non_alnum_re = re.compile("[^a-zA-Z0-9]")
157
158def NormPropName(pname):
159  """Returns a normalized form of pname.
160  Removes non-ASCII-alphanumeric characters and lowercases letters."""
161  return _non_alnum_re.sub("", pname).lower()
162
163
164def GetProperty(pname):
165  """Returns the _properties value for the pname.
166  Returns null if the property is ignored.
167  Caches alternate spellings of the property name."""
168  # Try the input name.
169  prop = _properties.get(pname)
170  if prop != None: return prop
171  if pname in _ignored_properties: return None
172  # Try the normalized input name.
173  norm_name = NormPropName(pname)
174  prop = _properties.get(norm_name)
175  if prop != None:
176    _properties[pname] = prop  # Cache prop under this new name spelling.
177    return prop
178  elif pname in _ignored_properties:
179    _ignored_properties.add(pname)  # Remember to ignore this new name spelling.
180    return None
181  else:
182    raise NameError("unknown property %s\n" % pname)
183
184
185def GetShortPropertyName(pname):
186  if pname in _null_values: return pname  # pname is already the short name.
187  prop = GetProperty(pname)
188  if not prop: return ""  # For ignored properties.
189  return prop[1][0] or prop[1][1]  # Long name if no short name.
190
191
192def GetShortPropertyValueName(prop, vname):
193  if vname in prop[2]: return vname
194  values = prop[3]
195  aliases = values.get(vname)
196  if aliases == None:
197    norm_name = NormPropName(vname)
198    aliases = values.get(norm_name)
199    if aliases == None:
200      raise NameError("unknown value name %s for property %s\n" %
201                      (vname, prop[1][0]))
202    values[vname] = aliases
203  return aliases[0] or aliases[1]  # Long name if no short name.
204
205
206def NormalizePropertyValue(prop, vname):
207  if prop[2]:  # Binary/Catalog/Enumerated property.
208    value = GetShortPropertyValueName(prop, vname)
209    if prop[0] == "Binary":
210      value = value == "Y"
211    if prop[1][0].endswith("ccc"):
212      value = int(value)
213  else:
214    value = vname
215  return value
216
217# Character data ----------------------------------------------------------- ***
218
219# Lists of NamesList h1 and h2 headings.
220# Each h1 value is a (start, end, comment) tuple.
221# Each h2 value is a (cp, comment) tuple.
222_h1 = []
223_h2 = []
224
225# List of Unicode blocks.
226# Each item is a tuple of start & end code point integers
227# and a dictionary of default property values.
228_blocks = []
229
230# List of ranges with algorithmic names.
231# Each value is a list of [start, end, type, prefix]
232# where prefix is optional.
233_alg_names_ranges = []
234
235# List of Unicode character ranges and their properties,
236# stored as an inversion map with range_start & props dictionary.
237# Starts with one range for all of Unicode without any properties.
238# Setting values subdivides ranges.
239_starts = array.array('l', [0, 0x110000])  # array of int32_t
240_props = [{}, {}]  # props for 0 and 110000
241
242def FindRange(x):
243  """ Binary search for x in the inversion map.
244  Returns the smallest i where x < _starts[i]"""
245  return bisect.bisect(_starts, x) - 1
246
247
248def GetProps(c):
249  i = FindRange(c)
250  return _props[i]
251
252
253def UpdateProps(start, end, update):
254  assert 0 <= start <= end <= 0x10ffff
255  (need_to_update, do_update, u) = (update[0], update[1], update[2])
256  # Find the index i of the range in _starts that contains start.
257  i = FindRange(start)
258  limit = end + 1
259  # Intersect [start, limit[ with ranges in _starts.
260  c_start = _starts[i]
261  c_limit = _starts[i + 1]
262  c_props = _props[i]
263  # c_start <= start < c_limit
264  if c_start < start:
265    update_limit = c_limit if c_limit <= limit else limit
266    if need_to_update(u, start, update_limit - 1, c_props):
267      # Split off [c_start, start[ with a copy of c_props.
268      i += 1
269      c_props = c_props.copy()
270      _starts.insert(i, start)
271      _props.insert(i, c_props)
272      c_start = start
273  # Modify all ranges that are fully inside [start, limit[.
274  while c_limit <= limit:
275    # start <= c_start < c_limit <= limit
276    if need_to_update(u, c_start, c_limit - 1, c_props):
277      do_update(u, c_start, c_limit - 1, c_props)
278    if c_limit == 0x110000: return
279    i += 1
280    c_start = c_limit
281    c_limit = _starts[i + 1]
282    c_props = _props[i]
283  if c_start < limit and need_to_update(u, c_start, limit - 1, c_props):
284    # Split off [limit, c_limit[ with a copy of c_props.
285    _starts.insert(i + 1, limit)
286    _props.insert(i + 1, c_props.copy())
287    # Modify [c_start, limit[ c_props.
288    do_update(u, c_start, limit - 1, c_props)
289
290
291def NeedToSetProps(props, start, end, c_props):
292  """Returns True if props is not a sub-dict of c_props."""
293  for (pname, value) in props.items():
294    if pname not in c_props or value != c_props[pname]: return True
295  return False
296
297
298def DoSetProps(props, start, end, c_props):
299  c_props.update(props)
300
301
302def SetProps(start, end, props):
303  UpdateProps(start, end, (NeedToSetProps, DoSetProps, props))
304
305
306def NeedToSetAlways(nv, start, end, c_props):
307  return True
308
309
310# For restoring boundaries after merging adjacent same-props ranges.
311def AddBoundary(x):
312  """Ensure that there is a range start/limit at x."""
313  assert 0 <= x <= 0x10ffff
314  i = FindRange(x)
315  if _starts[i] == x: return
316  # Split the range at x.
317  c_start = _starts[i]
318  c_limit = _starts[i + 1]
319  c_props = _props[i]
320  # c_start < x < c_limit
321  i += 1
322  _starts.insert(i, x)
323  _props.insert(i, c_props.copy())
324
325
326def SetDefaultValue(pname, value):
327  """Sets the property's default value. Ignores null values."""
328  prop = GetProperty(pname)
329  if prop and value not in _null_names:
330    value = NormalizePropertyValue(prop, value)
331    if value != _null_values[prop[1][0]]:
332      _defaults[prop[1][0]] = value
333      SetProps(0, 0x10ffff, {prop[1][0]: value})
334
335
336def SetBinaryPropertyToTrue(pname, start, end):
337  prop = GetProperty(pname)
338  if prop:
339    assert prop[0] == "Binary"
340    SetProps(start, end, {prop[1][0]: True})
341
342
343def SetPropValue(prop, vname, start, end):
344  value = NormalizePropertyValue(prop, vname)
345  SetProps(start, end, {prop[1][0]: value})
346
347
348def SetPropertyValue(pname, vname, start, end):
349  prop = GetProperty(pname)
350  if prop: SetPropValue(prop, vname, start, end)
351
352# Parsing ------------------------------------------------------------------ ***
353
354_stripped_cp_re = re.compile("([0-9a-fA-F]+)$")
355_stripped_range_re = re.compile("([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)$")
356# Default value for all of Unicode.
357_missing_re = re.compile("# *@missing: *0000\.\.10FFFF *; *(.+)$")
358# Default value for some range.
359_missing2_re = re.compile("# *@missing: *(.+)$")
360
361def ReadUCDLines(in_file, want_ranges=True, want_other=False,
362                 want_comments=False, want_missing=False):
363  """Parses lines from a semicolon-delimited UCD text file.
364  Strips comments, ignores empty and all-comment lines.
365  Returns a tuple (type, line, ...).
366  """
367  for line in in_file:
368    line = line.strip()
369    if not line: continue
370    if line.startswith("#"):  # whole-line comment
371      parse_data = False
372      if want_missing:
373        match = _missing_re.match(line)
374        if match:
375          fields = match.group(1).split(";")
376          for i in range(len(fields)): fields[i] = fields[i].strip()
377          yield ("missing", line, fields)
378          continue
379        match = _missing2_re.match(line)
380        if match:
381          # Strip the "missing" comment prefix and fall through to
382          # parse the remainder of the line like regular data.
383          parse_data = True
384          line = match.group(1)
385      if not parse_data:
386        if want_comments: yield ("comment", line)
387        continue
388    comment_start = line.find("#")  # inline comment
389    if comment_start >= 0:
390      line = line[:comment_start].rstrip()
391      if not line: continue
392    fields = line.split(";")
393    for i in range(len(fields)): fields[i] = fields[i].strip()
394    if want_ranges:
395      first = fields[0]
396      match = _stripped_range_re.match(first)
397      if match:
398        start = int(match.group(1), 16)
399        end = int(match.group(2), 16)
400        yield ("range", line, start, end, fields)
401        continue
402      match = _stripped_cp_re.match(first)
403      if match:
404        c = int(match.group(1), 16)
405        yield ("range", line, c, c, fields)
406        continue
407    if want_other:
408      yield ("other", line, fields)
409    else:
410      raise SyntaxError("unable to parse line\n  %s\n" % line)
411
412
413def AddBinaryProperty(short_name, long_name):
414  _null_values[short_name] = False
415  bin_prop = _properties["Math"]
416  prop = ("Binary", [short_name, long_name], bin_prop[2], bin_prop[3])
417  _properties[short_name] = prop
418  _properties[long_name] = prop
419  _properties[NormPropName(short_name)] = prop
420  _properties[NormPropName(long_name)] = prop
421
422
423def AddSingleNameBinaryProperty(name):
424  # For some properties, the short name is the same as the long name.
425  _null_values[name] = False
426  bin_prop = _properties["Math"]
427  prop = ("Binary", [name, name], bin_prop[2], bin_prop[3])
428  _properties[name] = prop
429  _properties[NormPropName(name)] = prop
430
431
432def AddPOSIXBinaryProperty(name):
433  # We only define a long name for ICU-specific (non-UCD) POSIX properties.
434  _null_values[name] = False
435  bin_prop = _properties["Math"]
436  prop = ("Binary", ["", name], bin_prop[2], bin_prop[3])
437  _properties[name] = prop
438  _properties[NormPropName(name)] = prop
439  # This is to match UProperty UCHAR_POSIX_ALNUM etc.
440  _properties["posix" + NormPropName(name)] = prop
441
442
443# Match a comment line like
444# PropertyAliases-6.1.0.txt
445# and extract the Unicode version.
446_ucd_version_re = re.compile("# *PropertyAliases" +
447                             "-([0-9]+(?:\\.[0-9]+)*)(?:d[0-9]+)?" +
448                             "\\.txt")
449
450def ParsePropertyAliases(in_file):
451  global _ucd_version
452  prop_type_nulls = {
453    "Binary": False,
454    "Catalog": "??",  # Must be specified, e.g., in @missing line.
455    "Enumerated": "??",  # Must be specified.
456    "Numeric": "NaN",
457    "String": "",
458    "Miscellaneous": ""
459  }
460  for data in ReadUCDLines(in_file, want_ranges=False,
461                           want_other=True, want_comments=True):
462    if data[0] == "comment":
463      line = data[1]
464      match = _ucd_version_re.match(line)
465      if match:
466        _ucd_version = match.group(1)
467      else:
468        words = line[1:].lstrip().split()
469        if len(words) == 2 and words[1] == "Properties":
470          prop_type = words[0]
471          null_value = prop_type_nulls[prop_type]
472    else:
473      # type == "other"
474      aliases = data[2]
475      name = aliases[0]
476      if name in _ignored_properties:
477        for alias in aliases:
478          _ignored_properties.add(alias)
479          _ignored_properties.add(NormPropName(alias))
480      else:
481        if name.endswith("ccc"):
482          _null_values[name] = 0
483        else:
484          _null_values[name] = null_value
485        prop = (prop_type, aliases, set(), {})
486        for alias in aliases:
487          _properties[alias] = prop
488          _properties[NormPropName(alias)] = prop
489  # Add provisional and ICU-specific properties we need.
490  # We add some in support of runtime API, even if we do not write
491  # data for them to ppucd.txt (e.g., lccc & tccc).
492  # We add others just to represent UCD data that contributes to
493  # some functionality, although Unicode has not "blessed" them
494  # as separate properties (e.g., Turkic_Case_Folding).
495
496  # Turkic_Case_Folding: The 'T' mappings in CaseFolding.txt.
497  name = "Turkic_Case_Folding"
498  _null_values[name] = ""
499  prop = ("String", [name, name], set(), {})
500  _properties[name] = prop
501  _properties[NormPropName(name)] = prop
502  # Conditional_Case_Mappings: SpecialCasing.txt lines with conditions.
503  name = "Conditional_Case_Mappings"
504  _null_values[name] = ""
505  prop = ("Miscellaneous", [name, name], set(), {})
506  _properties[name] = prop
507  _properties[NormPropName(name)] = prop
508  # lccc = ccc of first cp in canonical decomposition.
509  _null_values["lccc"] = 0
510  ccc_prop = list(_properties["ccc"])
511  ccc_prop[1] = ["lccc", "Lead_Canonical_Combining_Class"]
512  prop = tuple(ccc_prop)
513  _properties["lccc"] = prop
514  _properties["Lead_Canonical_Combining_Class"] = prop
515  _properties["leadcanonicalcombiningclass"] = prop
516  # tccc = ccc of last cp in canonical decomposition.
517  _null_values["tccc"] = 0
518  ccc_prop[1] = ["tccc", "Trail_Canonical_Combining_Class"]
519  prop = tuple(ccc_prop)
520  _properties["tccc"] = prop
521  _properties["Trail_Canonical_Combining_Class"] = prop
522  _properties["trailcanonicalcombiningclass"] = prop
523  # Script_Extensions
524  if "scx" not in _properties:
525    _null_values["scx"] = ""
526    prop = ("Miscellaneous", ["scx", "Script_Extensions"], set(), {})
527    _properties["scx"] = prop
528    _properties["Script_Extensions"] = prop
529    _properties["scriptextensions"] = prop
530  # General Category as a bit mask.
531  _null_values["gcm"] = "??"
532  gc_prop = _properties["gc"]
533  prop = ("Bitmask", ["gcm", "General_Category_Mask"], gc_prop[2], gc_prop[3])
534  _properties["gcm"] = prop
535  _properties["General_Category_Mask"] = prop
536  _properties["generalcategorymask"] = prop
537  # Various binary properties.
538  AddBinaryProperty("Sensitive", "Case_Sensitive")
539  AddBinaryProperty("nfdinert", "NFD_Inert")
540  AddBinaryProperty("nfkdinert", "NFKD_Inert")
541  AddBinaryProperty("nfcinert", "NFC_Inert")
542  AddBinaryProperty("nfkcinert", "NFKC_Inert")
543  AddBinaryProperty("segstart", "Segment_Starter")
544  # https://www.unicode.org/reports/tr51/#Emoji_Properties
545  AddBinaryProperty("Emoji", "Emoji")
546  AddBinaryProperty("EPres", "Emoji_Presentation")
547  AddBinaryProperty("EMod", "Emoji_Modifier")
548  AddBinaryProperty("EBase", "Emoji_Modifier_Base")
549  AddBinaryProperty("EComp", "Emoji_Component")
550  AddBinaryProperty("ExtPict", "Extended_Pictographic")
551  # https://www.unicode.org/reports/tr51/#Emoji_Sets
552  AddSingleNameBinaryProperty("Basic_Emoji")
553  AddSingleNameBinaryProperty("Emoji_Keycap_Sequence")
554  AddSingleNameBinaryProperty("RGI_Emoji_Modifier_Sequence")
555  AddSingleNameBinaryProperty("RGI_Emoji_Flag_Sequence")
556  AddSingleNameBinaryProperty("RGI_Emoji_Tag_Sequence")
557  AddSingleNameBinaryProperty("RGI_Emoji_ZWJ_Sequence")
558  AddSingleNameBinaryProperty("RGI_Emoji")
559  # C/POSIX character classes that do not have Unicode property [value] aliases.
560  # See uchar.h.
561  AddPOSIXBinaryProperty("alnum")
562  AddPOSIXBinaryProperty("blank")
563  AddPOSIXBinaryProperty("graph")
564  AddPOSIXBinaryProperty("print")
565  AddPOSIXBinaryProperty("xdigit")
566
567
568def ParsePropertyValueAliases(in_file):
569  global _binary_values
570  for data in ReadUCDLines(in_file, want_ranges=False,
571                           want_other=True, want_missing=True):
572    if data[0] == "missing":
573      SetDefaultValue(data[2][0], data[2][1])
574    else:
575      # type == "other"
576      fields = data[2]
577      pname = fields[0]
578      prop = GetProperty(pname)
579      if prop:
580        del fields[0]  # Only the list of aliases remains.
581        short_name = fields[0]
582        if short_name == "n/a":  # no short name
583          fields[0] = ""
584          short_name = fields[1]
585        prop[2].add(short_name)
586        values = prop[3]
587        for alias in fields:
588          if alias:
589            values[alias] = fields
590            values[NormPropName(alias)] = fields
591        if prop[0] == "Binary" and not _binary_values:
592          _binary_values = values
593  # Some of the @missing lines with non-null default property values
594  # are in files that we do not parse;
595  # either because the data for that property is easily
596  # (i.e., the @missing line would be the only reason to parse such a file)
597  # or because we compute the property at runtime,
598  # such as the Hangul_Syllable_Type.
599  if "dt" not in _defaults:  # DerivedDecompositionType.txt
600    _defaults["dt"] = "None"
601  if "nt" not in _defaults:  # DerivedNumericType.txt
602    _defaults["nt"] = "None"
603  if "hst" not in _defaults:  # HangulSyllableType.txt
604    _defaults["hst"] = "NA"
605  if "gc" not in _defaults:  # No @missing line in any .txt file?
606    _defaults["gc"] = "Cn"
607  # Copy the gc default value to gcm.
608  _defaults["gcm"] = _defaults["gc"]
609  # Add ISO 15924-only script codes.
610  # Only for the ICU script code API, not necessary for parsing the UCD.
611  script_prop = _properties["sc"]
612  short_script_names = script_prop[2]  # set
613  script_values = script_prop[3]  # dict
614  remove_scripts = []
615  for script in _scripts_only_in_iso15924:
616    if script in short_script_names:
617      remove_scripts.append(script)
618    else:
619      short_script_names.add(script)
620      # Do not invent a Unicode long script name before the UCD adds the script.
621      script_list = [script, script]  # [short, long]
622      script_values[script] = script_list
623      # Probably not necessary because
624      # we will not parse these scripts from the UCD:
625      script_values[NormPropName(script)] = script_list
626  if remove_scripts:
627    raise ValueError(
628        "remove %s from _scripts_only_in_iso15924" % remove_scripts)
629
630
631def ParseBlocks(in_file):
632  for data in ReadUCDLines(in_file, want_missing=True):
633    if data[0] == "missing":
634      SetDefaultValue("blk", data[2][0])
635    else:
636      # type == "range"
637      (start, end, name) = (data[2], data[3], data[4][1])
638      _blocks.append((start, end, {"blk": name}))
639      SetPropertyValue("blk", name, start, end)
640  _blocks.sort()
641  # Check for overlapping blocks.
642  prev_end = -1
643  for b in _blocks:
644    start = b[0]
645    end = b[1]
646    if prev_end >= start:
647      raise ValueError(
648          "block %04lX..%04lX %s overlaps with another " +
649          "ending at %04lX\n  %s\n" %
650          (start, end, b[2]["blk"], prev_end))
651    prev_end = end
652
653
654def ParseUnicodeData(in_file):
655  dt_prop = GetProperty("dt")
656  range_first_line = ""
657  range_first = -1
658  for data in ReadUCDLines(in_file, want_missing=True):
659    # type == "range"
660    (line, c, end, fields) = (data[1], data[2], data[3], data[4])
661    assert c == end
662    name = fields[1]
663    if name.startswith("<"):
664      if name.endswith(", First>"):
665        if range_first >= 0:
666          raise SyntaxError(
667              "error: unterminated range started at\n  %s\n" %
668              range_first_line)
669        range_first = c
670        range_first_line = line
671        continue
672      elif name.endswith(", Last>"):
673        if range_first < 0:
674          raise SyntaxError(
675              "error: range end without start at\n  %s\n" %
676              line)
677        elif range_first > c:
678          raise SyntaxError(
679              "error: range start/end out of order at\n  %s\n  %s\n" %
680              (range_first_line, line))
681        first_name = range_first_line.split(";")[1][1:-8]
682        name = name[1:-7]
683        if first_name != name:
684          raise SyntaxError(
685              "error: range start/end name mismatch at\n  %s\n  %s\n" %
686              (range_first_line, line))
687        end = c
688        c = range_first
689        range_first = -1
690        # Remember algorithmic name ranges.
691        if "Ideograph" in name:
692          prefix = "CJK UNIFIED IDEOGRAPH-"
693          if c == 0x17000 or c == 0x18D00: prefix = "TANGUT IDEOGRAPH-"
694          _alg_names_ranges.append([c, end, "han", prefix])
695        elif name == "Hangul Syllable":
696          _alg_names_ranges.append([c, end, "hangul"])
697        name = ""
698      else:
699        # Ignore non-names like <control>.
700        name = ""
701    props = {}
702    if name: props["na"] = name
703    props["gc"] = fields[2]
704    ccc = int(fields[3])
705    if ccc: props["ccc"] = ccc
706    props["bc"] = fields[4]
707    # Decomposition type & mapping.
708    dm = fields[5]
709    if dm:
710      if dm.startswith("<"):
711        dt_limit = dm.index(">")
712        dt = NormalizePropertyValue(dt_prop, dm[1:dt_limit])
713        dm = dm[dt_limit + 1:].lstrip()
714      else:
715        dt = "Can"
716      props["dt"] = dt
717      props["dm"] = dm
718    # Numeric type & value.
719    decimal = fields[6]
720    digit = fields[7]
721    nv = fields[8]
722    if (decimal and decimal != nv) or (digit and digit != nv):
723      raise SyntaxError("error: numeric values differ at\n  %s\n" % line)
724    if nv:
725      # Map improper fractions to proper ones.
726      # U+109F7 MEROITIC CURSIVE FRACTION TWO TWELFTHS
727      # .. U+109FF MEROITIC CURSIVE FRACTION TEN TWELFTHS
728      if nv == "2/12":
729        nv = "1/6"
730      elif nv == "3/12":
731        nv = "1/4"
732      elif nv == "4/12":
733        nv = "1/3"
734      elif nv == "6/12":
735        nv = "1/2"
736      elif nv == "8/12":
737        nv = "2/3"
738      elif nv == "9/12":
739        nv = "3/4"
740      elif nv == "10/12":
741        nv = "5/6"
742      props["nv"] = nv
743      props["nt"] = "De" if decimal else "Di" if digit else "Nu"
744    if fields[9] == "Y": props["Bidi_M"] = True
745    # ICU 49 and above does not support Unicode_1_Name any more.
746    # See ticket #9013.
747    # na1 = fields[10]
748    # if na1: props["na1"] = na1
749    # ISO_Comment is deprecated and has no values.
750    # isc = fields[11]
751    # if isc: props["isc"] = isc
752    # Simple case mappings.
753    suc = fields[12]
754    slc = fields[13]
755    stc = fields[14]
756    if suc: props["suc"] = suc
757    if slc: props["slc"] = slc
758    if stc: props["stc"] = stc
759    SetProps(c, end, props)
760  if range_first >= 0:
761    raise SyntaxError(
762        "error: unterminated range started at\n  %s\n" %
763        range_first_line)
764  # Hangul syllables have canonical decompositions which are not listed in UnicodeData.txt.
765  SetPropertyValue("dt", "Can", 0xac00, 0xd7a3)
766  _alg_names_ranges.sort()
767
768
769_names_h1_re = re.compile("@@\t([0-9a-fA-F]+)\t(.+?)\t([0-9a-fA-F]+)$")
770_names_h2_re = re.compile("@\t\t(.+)")
771_names_char_re = re.compile("([0-9a-fA-F]+)\t.+")
772
773def ParseNamesList(in_file):
774  pending_h2 = ""
775  for line in in_file:
776    line = line.strip()
777    if not line: continue
778    match = _names_h1_re.match(line)
779    if match:
780      pending_h2 = ""  # Drop a pending h2 when we get to an h1.
781      start = int(match.group(1), 16)
782      end = int(match.group(3), 16)
783      comment = match.group(2).replace(u"\xa0", " ")
784      _h1.append((start, end, comment))
785      continue
786    match = _names_h2_re.match(line)
787    if match:
788      pending_h2 = match.group(1).replace(u"\xa0", " ")
789      continue
790    if pending_h2:
791      match = _names_char_re.match(line)
792      if match:
793        c = int(match.group(1), 16)
794        _h2.append((c, pending_h2))
795        pending_h2 = ""
796  _h1.sort()
797  _h2.sort()
798
799
800def ParseNamedProperties(in_file):
801  """Parses a .txt file where the first column is a code point range
802  and the second column is a property name.
803  Sets binary properties to True,
804  and other properties to the values in the third column."""
805  for data in ReadUCDLines(in_file, want_missing=True):
806    if data[0] == "missing":
807      SetDefaultValue(data[2][0], data[2][1])
808    else:
809      # type == "range"
810      if len(data[4]) == 2:
811        SetBinaryPropertyToTrue(data[4][1], data[2], data[3])
812      else:
813        SetPropertyValue(data[4][1], data[4][2], data[2], data[3])
814
815
816def ParseOneProperty(in_file, pname):
817  """Parses a .txt file where the first column is a code point range
818  and the second column is the value of a known property."""
819  prop = GetProperty(pname)
820  for data in ReadUCDLines(in_file, want_missing=True):
821    if data[0] == "missing":
822      SetDefaultValue(pname, data[2][0])
823    else:
824      # type == "range"
825      SetPropValue(prop, data[4][1], data[2], data[3])
826
827
828def ParseBidiMirroring(in_file): ParseOneProperty(in_file, "bmg")
829def ParseDerivedAge(in_file): ParseOneProperty(in_file, "age")
830def ParseDerivedBidiClass(in_file): ParseOneProperty(in_file, "bc")
831def ParseDerivedJoiningGroup(in_file): ParseOneProperty(in_file, "jg")
832def ParseDerivedJoiningType(in_file): ParseOneProperty(in_file, "jt")
833def ParseEastAsianWidth(in_file): ParseOneProperty(in_file, "ea")
834def ParseGraphemeBreakProperty(in_file): ParseOneProperty(in_file, "GCB")
835def ParseIndicPositionalCategory(in_file): ParseOneProperty(in_file, "InPC")
836def ParseIndicSyllabicCategory(in_file): ParseOneProperty(in_file, "InSC")
837def ParseLineBreak(in_file): ParseOneProperty(in_file, "lb")
838def ParseScripts(in_file): ParseOneProperty(in_file, "sc")
839def ParseScriptExtensions(in_file): ParseOneProperty(in_file, "scx")
840def ParseSentenceBreak(in_file): ParseOneProperty(in_file, "SB")
841def ParseVerticalOrientation(in_file): ParseOneProperty(in_file, "vo")
842def ParseWordBreak(in_file): ParseOneProperty(in_file, "WB")
843
844
845def DoSetNameAlias(alias, start, end, c_props):
846  if "Name_Alias" in c_props:
847    c_props["Name_Alias"] += ',' + alias
848  else:
849    c_props["Name_Alias"] = alias
850
851
852def ParseNameAliases(in_file):
853  """Parses Name_Alias from NameAliases.txt.
854  A character can have multiple aliases.
855
856  In Unicode 6.0, there are two columns,
857  with a name correction in the second column.
858
859  In Unicode 6.1, there are three columns.
860  The second contains an alias, the third its type.
861  The documented types are:
862    correction, control, alternate, figment, abbreviation
863
864  This function does not sort the types, assuming they appear in this order."""
865  for data in ReadUCDLines(in_file):
866    start = data[2]
867    end = data[3]
868    if start != end:
869      raise ValueError("NameAliases.txt has an alias for a range %04lX..%04lX" %
870                       (start, end))
871    fields = data[4]
872    if len(fields) == 2:
873      alias = "correction=" + fields[1]
874    else:
875      alias = fields[2] + '=' + fields[1]
876    update = (NeedToSetAlways, DoSetNameAlias, alias)
877    UpdateProps(start, end, update)
878
879
880def NeedToSetNumericValue(nv, start, end, c_props):
881  c_nv = c_props.get("nv")
882  if c_nv == None:
883    # DerivedNumericValues.txt adds a Numeric_Value.
884    assert "nt" not in c_props
885    return True
886  if nv != c_nv:
887    raise ValueError(("UnicodeData.txt has nv=%s for %04lX..%04lX " +
888                     "but DerivedNumericValues.txt has nv=%s") %
889                     (c_nv, start, end, nv))
890  return False
891
892
893def DoSetNumericValue(nv, start, end, c_props):
894  c_props.update({"nt": "Nu", "nv": nv})
895
896
897def ParseDerivedNumericValues(in_file):
898  """Parses DerivedNumericValues.txt.
899  For most characters, the numeric type & value were parsed previously
900  from UnicodeData.txt but that does not show the values for Han characters.
901  Here we check that values match those from UnicodeData.txt
902  and add new ones."""
903  # Ignore the @missing line which has an incorrect number of fields,
904  # and the "NaN" in the wrong field (at least in Unicode 5.1..6.1).
905  # Also, "NaN" is just the Numeric null value anyway.
906  for data in ReadUCDLines(in_file):
907    # Conditional update to the numeric value in the 4th field.
908    update = (NeedToSetNumericValue, DoSetNumericValue, data[4][3])
909    UpdateProps(data[2], data[3], update)
910
911
912def ParseCaseFolding(in_file):
913  for data in ReadUCDLines(in_file, want_missing=True):
914    if data[0] == "missing":
915      assert data[2][0] == "C"  # common to scf & cf
916      SetDefaultValue("scf", data[2][1])
917      SetDefaultValue("cf", data[2][1])
918    else:
919      # type == "range"
920      start = data[2]
921      end = data[3]
922      status = data[4][1]
923      mapping = data[4][2]
924      assert status in "CSFT"
925      if status == "C":
926        SetProps(start, end, {"scf": mapping, "cf": mapping})
927      elif status == "S":
928        SetPropertyValue("scf", mapping, start, end)
929      elif status == "F":
930        SetPropertyValue("cf", mapping, start, end)
931      else:  # status == "T"
932        SetPropertyValue("Turkic_Case_Folding", mapping, start, end)
933
934
935def DoSetConditionalCaseMappings(ccm, start, end, c_props):
936  if "Conditional_Case_Mappings" in c_props:
937    c_props["Conditional_Case_Mappings"] += ',' + ccm
938  else:
939    c_props["Conditional_Case_Mappings"] = ccm
940
941
942def ParseSpecialCasing(in_file):
943  for data in ReadUCDLines(in_file, want_missing=True):
944    if data[0] == "missing":
945      SetDefaultValue("lc", data[2][0])
946      SetDefaultValue("tc", data[2][1])
947      SetDefaultValue("uc", data[2][2])
948    else:
949      # type == "range"
950      start = data[2]
951      end = data[3]
952      fields = data[4]
953      if len(fields) < 5 or not fields[4]:
954        # Unconditional mappings.
955        SetProps(start, end, {"lc": fields[1], "tc": fields[2], "uc": fields[3]})
956      else:
957        # Conditional_Case_Mappings
958        ccm = (fields[4] + ":lc=" + fields[1] +
959               "&tc=" + fields[2] + "&uc=" + fields[3])
960        update = (NeedToSetAlways, DoSetConditionalCaseMappings, ccm)
961        UpdateProps(start, end, update)
962
963
964def ParseBidiBrackets(in_file):
965  for data in ReadUCDLines(in_file, want_missing=True):
966    if data[0] == "missing":
967      SetDefaultValue("bpt", data[2][1])
968    else:
969      # type == "range"
970      start = data[2]
971      end = data[3]
972      assert start == end
973      mapping = data[4][1]
974      bracket_type = data[4][2]
975      SetProps(start, end, {"bpb": mapping, "bpt": bracket_type})
976
977# Postprocessing ----------------------------------------------------------- ***
978
979def PrintedSize(pname, value):
980  if isinstance(value, bool):
981    if value:
982      return len(pname) + 1  # ";pname"
983    else:
984      return len(pname) + 2  # ";-pname"
985  else:
986    return len(pname) + len(str(value)) + 2  # ";pname=value"
987
988
989def CompactBlock(b, i):
990  assert b[0] == _starts[i]
991  b_props = b[2]  # Normally just blk from Blocks.txt.
992  # b_props["blk"] has not been canonicalized yet.
993  b_props["blk"] = _props[i]["blk"]
994  orig_i = i
995  # Count the number of occurrences of each property's value in this block.
996  # To minimize the output, count the number of assigned ranges,
997  # not the number of code points.
998  num_ranges = 0
999  prop_counters = {}
1000  if "gc" in b_props:
1001    b_is_unassigned = b_props["gc"] == "Cn"  # Unreachable with normal data.
1002  else:
1003    b_is_unassigned = _defaults["gc"] == "Cn"  # This is expected to be true.
1004  while True:
1005    start = _starts[i]
1006    if start > b[1]: break
1007    props = _props[i]
1008    if "gc" in props:
1009      is_unassigned = props["gc"] == "Cn"
1010    else:
1011      is_unassigned = b_is_unassigned
1012    if is_unassigned:
1013      # Compact an unassigned range inside the block and
1014      # mark it to be written with "unassigned".
1015      # It falls back to default properties, not block properties,
1016      # except for the blk=Block property.
1017      assert props["blk"] == b_props["blk"]
1018      del props["blk"]
1019      for pname in list(props.keys()):  # .keys() is a copy so we can del props[pname].
1020        if props[pname] == _null_or_defaults[pname]: del props[pname]
1021      # What remains are unusual default values for unassigned code points.
1022      # For example, bc=R or lb=ID.
1023      # See http://www.unicode.org/reports/tr44/#Default_Values_Table
1024      props["unassigned"] = True
1025    else:
1026      for (pname, value) in props.items():
1027        if pname in prop_counters:
1028          counter = prop_counters[pname]
1029        else:
1030          counter = {_null_or_defaults[pname]: num_ranges}
1031          prop_counters[pname] = counter
1032        if value in counter:
1033          counter[value] += 1
1034        else:
1035          counter[value] = 1
1036      # Also count default values for properties that do not occur in a range.
1037      for pname in prop_counters:
1038        if pname not in props:
1039          counter = prop_counters[pname]
1040          value = _null_or_defaults[pname]
1041          counter[value] += 1
1042      num_ranges += 1
1043      # Invariant: For each counter, the sum of counts must equal num_ranges.
1044    i += 1
1045  # For each property that occurs within this block,
1046  # set the value that reduces the file size the most as a block property value.
1047  # This is usually the most common value.
1048  for (pname, counter) in prop_counters.items():
1049    default_value = _null_or_defaults[pname]
1050    default_size = PrintedSize(pname, default_value) * counter[default_value]
1051    max_value = None
1052    max_count = 0
1053    max_savings = 0
1054    for (value, count) in counter.items():
1055      if value != default_value and count > 1:
1056        # Does the file get smaller by setting the block default?
1057        # We save writing the block value as often as it occurs,
1058        # minus once for writing it for the block,
1059        # minus writing the default value instead.
1060        savings = PrintedSize(pname, value) * (count - 1) - default_size
1061        # For two values with the same savings, pick the one that compares lower,
1062        # to make this deterministic (avoid flip-flopping).
1063        if (savings > max_savings or
1064            (savings > 0 and savings == max_savings and value < max_value)):
1065          max_value = value
1066          max_count = count
1067          max_savings = savings
1068    # Do not compress uncompressible properties,
1069    # with an exception for many empty-string values in a block
1070    # (NFKC_CF='' for tags and variation selectors).
1071    if (max_savings > 0 and
1072        ((pname not in _uncompressible_props) or
1073          (max_value == '' and max_count >= 12))):
1074      b_props[pname] = max_value
1075  # For each range and property, remove the default+block value
1076  # but set the default value if that property was not set
1077  # (i.e., it used to inherit the default value).
1078  b_defaults = _null_or_defaults.copy()
1079  b_defaults.update(b_props)
1080  i = orig_i
1081  while True:
1082    start = _starts[i]
1083    if start > b[1]: break
1084    props = _props[i]
1085    if "unassigned" not in props:
1086      # Compact an assigned range inside the block.
1087      for pname in prop_counters:
1088        if pname in props:
1089          if props[pname] == b_defaults[pname]: del props[pname]
1090        elif pname in b_props:
1091          # b_props only has non-default values.
1092          # Set the default value if it used to be inherited.
1093          props[pname] = _null_or_defaults[pname]
1094      # If there is only one assigned range, then move all of its properties
1095      # to the block.
1096      if num_ranges == 1:
1097        b_props.update(props)
1098        props.clear()
1099    i += 1
1100  # Return the _starts index of the first range after this block.
1101  return i
1102
1103
1104def CompactNonBlock(limit, i):
1105  """Remove default property values from between-block ranges."""
1106  default_is_unassigned = _defaults["gc"] == "Cn"  # This is expected to be true.
1107  while True:
1108    start = _starts[i]
1109    if start >= limit: break
1110    props = _props[i]
1111    if "gc" in props:
1112      is_unassigned = props["gc"] == "Cn"
1113    else:
1114      is_unassigned = default_is_unassigned
1115    for pname in list(props.keys()):  # .keys() is a copy so we can del props[pname].
1116      if props[pname] == _null_or_defaults[pname]: del props[pname]
1117    assert "blk" not in props
1118    # If there are no props left, then nothing will be printed.
1119    # Otherwise, add "unassigned" for more obvious output.
1120    if props and is_unassigned:
1121      props["unassigned"] = True
1122    i += 1
1123  # Return the _starts index of the first range after this block.
1124  return i
1125
1126
1127def CompactBlocks():
1128  """Optimizes block properties.
1129  Sets properties on blocks to the most commonly used values,
1130  and removes default+block values from code point properties."""
1131  # Ensure that there is a boundary in _starts for each block
1132  # so that the simple mixing method below works.
1133  for b in _blocks:
1134    AddBoundary(b[0])
1135    limit = b[1] + 1
1136    if limit <= 0x10ffff: AddBoundary(limit)
1137  # Walk through ranges and blocks together.
1138  i = 0
1139  for b in _blocks:
1140    b_start = b[0]
1141    if _starts[i] < b_start:
1142      i = CompactNonBlock(b_start, i)
1143    i = CompactBlock(b, i)
1144  CompactNonBlock(0x110000, i)
1145
1146# Output ------------------------------------------------------------------- ***
1147
1148def AppendRange(fields, start, end):
1149  if start == end:
1150    fields.append("%04lX" % start)
1151  else:
1152    fields.append("%04lX..%04lX" % (start, end))
1153
1154
1155def AppendProps(fields, props):
1156  # Sort property names (props keys) by their normalized forms
1157  # and output properties in that order.
1158  for pname in sorted(props, key=NormPropName):
1159    value = props[pname]
1160    if isinstance(value, bool):
1161      if not value: pname = "-" + pname
1162      fields.append(pname)
1163    else:
1164      fields.append("%s=%s" % (pname, value))
1165
1166
1167def WriteFieldsRangeProps(fields, start, end, props, out_file):
1168  AppendRange(fields, start, end)
1169  AppendProps(fields, props)
1170  out_file.write(";".join(fields))
1171  out_file.write("\n")
1172
1173
1174def EscapeNonASCII(s):
1175  i = 0
1176  while i < len(s):
1177    c = ord(s[i])
1178    if c <= 0x7f:
1179      i = i + 1
1180    else:
1181      if c <= 0xffff:
1182        esc = u"\\u%04X" % c
1183      else:
1184        esc = u"\\U%08X" % c
1185      s = s[:i] + esc + s[i+1:]
1186      i = i + len(esc)
1187  return s
1188
1189
1190def WritePreparsedUCD(out_file):
1191  out_file.write("""# Preparsed UCD generated by ICU preparseucd.py
1192# Copyright (C) 1991 and later: Unicode, Inc. and others.
1193# License & terms of use: http://www.unicode.org/copyright.html
1194""");
1195  out_file.write("ucd;%s\n\n" % _ucd_version)
1196  # Sort property names (props keys) by their normalized forms
1197  # and output properties in that order.
1198  pnames = sorted(_null_values, key=NormPropName)
1199  for pname in pnames:
1200    prop = _properties[pname]
1201    out_file.write(";".join(["property", prop[0]] + prop[1]))
1202    out_file.write("\n")
1203  out_file.write("\n")
1204  out_file.write(";".join(["binary"] + _binary_values["N"]))
1205  out_file.write("\n")
1206  out_file.write(";".join(["binary"] + _binary_values["Y"]))
1207  out_file.write("\n")
1208  for pname in pnames:
1209    prop = _properties[pname]
1210    short_names = prop[2]
1211    if short_names and prop[0] != "Binary":
1212      for name in sorted(short_names):
1213        out_file.write(";".join(["value", prop[1][0]] + prop[3][name]))
1214        out_file.write("\n")
1215  out_file.write("\n")
1216  # Ensure that there is a boundary in _starts for each
1217  # range of data we mix into the output,
1218  # so that the simple mixing method below works.
1219  for b in _blocks: AddBoundary(b[0])
1220  for r in _alg_names_ranges: AddBoundary(r[0])
1221  for h in _h1: AddBoundary(h[0])
1222  for h in _h2: AddBoundary(h[0])
1223  # Write the preparsed data. ppucd.txt = preparsed UCD
1224  # Syntax: http://site.icu-project.org/design/props/ppucd
1225  WriteFieldsRangeProps(["defaults"], 0, 0x10ffff, _defaults, out_file)
1226  i_blocks = 0
1227  i_alg = 0
1228  i_h1 = 0
1229  i_h2 = 0
1230  b_end = -1
1231  for i in range(len(_starts) - 1):
1232    start = _starts[i]
1233    end = _starts[i + 1] - 1
1234    # Block with default properties.
1235    if i_blocks < len(_blocks) and start == _blocks[i_blocks][0]:
1236      b = _blocks[i_blocks]
1237      b_end = b[1]
1238      WriteFieldsRangeProps(["\nblock"], b[0], b_end, b[2], out_file)
1239      i_blocks += 1
1240    # NamesList h1 heading (for [most of] a block).
1241    if i_h1 < len(_h1) and start == _h1[i_h1][0]:
1242      h = _h1[i_h1]
1243      out_file.write("# %04lX..%04lX %s\n" % (h[0], h[1], EscapeNonASCII(h[2])))
1244      i_h1 += 1
1245    # Algorithmic-names range.
1246    if i_alg < len(_alg_names_ranges) and start == _alg_names_ranges[i_alg][0]:
1247      r = _alg_names_ranges[i_alg]
1248      fields = ["algnamesrange"]
1249      AppendRange(fields, r[0], r[1])
1250      fields.extend(r[2:])
1251      out_file.write(";".join(fields))
1252      out_file.write("\n")
1253      i_alg += 1
1254    # NamesList h2 heading.
1255    if i_h2 < len(_h2) and start == _h2[i_h2][0]:
1256      out_file.write("# %s\n" % EscapeNonASCII(_h2[i_h2][1]))
1257      i_h2 += 1
1258    # Code point/range data.
1259    props = _props[i]
1260    # Omit ranges with only default+block properties.
1261    if props:
1262      if start > b_end and b_end >= 0:
1263        # First range with values after the last block.
1264        # Separate it visually from the block lines.
1265        out_file.write("\n# No block\n")
1266        b_end = -1
1267      if "unassigned" in props:
1268        # Do not output "unassigned" as a property.
1269        del props["unassigned"]
1270        line_type = "unassigned"
1271      else:
1272        line_type = "cp"
1273      WriteFieldsRangeProps([line_type], start, end, props, out_file)
1274
1275# Write Normalizer2 input files -------------------------------------------- ***
1276# Ported from gennorm/store.c.
1277
1278def WriteAllCC(out_file):
1279  out_file.write("# Canonical_Combining_Class (ccc) values\n");
1280  prev_start = 0
1281  prev_cc = 0
1282  for i in range(len(_starts)):
1283    start = _starts[i]
1284    props = _props[i]
1285    cc = props.get("ccc")
1286    if not cc: cc = 0
1287    if prev_cc != cc:
1288      if prev_cc != 0:
1289        last_code_point = start - 1
1290        if prev_start == last_code_point:
1291          out_file.write("%04X:%d\n" % (last_code_point, prev_cc))
1292        else:
1293          out_file.write("%04X..%04X:%d\n" %
1294                         (prev_start, last_code_point, prev_cc))
1295      prev_start = start
1296      prev_cc = cc
1297
1298
1299def HasMapping(c):
1300  props = GetProps(c)
1301  dt = props.get("dt")
1302  return dt and dt != "None"
1303
1304
1305def HasOneWayMapping(c):
1306  while True:
1307    props = GetProps(c)
1308    dt = props.get("dt")
1309    if not dt or dt == "None":
1310      return False  # no mapping
1311    elif dt == "Can":
1312      # The canonical decomposition is a one-way mapping if
1313      # - it does not map to exactly two code points
1314      # - c has ccc!=0
1315      # - c has the Composition_Exclusion property
1316      # - its starter has a one-way mapping (loop for this)
1317      # - its non-starter decomposes
1318      nfd = props["dm"].split()
1319      if (len(nfd) != 2 or
1320          props.get("ccc") or
1321          props.get("Comp_Ex") or
1322          HasMapping(int(nfd[1], 16))):
1323        return True
1324      c = int(nfd[0], 16)  # continue
1325    else:
1326      # c has a compatibility mapping.
1327      return True
1328
1329
1330_data_file_copyright = """# Copyright (C) 2016 and later: Unicode, Inc. and others.
1331# License & terms of use: http://www.unicode.org/copyright.html
1332# Copyright (C) 1999-2016, International Business Machines
1333# Corporation and others.  All Rights Reserved.
1334#
1335"""
1336
1337def WriteNorm2NFCTextFile(path):
1338  global _data_file_copyright
1339  with open(os.path.join(path, "nfc.txt"), "w") as out_file:
1340    out_file.write(
1341        _data_file_copyright + """# file name: nfc.txt
1342#
1343# machine-generated by ICU preparseucd.py
1344#
1345# Complete data for Unicode NFC normalization.
1346
1347* Unicode """ + _ucd_version + """
1348
1349""")
1350    WriteAllCC(out_file)
1351    out_file.write("\n# Canonical decomposition mappings\n")
1352    for i in range(len(_starts) - 1):
1353      start = _starts[i]
1354      end = _starts[i + 1] - 1
1355      props = _props[i]
1356      dm = props.get("dm")
1357      if dm and dm[0] != '<' and props["dt"] == "Can":
1358        assert start == end
1359        # The Comp_Ex=Full_Composition_Exclusion property tells us
1360        # whether the canonical decomposition round-trips.
1361        separator = '>' if props.get("Comp_Ex") else '='
1362        out_file.write("%04X%s%s\n" % (start, separator, dm))
1363
1364
1365def WriteNorm2NFKCTextFile(path):
1366  global _data_file_copyright
1367  with open(os.path.join(path, "nfkc.txt"), "w") as out_file:
1368    out_file.write(
1369        _data_file_copyright + """# file name: nfkc.txt
1370#
1371# machine-generated by ICU preparseucd.py
1372#
1373# Data for Unicode NFKC normalization.
1374# This file contains only compatibility decomposition mappings,
1375# plus those canonical decompositions that change from NFC round-trip mappings
1376# to NFKC one-way mappings.
1377# Use this file as the second gennorm2 input file after nfc.txt.
1378
1379* Unicode """ + _ucd_version + """
1380
1381""")
1382    for i in range(len(_starts) - 1):
1383      start = _starts[i]
1384      end = _starts[i + 1] - 1
1385      props = _props[i]
1386      dm = props.get("dm")
1387      if dm and dm[0] != '<':
1388        assert start == end
1389        if props["dt"] != "Can":
1390          # Compatibility decomposition.
1391          out_file.write("%04X>%s\n" % (start, dm))
1392        elif not props.get("Comp_Ex") and HasOneWayMapping(start):
1393          # NFC round-trip mapping turns into NFKC one-way mapping.
1394          out_file.write("%04X>%s  # NFC round-trip, NFKC one-way\n" %
1395                         (start, dm))
1396
1397
1398def WriteNorm2NFKC_CFTextFile(path):
1399  global _data_file_copyright
1400  with open(os.path.join(path, "nfkc_cf.txt"), "w") as out_file:
1401    out_file.write(
1402        _data_file_copyright + """# file name: nfkc_cf.txt
1403#
1404# machine-generated by ICU preparseucd.py
1405#
1406# This file contains the Unicode NFKC_CF mappings,
1407# extracted from the UCD file DerivedNormalizationProps.txt,
1408# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
1409# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.
1410
1411""")
1412    out_file.write("* Unicode " + _ucd_version + "\n\n")
1413    prev_start = 0
1414    prev_end = 0
1415    prev_nfkc_cf = None
1416    for i in range(len(_starts) - 1):
1417      start = _starts[i]
1418      end = _starts[i + 1] - 1
1419      props = _props[i]
1420      nfkc_cf = props.get("NFKC_CF")
1421      # Merge with the previous range if possible,
1422      # or remember this range for merging.
1423      if nfkc_cf == prev_nfkc_cf and (prev_end + 1) == start:
1424        prev_end = end
1425      else:
1426        if prev_nfkc_cf != None and (not prev_nfkc_cf or prev_nfkc_cf[0] != '<'):
1427          if prev_start == prev_end:
1428            out_file.write("%04X>%s\n" % (prev_start, prev_nfkc_cf))
1429          else:
1430            out_file.write("%04X..%04X>%s\n" %
1431                           (prev_start, prev_end, prev_nfkc_cf))
1432        prev_start = start
1433        prev_end = end
1434        prev_nfkc_cf = nfkc_cf
1435
1436
1437def WriteNorm2(path):
1438  WriteNorm2NFCTextFile(path)
1439  WriteNorm2NFKCTextFile(path)
1440  WriteNorm2NFKC_CFTextFile(path)
1441
1442# UTS #46 Normalizer2 input file ------------------------------------------- ***
1443
1444_idna_replacements = [
1445  # Several versions of avoiding circular FFFD>FFFD mappings,
1446  # depending on the version of the input file.
1447  (re.compile(r"FFFD          ; disallowed"), "# FFFD (avoid circular mapping)"),
1448  (re.compile(r"\.\.FFFD"), "..FFFC"),
1449  (re.compile(r"(FFF[^E])\.\.FFFF"), "\1..FFFC"),
1450  # Since we switch between checking and not checking for STD3 character
1451  # restrictions at runtime, checking the non-LDH ASCII characters in code,
1452  # we treat these values here like their regular siblings.
1453  (re.compile(r"^([^;]+)  ; disallowed_STD3_valid"), r"# \1disallowed_STD3_valid"),
1454  (re.compile(r"; disallowed_STD3_mapped +; "), ">"),
1455  # For UTS #46, we do not care about "not valid in IDNA2008".
1456  (re.compile(r"; *; NV8 +"), ""),
1457  # ICU 63+ normalization no longer allows mappings for surrogate code points,
1458  # and the UTS #46 code handles them instead.
1459  (re.compile(r"^D800..DFFF    ; disallowed"), r"# D800..DFFF disallowed in code"),
1460  # Normal transformations.
1461  (re.compile(r"; disallowed"), ">FFFD"),
1462  (re.compile(r"; ignored"), ">"),
1463  (re.compile(r"^([^;]+)  ; valid"), r"# \1valid"),
1464  (re.compile(r"; mapped +; "), ">"),
1465  (re.compile(r"^([^;]+)  ; deviation +; "), r"# \1deviation >")
1466]
1467
1468def IdnaToUTS46TextFile(s, t):
1469  """Turn Unicode IdnaMappingTable.txt into ICU gennorm2 source file format."""
1470  # Different input/output file names.
1471  dest_path = os.path.dirname(t)
1472  t = os.path.join(dest_path, "uts46.txt")
1473  # TODO: With Python 2.7+, combine the two with statements into one.
1474  with open(s, "r") as in_file:
1475    with open(t, "w") as out_file:
1476      out_file.write("# Original file:\n")
1477      for line in in_file:
1478        orig_line = line
1479        if line.startswith("# For documentation"):
1480          out_file.write(line)
1481          out_file.write(r"""
1482# ================================================
1483# This file has been reformatted into syntax for the
1484# gennorm2 Normalizer2 data generator tool.
1485#
1486# "valid", "disallowed_STD3_valid" and "deviation" lines are commented out.
1487# "mapped" and "disallowed_STD3_mapped" are changed to use the ">" mapping syntax.
1488# "disallowed" lines map to U+FFFD.
1489# "ignored" lines map to an empty string.
1490#
1491# Characters disallowed under STD3 rules are treated as valid or mapped;
1492# they are handled in code.
1493# Deviation characters are also handled in code.
1494#
1495# Use this file as the second gennorm2 input file after nfc.txt.
1496# ================================================
1497""")
1498          continue
1499        if line[0] in "#\r\n":
1500          out_file.write(line)
1501          continue
1502        for rep in _idna_replacements: line = rep[0].sub(rep[1], line)
1503        # Align inline comments at column 40.
1504        comment_pos = line.find("#", 1)
1505        if comment_pos < 40:
1506          line = (line[:comment_pos] + ((40 - comment_pos) * ' ') +
1507                  line[comment_pos:])
1508        elif comment_pos > 40:
1509          space_pos = comment_pos
1510          while space_pos > 0 and line[space_pos - 1] == ' ':
1511            space_pos = space_pos - 1
1512          if space_pos < 40:
1513            # Fewer than 40 characters before the comment:
1514            # Align comments at column 40.
1515            line = line[:40] + line[comment_pos:]
1516          else:
1517            # 40 or more characters before the comment:
1518            # Keep one space between contents and comment.
1519            line = line[:space_pos] + " " + line[comment_pos:]
1520        # Write the modified line.
1521        out_file.write(line)
1522        if "..FFFF" in orig_line and "..FFFC" in line:
1523          out_file.write("FFFE..FFFF    >FFFD\n");
1524  return t
1525
1526# Preprocessing ------------------------------------------------------------ ***
1527
1528_strip_re = re.compile("([0-9a-fA-F]+.+?) *#.*")
1529_code_point_re = re.compile("\s*([0-9a-fA-F]+)\s*;")
1530
1531def CopyAndStripWithOptionalMerge(s, t, do_merge):
1532  # TODO: We do not seem to need the do_merge argument and logic any more.
1533  with open(s, "r") as in_file, open(t, "w") as out_file:
1534    first = -1  # First code point with first_data.
1535    last = -1  # Last code point with first_data.
1536    first_data = ""  # Common data for code points [first..last].
1537    for line in in_file:
1538      match = _strip_re.match(line)
1539      if match:
1540        line = match.group(1)
1541      else:
1542        line = line.rstrip()
1543      if do_merge:
1544        match = _code_point_re.match(line)
1545        if match:
1546          c = int(match.group(1), 16)
1547          data = line[match.end() - 1:]
1548        else:
1549          c = -1
1550          data = ""
1551        if last >= 0 and (c != (last + 1) or data != first_data):
1552          # output the current range
1553          if first == last:
1554            out_file.write("%04X%s\n" % (first, first_data))
1555          else:
1556            out_file.write("%04X..%04X%s\n" % (first, last, first_data))
1557          first = -1
1558          last = -1
1559          first_data = ""
1560        if c < 0:
1561          # no data on this line, output as is
1562          out_file.write(line)
1563          out_file.write("\n")
1564        else:
1565          # data on this line, store for possible range compaction
1566          if last < 0:
1567            # set as the first line in a possible range
1568            first = c
1569            last = c
1570            first_data = data
1571          else:
1572            # must be c == (last + 1) and data == first_data
1573            # because of previous conditions
1574            # continue with the current range
1575            last = c
1576      else:
1577        # Only strip, don't merge: just output the stripped line.
1578        out_file.write(line)
1579        out_file.write("\n")
1580    if do_merge and last >= 0:
1581      # output the last range in the file
1582      if first == last:
1583        out_file.write("%04X%s\n" % (first, first_data))
1584      else:
1585        out_file.write("%04X..%04X%s\n" % (first, last, first_data))
1586      first = -1
1587      last = -1
1588      first_data = ""
1589    out_file.flush()
1590  return t
1591
1592
1593def CopyAndStrip(s, t):
1594  """Copies a file and removes comments behind data lines but not in others."""
1595  return CopyAndStripWithOptionalMerge(s, t, False)
1596
1597
1598def CopyAndStripAndMerge(s, t):
1599  """Copies and strips a file and merges lines.
1600
1601  Copies a file, removes comments, and
1602  merges lines with adjacent code point ranges and identical per-code point
1603  data lines into one line with range syntax.
1604  """
1605  return CopyAndStripWithOptionalMerge(s, t, True)
1606
1607
1608def CopyOnly(s, t):
1609  shutil.copy(s, t)
1610  return t
1611
1612
1613def DontCopy(s, t):
1614  return s
1615
1616
1617# Each _files value is a
1618# (preprocessor, dest_folder, parser, order) tuple
1619# where all fields except the preprocessor are optional.
1620# After the initial preprocessing (copy/strip/merge),
1621# if a parser is specified, then a tuple is added to _files_to_parse
1622# at index "order" (default order 9).
1623# An explicit order number is set only for files that must be parsed
1624# before others.
1625_files = {
1626  "BidiBrackets.txt": (DontCopy, ParseBidiBrackets),
1627  "BidiMirroring.txt": (DontCopy, ParseBidiMirroring),
1628  "BidiTest.txt": (CopyOnly, "testdata"),
1629  "Blocks.txt": (DontCopy, ParseBlocks),
1630  "CaseFolding.txt": (CopyOnly, ParseCaseFolding),
1631  "DerivedAge.txt": (DontCopy, ParseDerivedAge),
1632  "DerivedBidiClass.txt": (DontCopy, ParseDerivedBidiClass),
1633  "DerivedCoreProperties.txt": (CopyAndStrip, ParseNamedProperties),
1634  "DerivedJoiningGroup.txt": (DontCopy, ParseDerivedJoiningGroup),
1635  "DerivedJoiningType.txt": (DontCopy, ParseDerivedJoiningType),
1636  "DerivedNormalizationProps.txt": (CopyAndStrip, ParseNamedProperties),
1637  "DerivedNumericValues.txt": (DontCopy, ParseDerivedNumericValues),
1638  "EastAsianWidth.txt": (DontCopy, ParseEastAsianWidth),
1639  "emoji-data.txt": (DontCopy, ParseNamedProperties),
1640  "emoji-sequences.txt": (CopyOnly,),
1641  "emoji-zwj-sequences.txt": (CopyOnly,),
1642  "GraphemeBreakProperty.txt": (DontCopy, ParseGraphemeBreakProperty),
1643  "GraphemeBreakTest-cldr.txt": (CopyOnly, "testdata"),
1644  "IdnaTestV2.txt": (CopyOnly, "testdata"),
1645  "IndicPositionalCategory.txt": (DontCopy, ParseIndicPositionalCategory),
1646  "IndicSyllabicCategory.txt": (DontCopy, ParseIndicSyllabicCategory),
1647  "LineBreak.txt": (DontCopy, ParseLineBreak),
1648  "LineBreakTest.txt": (CopyOnly, "testdata"),
1649  "NameAliases.txt": (DontCopy, ParseNameAliases),
1650  "NamesList.txt": (DontCopy, ParseNamesList),
1651  "NormalizationCorrections.txt": (CopyOnly,),  # Only used in gensprep.
1652  "NormalizationTest.txt": (CopyAndStrip,),
1653  "PropertyAliases.txt": (DontCopy, ParsePropertyAliases, 0),
1654  "PropertyValueAliases.txt": (DontCopy, ParsePropertyValueAliases, 1),
1655  "PropList.txt": (DontCopy, ParseNamedProperties),
1656  "SentenceBreakProperty.txt": (DontCopy, ParseSentenceBreak),
1657  "SentenceBreakTest.txt": (CopyOnly, "testdata"),
1658  "Scripts.txt": (DontCopy, ParseScripts),
1659  "ScriptExtensions.txt": (DontCopy, ParseScriptExtensions),
1660  "SpecialCasing.txt": (CopyOnly, ParseSpecialCasing),
1661  "UnicodeData.txt": (CopyOnly, ParseUnicodeData, 2),
1662  "VerticalOrientation.txt": (DontCopy, ParseVerticalOrientation),
1663  "WordBreakProperty.txt": (DontCopy, ParseWordBreak),
1664  "WordBreakTest.txt": (CopyOnly, "testdata"),
1665  # From www.unicode.org/Public/idna/<version>/
1666  "IdnaMappingTable.txt": (IdnaToUTS46TextFile, "norm2")
1667}
1668
1669# List of lists of files to be parsed in order.
1670# Inner lists contain (basename, path, parser) tuples.
1671_files_to_parse = [[], [], [], [], [], [], [], [], [], []]
1672
1673# Get the standard basename from a versioned filename.
1674# For example, match "UnicodeData-6.1.0d8.txt"
1675# so we can turn it into "UnicodeData.txt".
1676_file_version_re = re.compile("([a-zA-Z0-9_-]+)" +
1677                              "-[0-9]+(?:\\.[0-9]+)*(?:d[0-9]+)?" +
1678                              "(\\.[a-z]+)$")
1679
1680def PreprocessFiles(source_files, icu4c_src_root):
1681  unidata_path = os.path.join(icu4c_src_root, "source", "data", "unidata")
1682  norm2_path = os.path.join(unidata_path, "norm2")
1683  testdata_path = os.path.join(icu4c_src_root, "source", "test", "testdata")
1684  folder_to_path = {
1685    "unidata": unidata_path,
1686    "norm2": norm2_path,
1687    "testdata": testdata_path
1688  }
1689  files_processed = set()
1690  for source_file in source_files:
1691    (folder, basename) = os.path.split(source_file)
1692    match = _file_version_re.match(basename)
1693    if match:
1694      new_basename = match.group(1) + match.group(2)
1695      if new_basename != basename:
1696        print("Removing version suffix from " + source_file)
1697        # ... so that we can easily compare UCD files.
1698        new_source_file = os.path.join(folder, new_basename)
1699        shutil.move(source_file, new_source_file)
1700        basename = new_basename
1701        source_file = new_source_file
1702    if basename in _files:
1703      print("Preprocessing %s" % basename)
1704      if basename in files_processed:
1705        raise Exception("duplicate file basename %s!" % basename)
1706      files_processed.add(basename)
1707      value = _files[basename]
1708      preprocessor = value[0]
1709      if len(value) >= 2 and isinstance(value[1], (str)):
1710        # The value was [preprocessor, dest_folder, ...], leave [...].
1711        dest_folder = value[1]
1712        value = value[2:]
1713      else:
1714        # The value was [preprocessor, ...], leave [...].
1715        dest_folder = "unidata"
1716        value = value[1:]
1717      dest_path = folder_to_path[dest_folder]
1718      if not os.path.exists(dest_path): os.makedirs(dest_path)
1719      dest_basename = basename
1720      # Source GraphemeBreakTest-cldr.txt --> destination GraphemeBreakTest.txt.
1721      if basename.endswith("-cldr.txt"):
1722        dest_basename = basename[:-9] + basename[-4:]
1723      dest_file = os.path.join(dest_path, dest_basename)
1724      parse_file = preprocessor(source_file, dest_file)
1725      if value:
1726        order = 9 if len(value) < 2 else value[1]
1727        _files_to_parse[order].append((basename, parse_file, value[0]))
1728
1729# Character names ---------------------------------------------------------- ***
1730
1731# TODO: Turn this script into a module that
1732# a) gives access to the parsed data
1733# b) has a PreparseUCD(ucd_root, icu4c_src_root) function
1734# c) has a ParsePreparsedUCD(filename) function
1735# d) has a WritePreparsedUCD(filename) function
1736# and then use it from a new script for names.
1737# Some more API:
1738# - generator GetRangesAndProps() -> (start, end, props)*
1739
1740def IncCounter(counters, key, inc=1):
1741  if key in counters:
1742    counters[key] += inc
1743  else:
1744    counters[key] = inc
1745
1746
1747endings = (
1748  # List PHASE- before LETTER for BAMUM LETTER PHASE-xyz.
1749  "PHASE-",
1750  "LETTER ", "LIGATURE ", "CHARACTER ", "SYLLABLE ",
1751  "CHOSEONG ", "JUNGSEONG ", "JONGSEONG ",
1752  "SYLLABICS ", "IDEOGRAPH ", "IDEOGRAPH-", "IDEOGRAM ", "MONOGRAM ",
1753  "ACROPHONIC ", "HIEROGLYPH ",
1754  "DIGIT ", "NUMBER ", "NUMERAL ", "FRACTION ",
1755  "PUNCTUATION ", "SIGN ", "SYMBOL ",
1756  "TILE ", "CARD ", "FACE ",
1757  "ACCENT ", "POINT ",
1758  # List SIGN before VOWEL to catch "vowel sign".
1759  "VOWEL ", "TONE ", "RADICAL ",
1760  # For names of math symbols,
1761  # e.g., MATHEMATICAL BOLD ITALIC CAPITAL A
1762  "SCRIPT ", "FRAKTUR ", "MONOSPACE ",
1763  "ITALIC ", "BOLD ", "DOUBLE-STRUCK ", "SANS-SERIF ",
1764  "INITIAL ", "TAILED ", "STRETCHED ", "LOOPED ",
1765  # BRAILLE PATTERN DOTS-xyz
1766  "DOTS-",
1767  "SELECTOR ", "SELECTOR-"
1768)
1769
1770def SplitName(name, tokens):
1771  start = 0
1772  for e in endings:
1773    i = name.find(e)
1774    if i >= 0:
1775      start = i + len(e)
1776      token = name[:start]
1777      IncCounter(tokens, token)
1778      break
1779  for i in range(start, len(name)):
1780    c = name[i]
1781    if c == ' ' or c == '-':
1782      token = name[start:i + 1]
1783      IncCounter(tokens, token)
1784      start = i + 1
1785  IncCounter(tokens, name[start:])
1786
1787
1788def PrintNameStats():
1789  # TODO: This name analysis code is out of date.
1790  # It needs to consider the multi-type Name_Alias values.
1791  name_pnames = ("na", "na1", "Name_Alias")
1792  counts = {}
1793  for pname in name_pnames:
1794    counts[pname] = 0
1795  total_lengths = counts.copy()
1796  max_length = 0
1797  max_per_cp = 0
1798  name_chars = set()
1799  num_digits = 0
1800  token_counters = {}
1801  char_counters = {}
1802  for i in range(len(_starts) - 1):
1803    start = _starts[i]
1804    # end = _starts[i + 1] - 1
1805    props = _props[i]
1806    per_cp = 0
1807    for pname in name_pnames:
1808      if pname in props:
1809        counts[pname] += 1
1810        name = props[pname]
1811        total_lengths[pname] += len(name)
1812        name_chars |= set(name)
1813        if len(name) > max_length: max_length = len(name)
1814        per_cp += len(name) + 1
1815        if per_cp > max_per_cp: max_per_cp = per_cp
1816        tokens = SplitName(name, token_counters)
1817        for c in name:
1818          if c in "0123456789": num_digits += 1
1819          IncCounter(char_counters, c)
1820  print
1821  for pname in name_pnames:
1822    print("'%s' character names: %d / %d bytes" %
1823          (pname, counts[pname], total_lengths[pname]))
1824  print("%d total bytes in character names" % sum(total_lengths.itervalues()))
1825  print("%d name-characters: %s" %
1826        (len(name_chars), "".join(sorted(name_chars))))
1827  print("%d digits 0-9" % num_digits)
1828  count_chars = [(count, c) for (c, count) in char_counters.items()]
1829  count_chars.sort(reverse=True)
1830  for cc in count_chars:
1831    print("name-chars: %6d * '%s'" % cc)
1832  print("max. name length: %d" % max_length)
1833  print("max. length of all (names+NUL) per cp: %d" % max_per_cp)
1834
1835  token_lengths = sum([len(t) + 1 for t in token_counters])
1836  print("%d total tokens, %d bytes with NUL" %
1837        (len(token_counters), token_lengths))
1838
1839  counts_tokens = []
1840  for (token, count) in token_counters.items():
1841    # If we encode a token with a 1-byte code, then we save len(t)-1 bytes each time
1842    # but have to store the token string itself with a length or terminator byte,
1843    # plus a 2-byte entry in an token index table.
1844    savings = count * (len(token) - 1) - (len(token) + 1 + 2)
1845    if savings > 0:
1846      counts_tokens.append((savings, count, token))
1847  counts_tokens.sort(reverse=True)
1848  print("%d tokens might save space with 1-byte codes" % len(counts_tokens))
1849
1850  # Codes=bytes, 40 byte values for name_chars.
1851  # That leaves 216 units for 1-byte tokens or lead bytes of 2-byte tokens.
1852  # Make each 2-byte token the token string index itself, rather than
1853  # and index into a string index table.
1854  # More lead bytes but also more savings.
1855  num_units = 256
1856  max_lead = (token_lengths + 255) / 256
1857  max_token_units = num_units - len(name_chars)
1858  results = []
1859  for num_lead in range(min(max_lead, max_token_units) + 1):
1860    max1 = max_token_units - num_lead
1861    ct = counts_tokens[:max1]
1862    tokens1 = set([t for (s, c, t) in ct])
1863    for (token, count) in token_counters.items():
1864      if token in tokens1: continue
1865      # If we encode a token with a 2-byte code, then we save len(t)-2 bytes each time
1866      # but have to store the token string itself with a length or terminator byte.
1867      savings = count * (len(token) - 2) - (len(token) + 1)
1868      if savings > 0:
1869        ct.append((savings, count, token))
1870    ct.sort(reverse=True)
1871    # A 2-byte-code-token index cannot be limit_t_lengths or higher.
1872    limit_t_lengths = num_lead * 256
1873    token2_index = 0
1874    for i in range(max1, len(ct)):
1875      if token2_index >= limit_t_lengths:
1876        del ct[i:]
1877        break
1878      token2_index += len(ct[i][2]) + 1
1879    cumul_savings = sum([s for (s, c, t) in ct])
1880    # print ("%2d 1-byte codes: %4d tokens might save %6d bytes" %
1881    #        (max1, len(ct), cumul_savings))
1882    results.append((cumul_savings, max1, ct))
1883  best = max(results)  # (cumul_savings, max1, ct)
1884
1885  max1 = best[1]
1886  print("maximum savings: %d bytes with %d 1-byte codes & %d lead bytes" %
1887         (best[0], max1, max_token_units - max1))
1888  counts_tokens = best[2]
1889  cumul_savings = 0
1890  for i in range(len(counts_tokens)):
1891    n = 1 if i < max1 else 2
1892    i1 = i + 1
1893    t = counts_tokens[i]
1894    cumul_savings += t[0]
1895    if i1 <= 250 or (i1 % 100) == 0 or i1 == len(counts_tokens):
1896      print(("%04d. cumul. %6d bytes save %6d bytes from " +
1897              "%5d * %d-byte token for %2d='%s'") %
1898          (i1, cumul_savings, t[0], t[1], n, len(t[2]), t[2]))
1899
1900# ICU API ------------------------------------------------------------------ ***
1901
1902# Sample line to match:
1903#    UCHAR_UNIFIED_IDEOGRAPH=29,
1904_uchar_re = re.compile(
1905    " *(UCHAR_[0-9A-Z_]+) *= *(?:[0-9]+|0x[0-9a-fA-F]+),")
1906
1907# Sample line to match:
1908#    /** Zs @stable ICU 2.0 */
1909_gc_comment_re = re.compile(" */\*\* *([A-Z][a-z]) ")
1910
1911# Sample line to match:
1912#    U_SPACE_SEPARATOR         = 12,
1913_gc_re = re.compile(" *(U_[A-Z_]+) *= *[0-9]+,")
1914
1915# Sample line to match:
1916#    /** L @stable ICU 2.0 */
1917_bc_comment_re = re.compile(" */\*\* *([A-Z]{1,3}) ")
1918
1919# Sample line to match:
1920#    U_LEFT_TO_RIGHT               = 0,
1921_bc_re = re.compile(" *(U_[A-Z_]+) *= *[0-9]+,")
1922
1923# Sample line to match:
1924#    UBLOCK_CYRILLIC =9,
1925_ublock_re = re.compile(" *(UBLOCK_[0-9A-Z_]+) *= *[0-9]+,")
1926
1927# Sample line to match:
1928#    U_EA_AMBIGUOUS,
1929_prop_and_value_re = re.compile(
1930    " *(U_(BPT|DT|EA|GCB|HST|INPC|INSC|LB|JG|JT|NT|SB|VO|WB)_([0-9A-Z_]+))")
1931
1932# Sample line to match if it has matched _prop_and_value_re
1933# (we want to exclude aliases):
1934#    U_JG_HAMZA_ON_HEH_GOAL=U_JG_TEH_MARBUTA_GOAL,
1935_prop_and_alias_re = re.compile(" *U_[0-9A-Z_]+ *= *U")
1936
1937def ParseUCharHeader(icu4c_src_root):
1938  uchar_path = os.path.join(icu4c_src_root, "source",
1939                            "common", "unicode", "uchar.h")
1940  with open(uchar_path, "r") as uchar_file:
1941    mode = ""  # Mode string (=pname) during context-sensitive parsing.
1942    comment_value = ""  # Property value from a comment preceding an enum.
1943    # Note: The enum UProperty is first in uchar.h, before the enums for values.
1944    for line in uchar_file:
1945      # Parse some enums via context-sensitive "modes".
1946      # Necessary because the enum constant names do not contain
1947      # enough information.
1948      if "enum UCharCategory" in line:
1949        mode = "gc"
1950        comment_value = ""
1951        continue
1952      if mode == "gc":
1953        if line.startswith("}"):
1954          mode = ""
1955          continue
1956        match = _gc_comment_re.match(line)
1957        if match:
1958          comment_value = match.group(1)
1959          continue
1960        match = _gc_re.match(line)
1961        if match and comment_value:
1962          gc_enum = match.group(1)
1963          prop = _properties["gc"]
1964          vname = GetShortPropertyValueName(prop, comment_value)
1965          icu_values = _pname_to_icu_prop["gc"][2]
1966          icu_values.append((gc_enum, vname))
1967        comment_value = ""
1968        continue
1969      if "enum UCharDirection {" in line:
1970        mode = "bc"
1971        comment_value = ""
1972        continue
1973      if mode == "bc":
1974        if line.startswith("}"):
1975          mode = ""
1976          continue
1977        match = _bc_comment_re.match(line)
1978        if match:
1979          comment_value = match.group(1)
1980          continue
1981        match = _bc_re.match(line)
1982        if match and comment_value:
1983          bc_enum = match.group(1)
1984          prop = _properties["bc"]
1985          vname = GetShortPropertyValueName(prop, comment_value)
1986          icu_values = _pname_to_icu_prop["bc"][2]
1987          icu_values.append((bc_enum, vname))
1988        comment_value = ""
1989        continue
1990      # No mode, parse enum constants whose names contain
1991      # enough information to parse without requiring context.
1992      match = _uchar_re.match(line)
1993      if match:
1994        prop_enum = match.group(1)
1995        if prop_enum.endswith("_LIMIT"):
1996          # Ignore "UCHAR_BINARY_LIMIT=57," etc.
1997          continue
1998        pname = GetShortPropertyName(prop_enum[6:])
1999        icu_prop = (prop_enum, pname, [])
2000        _icu_properties.append(icu_prop)
2001        _pname_to_icu_prop[pname] = icu_prop
2002        continue
2003      match = _ublock_re.match(line)
2004      if match:
2005        prop_enum = match.group(1)
2006        if prop_enum == "UBLOCK_COUNT":
2007          continue
2008        prop = _properties["blk"]
2009        vname = GetShortPropertyValueName(prop, prop_enum[7:])
2010        icu_values = _pname_to_icu_prop["blk"][2]
2011        icu_values.append((prop_enum, vname))
2012        continue
2013      match = _prop_and_value_re.match(line)
2014      if match:
2015        (prop_enum, vname) = match.group(1, 3)
2016        if vname == "COUNT" or _prop_and_alias_re.match(line):
2017          continue
2018        pname = GetShortPropertyName(match.group(2))
2019        prop = _properties[pname]
2020        vname = GetShortPropertyValueName(prop, vname)
2021        icu_values = _pname_to_icu_prop[pname][2]
2022        icu_values.append((prop_enum, vname))
2023  # ccc, lccc, tccc use their numeric values as "enum" values.
2024  # In the UCD data, these numeric values are the first value names,
2025  # followed by the short & long value names.
2026  # List the ccc values in numeric order.
2027  prop = _properties["ccc"]
2028  icu_values = _pname_to_icu_prop["ccc"][2]
2029  for ccc in sorted([int(name) for name in prop[2]]):
2030    icu_values.append((ccc, str(ccc)))
2031  _pname_to_icu_prop["lccc"][2].extend(icu_values)  # Copy ccc -> lccc.
2032  _pname_to_icu_prop["tccc"][2].extend(icu_values)  # Copy ccc -> tccc.
2033
2034  # No need to parse predictable General_Category_Mask enum constants.
2035  # Just define them in ASCII order.
2036  prop = _properties["gcm"]
2037  icu_values = _pname_to_icu_prop["gcm"][2]
2038  for vname in sorted(prop[2]):
2039    icu_values.append(("U_GC_" + vname.upper() + "_MASK", vname))
2040  # Hardcode known values for the normalization quick check properties,
2041  # see unorm2.h for the UNormalizationCheckResult enum.
2042  icu_values = _pname_to_icu_prop["NFC_QC"][2]
2043  icu_values.append(("UNORM_NO", "N"))
2044  icu_values.append(("UNORM_YES", "Y"))
2045  icu_values.append(("UNORM_MAYBE", "M"))
2046  _pname_to_icu_prop["NFKC_QC"][2].extend(icu_values)  # Copy NFC -> NFKC.
2047  # No "maybe" values for NF[K]D.
2048  icu_values = _pname_to_icu_prop["NFD_QC"][2]
2049  icu_values.append(("UNORM_NO", "N"))
2050  icu_values.append(("UNORM_YES", "Y"))
2051  _pname_to_icu_prop["NFKD_QC"][2].extend(icu_values)  # Copy NFD -> NFKD.
2052
2053
2054# Sample line to match:
2055#    USCRIPT_LOMA   = 139,/* Loma */
2056_uscript_re = re.compile(
2057    " *(USCRIPT_[A-Z_]+) *= *[0-9]+ *, */\* *([A-Z][a-z]{3}) *\*/")
2058
2059def ParseUScriptHeader(icu4c_src_root):
2060  uscript_path = os.path.join(icu4c_src_root, "source",
2061                              "common", "unicode", "uscript.h")
2062  icu_values = _pname_to_icu_prop["sc"][2]
2063  with open(uscript_path, "r") as uscript_file:
2064    for line in uscript_file:
2065      match = _uscript_re.match(line)
2066      if match:
2067        (script_enum, script_code) = match.group(1, 2)
2068        icu_values.append((script_enum, script_code))
2069
2070
2071def CheckPNamesData():
2072  """Checks that every ICU property has a full set of value enum constants,
2073  and that the _icu_properties value names map back to the UCD."""
2074  missing_enums = []
2075  for (p_enum, pname, values) in _icu_properties:
2076    prop = _properties[pname]
2077    vnames = set(prop[2])  # Modifiable copy of the set of short value names.
2078    for (v_enum, vname) in values:
2079      if vname not in vnames:
2080        raise ValueError("%s = %s (uchar.h %s) not in the UCD\n" %
2081                         (pname, vname, v_enum))
2082      vnames.remove(vname)
2083    # Exceptions to the all-values check:
2084    # - ICU does not have specific enum values for binary No/Yes.
2085    # - ICU represents Age values via UVersionInfo rather than enum constants.
2086    # - gc: ICU enum UCharCategory only has the single-category values.
2087    #       (ICU's gcm property has all of the UCD gc property values.)
2088    if vnames and not (prop[0] == "Binary" or pname in ("age", "gc")):
2089      missing_enums.append((pname, vnames))
2090  if missing_enums:
2091    raise ValueError(
2092        "missing uchar.h enum constants for some property values: %s" %
2093        missing_enums)
2094
2095
2096def WritePNamesDataHeader(out_path):
2097  with open(out_path, "w") as out_file:
2098    out_file.write("""// © 2016 and later: Unicode, Inc. and others.
2099// License & terms of use: http://www.unicode.org/copyright.html
2100/**
2101 * Copyright (C) 2002-2016, International Business Machines Corporation and
2102 * others. All Rights Reserved.
2103 *
2104 * machine-generated by: icu/tools/unicode/py/preparseucd.py
2105 */
2106
2107""")
2108
2109    # Note: The uchar.h & uscript.h parsers store the ICU Unicode properties
2110    # and values in the order of their definition,
2111    # and this function writes them in that order.
2112    # Since the ICU API constants are stable and new values are only
2113    # appended at the end
2114    # (new properties are added at the end of each binary/enum/... range),
2115    # the output is stable as well.
2116    # When a property or value constant is renamed,
2117    # it only changes the name itself in the output;
2118    # it does not move in the output since there is no sorting.
2119    # This minimizes diffs and assists with reviewing and evaluating updates.
2120
2121    version = _ucd_version.split('.')
2122    while len(version) < 4: version.append("0")
2123    out_file.write("#define UNICODE_VERSION { %s }\n\n" % ", ".join(version))
2124
2125    # Count the maximum number of aliases for any property or value.
2126    # We write the final value at the end.
2127    max_aliases = max(len(_binary_values["N"]), len(_binary_values["Y"]))
2128
2129    # Write an array of "binprop" Value object initializers
2130    # with the value aliases shared among all binary properties.
2131    out_file.write("static const Value VALUES_binprop[2] = {\n")
2132    out_file.write('    Value(0, "%s"),\n' % " ".join(_binary_values["N"]))
2133    out_file.write('    Value(1, "%s"),\n' % " ".join(_binary_values["Y"]))
2134    out_file.write("};\n\n")
2135
2136    # For each property with named values, write an array of
2137    # Value object initializers with the value enum and the aliases.
2138    for (p_enum, pname, values) in _icu_properties:
2139      prop = _properties[pname]
2140      aliases = prop[1]
2141      if len(aliases) > max_aliases: max_aliases = len(aliases)
2142      if not values: continue
2143      out_file.write("static const Value VALUES_%s[%d] = {\n" %
2144                     (pname, len(values)))
2145      for (v_enum, vname) in values:
2146        aliases = _properties[pname][3][vname]
2147        # ccc, lccc, tccc: Omit the numeric strings from the aliases.
2148        # (See the comment about ccc in the PropertyValueAliases.txt header.)
2149        if pname.endswith("ccc"): aliases = aliases[1:]
2150        if len(aliases) > max_aliases: max_aliases = len(aliases)
2151        cast = "(int32_t)" if pname == "gcm" else ""
2152        out_file.write('    Value(%s%s, "%s"),\n' %
2153                       (cast, v_enum, " ".join(aliases)))
2154      out_file.write("};\n\n")
2155
2156    # For each property, write a Property object initializer
2157    # with the property enum, its aliases, and a reference to its values.
2158    out_file.write("static const Property PROPERTIES[%d] = {\n" %
2159                   len(_icu_properties))
2160    for (enum, pname, values) in _icu_properties:
2161      prop = _properties[pname]
2162      aliases = " ".join(prop[1])
2163      if prop[0] == "Binary":
2164        out_file.write('    Property(%s, "%s"),\n' % (enum, aliases))
2165      elif values:  # Property with named values.
2166        out_file.write('    Property(%s, "%s", VALUES_%s, %d),\n' %
2167                       (enum, aliases, pname, len(values)))
2168      else:
2169        out_file.write('    Property(%s, "%s"),\n' % (enum, aliases))
2170    out_file.write("};\n\n")
2171
2172    out_file.write("const int32_t MAX_ALIASES = %d;\n" % max_aliases)
2173
2174# main() ------------------------------------------------------------------- ***
2175
2176def main():
2177  global _null_or_defaults
2178  only_ppucd = False
2179  if len(sys.argv) == 3:
2180    (ucd_root, icu_src_root) = sys.argv[1:3]
2181    ppucd_path = None
2182  elif len(sys.argv) == 4 and sys.argv[2] == "--only_ppucd":
2183    # For debugging:
2184    # preparseucd.py  path/to/UCD/root  --only_ppucd  path/to/ppucd/outputfile
2185    ucd_root = sys.argv[1]
2186    ppucd_path = sys.argv[3]
2187    only_ppucd = True
2188    icu_src_root = "/tmp/ppucd"
2189  else:
2190    print("Usage: %s  path/to/UCD/root  path/to/ICU/src/root" % sys.argv[0])
2191    return
2192  icu4c_src_root = os.path.join(icu_src_root, "icu4c")
2193  icu_tools_root = os.path.join(icu_src_root, "tools")
2194  source_files = []
2195  for root, dirs, files in os.walk(ucd_root):
2196    for file in files:
2197      source_files.append(os.path.join(root, file))
2198  PreprocessFiles(source_files, icu4c_src_root)
2199  # Parse the processed files in a particular order.
2200  for files in _files_to_parse:
2201    for (basename, path, parser) in files:
2202      print("Parsing %s" % basename)
2203      value = _files[basename]
2204      # Unicode data files are in UTF-8.
2205      charset = "UTF-8"
2206      if basename == "NamesList.txt":
2207        # The NamesList used to be in Latin-1 before Unicode 6.2.
2208        numeric_ucd_version = [int(field) for field in _ucd_version.split('.')]
2209        if numeric_ucd_version < [6, 2]: charset = "ISO-8859-1"
2210      in_file = codecs.open(path, "r", charset)
2211      with in_file:
2212        parser(in_file)
2213  _null_or_defaults = _null_values.copy()
2214  _null_or_defaults.update(_defaults)
2215  # Every Catalog and Enumerated property must have a default value,
2216  # from a @missing line. "nv" = "null value".
2217  pnv = [pname for (pname, nv) in _null_or_defaults.items() if nv == "??"]
2218  if pnv:
2219    raise Exception("no default values (@missing lines) for " +
2220                    "some Catalog or Enumerated properties: %s " % pnv)
2221  unidata_path = os.path.join(icu4c_src_root, "source", "data", "unidata")
2222  if not only_ppucd:
2223    # Write Normalizer2 input text files.
2224    # Do this before compacting the data so that we need not handle fallbacks.
2225    norm2_path = os.path.join(unidata_path, "norm2")
2226    if not os.path.exists(norm2_path): os.makedirs(norm2_path)
2227    WriteNorm2(norm2_path)
2228  # Optimize block vs. cp properties.
2229  CompactBlocks()
2230  # Write the ppucd.txt output file.
2231  # Use US-ASCII so that ICU tests can parse it in the platform charset,
2232  # which may be EBCDIC.
2233  # Fix up non-ASCII data (NamesList.txt headings) to fit.
2234  if not ppucd_path:
2235    ppucd_path = os.path.join(unidata_path, "ppucd.txt")
2236  with codecs.open(ppucd_path, "w", "US-ASCII") as out_file:
2237    WritePreparsedUCD(out_file)
2238    out_file.flush()
2239
2240  # TODO: PrintNameStats()
2241
2242  if only_ppucd: return
2243
2244  # ICU data for property & value names API
2245  ParseUCharHeader(icu4c_src_root)
2246  ParseUScriptHeader(icu4c_src_root)
2247  CheckPNamesData()
2248  genprops_path = os.path.join(icu_tools_root, "unicode", "c", "genprops")
2249  if not os.path.exists(genprops_path): os.makedirs(genprops_path)
2250  out_path = os.path.join(genprops_path, "pnames_data.h")
2251  WritePNamesDataHeader(out_path)
2252
2253
2254if __name__ == "__main__":
2255  main()
2256