1#!/usr/bin/python3 -B 2# -*- coding: utf-8 -*- 3# © 2016 and later: Unicode, Inc. and others. 4# License & terms of use: http://www.unicode.org/copyright.html 5# Copyright (c) 2009-2016 International Business Machines 6# Corporation and others. All Rights Reserved. 7# 8# file name: preparseucd.py 9# encoding: US-ASCII 10# tab size: 8 (not used) 11# indentation:4 12# 13# created on: 2011nov03 (forked from ucdcopy.py) 14# created by: Markus W. Scherer 15# 16# Copies Unicode Character Database (UCD) files from a tree 17# of files downloaded from (for example) ftp://www.unicode.org/Public/6.1.0/ 18# to ICU's source/data/unidata/ and source/test/testdata/ 19# and modifies some of the files to make them more compact. 20# Parses them and writes unidata/ppucd.txt (PreParsed UCD) with simple syntax. 21# 22# Invoke with two command-line parameters: 23# 1. source folder with UCD & idna files 24# 2. ICU source root folder (ICU 59+ combined trunk with icu4c, icu4j, tools) 25# 26# Sample invocation: 27# ~/svn.icu/tools/trunk/src/unicode$ py/preparseucd.py ~/uni61/20120118 ~/svn.icu/trunk/src 28 29import array 30import bisect 31import codecs 32import os 33import os.path 34import re 35import shutil 36import sys 37 38# Unicode version ---------------------------------------------------------- *** 39 40_ucd_version = "?" 41 42# ISO 15924 script codes --------------------------------------------------- *** 43 44# Script codes from ISO 15924 http://www.unicode.org/iso15924/codechanges.html 45# that are not yet in the UCD. 
_scripts_only_in_iso15924 = (
    "Afak", "Blis", "Cirt", "Cyrs",
    "Egyd", "Egyh", "Geok",
    "Hanb", "Hans", "Hant",
    "Inds", "Jamo", "Jpan", "Jurc", "Kore", "Kpel", "Latf", "Latg", "Loma",
    "Maya", "Moon", "Nkgb", "Phlv", "Roro",
    "Sara", "Syre", "Syrj", "Syrn",
    "Teng", "Visp", "Wole", "Zmth", "Zsye", "Zsym", "Zxxx"
)

# Properties --------------------------------------------------------------- ***

# Properties that we do not want to store in ppucd.txt.
# Not a frozenset so that we can add aliases for simpler subsequent testing.
_ignored_properties = set((
    # Other_Xyz only contribute to Xyz, store only the latter.
    "OAlpha",
    "ODI",
    "OGr_Ext",
    "OIDC",
    "OIDS",
    "OLower",
    "OMath",
    "OUpper",
    # Further properties that just contribute to others.
    "CE",  # Composition_Exclusion just contributes to Full_Composition_Exclusion.
    "JSN",
    # These properties just don't seem useful.
    # They are deprecated since Unicode 6.0.
    "XO_NFC",
    "XO_NFD",
    "XO_NFKC",
    "XO_NFKD",
    # ICU does not use Unihan properties.
    "cjkAccountingNumeric",
    "cjkOtherNumeric",
    "cjkPrimaryNumeric",
    "cjkCompatibilityVariant",
    "cjkIICore",
    "cjkIRG_GSource",
    "cjkIRG_HSource",
    "cjkIRG_JSource",
    "cjkIRG_KPSource",
    "cjkIRG_KSource",
    "cjkIRG_MSource",
    "cjkIRG_SSource",
    "cjkIRG_TSource",
    "cjkIRG_UKSource",
    "cjkIRG_USource",
    "cjkIRG_VSource",
    "cjkRSUnicode"
))

# These properties (short names) map code points to
# strings or other unusual values (property types String or Miscellaneous)
# that cannot be block-compressed (or would be confusing).
_uncompressible_props = frozenset((
    "bmg", "bpb", "cf", "Conditional_Case_Mappings", "dm", "FC_NFKC",
    "isc", "lc", "na", "na1", "Name_Alias", "NFKC_CF",
    # scx is block-compressible.
    "scf", "slc", "stc", "suc", "tc", "Turkic_Case_Folding", "uc"
))

# Dictionary of properties.
110# Keyed by normalized property names and aliases. 111# Each value is a tuple with 112# 0: Type of property (binary, enum, ...) 113# 1: List of aliases; short & long name followed by other aliases. 114# The short name is "" if it is listed as "n/a" in PropertyValueAliases.txt. 115# 2: Set of short property value names. 116# 3: Dictionary of property values. 117# For Catalog & Enumerated properties, 118# maps each value name to a list of aliases. 119# Empty for other types of properties. 120_properties = {} 121 122# Dictionary of binary-property values which we store as False/True. 123# Same as the values dictionary of one of the binary properties. 124_binary_values = {} 125 126# Dictionary of null values. 127# Keyed by short property names. 128# These are type-specific values for properties that occur in the data. 129# They are overridden by _defaults, block and code point properties. 130_null_values = {} 131 132# Property value names for null values. 133# We do not store these in _defaults. 134_null_names = frozenset(("<none>", "NaN")) 135 136# Dictionary of explicit default property values. 137# Keyed by short property names. 138_defaults = {"gc": "Cn"} 139 140# _null_values overridden by explicit _defaults. 141# Initialized after parsing is done. 142_null_or_defaults = {} 143 144# List of properties with an ICU UProperty enum. 145# Each item is an (enum, pname, values) tuple. 146# - enum: the ICU enum UProperty constant string 147# - pname: the UCD short property name 148# - values: list of (enum, vname) pairs per property value 149# - enum: the ICU property value's enum constant string 150# - vname: the UCD short property value name 151_icu_properties = [] 152 153# Dictionary of short property names mapped to _icu_properties items. 154_pname_to_icu_prop = {} 155 156_non_alnum_re = re.compile("[^a-zA-Z0-9]") 157 158def NormPropName(pname): 159 """Returns a normalized form of pname. 
160 Removes non-ASCII-alphanumeric characters and lowercases letters.""" 161 return _non_alnum_re.sub("", pname).lower() 162 163 164def GetProperty(pname): 165 """Returns the _properties value for the pname. 166 Returns null if the property is ignored. 167 Caches alternate spellings of the property name.""" 168 # Try the input name. 169 prop = _properties.get(pname) 170 if prop != None: return prop 171 if pname in _ignored_properties: return None 172 # Try the normalized input name. 173 norm_name = NormPropName(pname) 174 prop = _properties.get(norm_name) 175 if prop != None: 176 _properties[pname] = prop # Cache prop under this new name spelling. 177 return prop 178 elif pname in _ignored_properties: 179 _ignored_properties.add(pname) # Remember to ignore this new name spelling. 180 return None 181 else: 182 raise NameError("unknown property %s\n" % pname) 183 184 185def GetShortPropertyName(pname): 186 if pname in _null_values: return pname # pname is already the short name. 187 prop = GetProperty(pname) 188 if not prop: return "" # For ignored properties. 189 return prop[1][0] or prop[1][1] # Long name if no short name. 190 191 192def GetShortPropertyValueName(prop, vname): 193 if vname in prop[2]: return vname 194 values = prop[3] 195 aliases = values.get(vname) 196 if aliases == None: 197 norm_name = NormPropName(vname) 198 aliases = values.get(norm_name) 199 if aliases == None: 200 raise NameError("unknown value name %s for property %s\n" % 201 (vname, prop[1][0])) 202 values[vname] = aliases 203 return aliases[0] or aliases[1] # Long name if no short name. 204 205 206def NormalizePropertyValue(prop, vname): 207 if prop[2]: # Binary/Catalog/Enumerated property. 
        value = GetShortPropertyValueName(prop, vname)
        if prop[0] == "Binary":
            value = value == "Y"
        if prop[1][0].endswith("ccc"):
            value = int(value)
    else:
        value = vname
    return value

# Character data ----------------------------------------------------------- ***

# Lists of NamesList h1 and h2 headings.
# Each h1 value is a (start, end, comment) tuple.
# Each h2 value is a (cp, comment) tuple.
_h1 = []
_h2 = []

# List of Unicode blocks.
# Each item is a tuple of start & end code point integers
# and a dictionary of default property values.
_blocks = []

# List of ranges with algorithmic names.
# Each value is a list of [start, end, type, prefix]
# where prefix is optional.
_alg_names_ranges = []

# List of Unicode character ranges and their properties,
# stored as an inversion map with range_start & props dictionary.
# Starts with one range for all of Unicode without any properties.
# Setting values subdivides ranges.
_starts = array.array('l', [0, 0x110000])  # array of int32_t
_props = [{}, {}]  # props for 0 and 110000

def FindRange(x):
    """ Binary search for x in the inversion map.
    Returns the smallest i where x < _starts[i]
    minus 1, that is, the index i of the containing range:
    _starts[i] <= x < _starts[i + 1]."""
    return bisect.bisect(_starts, x) - 1


def GetProps(c):
    """Returns the props dictionary of the range containing code point c."""
    i = FindRange(c)
    return _props[i]


def UpdateProps(start, end, update):
    """Applies the (need_to_update, do_update, u) triple in update
    to the code point range [start, end],
    splitting inversion-map ranges as needed."""
    assert 0 <= start <= end <= 0x10ffff
    (need_to_update, do_update, u) = (update[0], update[1], update[2])
    # Find the index i of the range in _starts that contains start.
    i = FindRange(start)
    limit = end + 1
    # Intersect [start, limit[ with ranges in _starts.
    c_start = _starts[i]
    c_limit = _starts[i + 1]
    c_props = _props[i]
    # c_start <= start < c_limit
    if c_start < start:
        update_limit = c_limit if c_limit <= limit else limit
        if need_to_update(u, start, update_limit - 1, c_props):
            # Split off [c_start, start[ with a copy of c_props.
            i += 1
            c_props = c_props.copy()
            _starts.insert(i, start)
            _props.insert(i, c_props)
            c_start = start
    # Modify all ranges that are fully inside [start, limit[.
    while c_limit <= limit:
        # start <= c_start < c_limit <= limit
        if need_to_update(u, c_start, c_limit - 1, c_props):
            do_update(u, c_start, c_limit - 1, c_props)
        if c_limit == 0x110000: return
        i += 1
        c_start = c_limit
        c_limit = _starts[i + 1]
        c_props = _props[i]
    if c_start < limit and need_to_update(u, c_start, limit - 1, c_props):
        # Split off [limit, c_limit[ with a copy of c_props.
        _starts.insert(i + 1, limit)
        _props.insert(i + 1, c_props.copy())
        # Modify [c_start, limit[ c_props.
        do_update(u, c_start, limit - 1, c_props)


def NeedToSetProps(props, start, end, c_props):
    """Returns True if props is not a sub-dict of c_props."""
    for (pname, value) in props.items():
        if pname not in c_props or value != c_props[pname]: return True
    return False


def DoSetProps(props, start, end, c_props):
    """Merges props into the range's c_props dict."""
    c_props.update(props)


def SetProps(start, end, props):
    """Sets all of the props key-value pairs on the range [start, end]."""
    UpdateProps(start, end, (NeedToSetProps, DoSetProps, props))


def NeedToSetAlways(nv, start, end, c_props):
    """need_to_update callback for unconditional updates."""
    return True


# For restoring boundaries after merging adjacent same-props ranges.
def AddBoundary(x):
    """Ensure that there is a range start/limit at x."""
    assert 0 <= x <= 0x10ffff
    i = FindRange(x)
    if _starts[i] == x: return
    # Split the range at x.
    c_start = _starts[i]
    c_limit = _starts[i + 1]
    c_props = _props[i]
    # c_start < x < c_limit
    i += 1
    _starts.insert(i, x)
    _props.insert(i, c_props.copy())


def SetDefaultValue(pname, value):
    """Sets the property's default value. Ignores null values."""
    prop = GetProperty(pname)
    if prop and value not in _null_names:
        value = NormalizePropertyValue(prop, value)
        if value != _null_values[prop[1][0]]:
            _defaults[prop[1][0]] = value
            SetProps(0, 0x10ffff, {prop[1][0]: value})


def SetBinaryPropertyToTrue(pname, start, end):
    """Sets the binary property pname to True on [start, end];
    no-op if the property is ignored."""
    prop = GetProperty(pname)
    if prop:
        assert prop[0] == "Binary"
        SetProps(start, end, {prop[1][0]: True})


def SetPropValue(prop, vname, start, end):
    """Sets the already-looked-up prop to the normalized vname on [start, end]."""
    value = NormalizePropertyValue(prop, vname)
    SetProps(start, end, {prop[1][0]: value})


def SetPropertyValue(pname, vname, start, end):
    """Sets the property pname to vname on [start, end];
    no-op if the property is ignored."""
    prop = GetProperty(pname)
    if prop: SetPropValue(prop, vname, start, end)

# Parsing ------------------------------------------------------------------ ***

# A single code point, e.g. "10330".
_stripped_cp_re = re.compile("([0-9a-fA-F]+)$")
# A code point range, e.g. "10330..1034A".
_stripped_range_re = re.compile("([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)$")
# Default value for all of Unicode.
_missing_re = re.compile("# *@missing: *0000\.\.10FFFF *; *(.+)$")
# Default value for some range.
_missing2_re = re.compile("# *@missing: *(.+)$")

def ReadUCDLines(in_file, want_ranges=True, want_other=False,
                 want_comments=False, want_missing=False):
    """Parses lines from a semicolon-delimited UCD text file.
    Strips comments, ignores empty and all-comment lines.
    Returns a tuple (type, line, ...).
    """
    for line in in_file:
        line = line.strip()
        if not line: continue
        if line.startswith("#"):  # whole-line comment
            parse_data = False
            if want_missing:
                match = _missing_re.match(line)
                if match:
                    fields = match.group(1).split(";")
                    for i in range(len(fields)): fields[i] = fields[i].strip()
                    yield ("missing", line, fields)
                    continue
                match = _missing2_re.match(line)
                if match:
                    # Strip the "missing" comment prefix and fall through to
                    # parse the remainder of the line like regular data.
                    parse_data = True
                    line = match.group(1)
            if not parse_data:
                if want_comments: yield ("comment", line)
                continue
        comment_start = line.find("#")  # inline comment
        if comment_start >= 0:
            line = line[:comment_start].rstrip()
            if not line: continue
        fields = line.split(";")
        for i in range(len(fields)): fields[i] = fields[i].strip()
        if want_ranges:
            first = fields[0]
            match = _stripped_range_re.match(first)
            if match:
                start = int(match.group(1), 16)
                end = int(match.group(2), 16)
                yield ("range", line, start, end, fields)
                continue
            match = _stripped_cp_re.match(first)
            if match:
                c = int(match.group(1), 16)
                yield ("range", line, c, c, fields)
                continue
        if want_other:
            yield ("other", line, fields)
        else:
            raise SyntaxError("unable to parse line\n  %s\n" % line)


def AddBinaryProperty(short_name, long_name):
    # Registers a new binary property, reusing the Y/N values dicts
    # of the existing "Math" binary property.
    _null_values[short_name] = False
    bin_prop = _properties["Math"]
    prop = ("Binary", [short_name, long_name], bin_prop[2], bin_prop[3])
    _properties[short_name] = prop
    _properties[long_name] = prop
    _properties[NormPropName(short_name)] = prop
    _properties[NormPropName(long_name)] = prop


def AddSingleNameBinaryProperty(name):
    # For some properties, the short name is the same as the long name.
    _null_values[name] = False
    bin_prop = _properties["Math"]
    prop = ("Binary", [name, name], bin_prop[2], bin_prop[3])
    _properties[name] = prop
    _properties[NormPropName(name)] = prop


def AddPOSIXBinaryProperty(name):
    # We only define a long name for ICU-specific (non-UCD) POSIX properties.
    _null_values[name] = False
    bin_prop = _properties["Math"]
    prop = ("Binary", ["", name], bin_prop[2], bin_prop[3])
    _properties[name] = prop
    _properties[NormPropName(name)] = prop
    # This is to match UProperty UCHAR_POSIX_ALNUM etc.
    _properties["posix" + NormPropName(name)] = prop


# Match a comment line like
# PropertyAliases-6.1.0.txt
# and extract the Unicode version.
_ucd_version_re = re.compile("# *PropertyAliases" +
                             "-([0-9]+(?:\\.[0-9]+)*)(?:d[0-9]+)?" +
                             "\\.txt")

def ParsePropertyAliases(in_file):
    """Parses PropertyAliases.txt, registering all UCD properties,
    then adds provisional and ICU-specific properties."""
    global _ucd_version
    # Type-specific null values, chosen when a property of that type
    # is first seen; see the _null_values module comment.
    prop_type_nulls = {
        "Binary": False,
        "Catalog": "??",  # Must be specified, e.g., in @missing line.
        "Enumerated": "??",  # Must be specified.
        "Numeric": "NaN",
        "String": "",
        "Miscellaneous": ""
    }
    for data in ReadUCDLines(in_file, want_ranges=False,
                             want_other=True, want_comments=True):
        if data[0] == "comment":
            line = data[1]
            match = _ucd_version_re.match(line)
            if match:
                _ucd_version = match.group(1)
            else:
                words = line[1:].lstrip().split()
                if len(words) == 2 and words[1] == "Properties":
                    prop_type = words[0]
                    null_value = prop_type_nulls[prop_type]
        else:
            # type == "other"
            # NOTE(review): prop_type/null_value are bound by the preceding
            # "XYZ Properties" header comment; assumes a data line never
            # precedes the first such header -- TODO confirm for all inputs.
            aliases = data[2]
            name = aliases[0]
            if name in _ignored_properties:
                for alias in aliases:
                    _ignored_properties.add(alias)
                    _ignored_properties.add(NormPropName(alias))
            else:
                if name.endswith("ccc"):
                    _null_values[name] = 0
                else:
                    _null_values[name] = null_value
                prop = (prop_type, aliases, set(), {})
                for alias in aliases:
                    _properties[alias] = prop
                    _properties[NormPropName(alias)] = prop
    # Add provisional and ICU-specific properties we need.
    # We add some in support of runtime API, even if we do not write
    # data for them to ppucd.txt (e.g., lccc & tccc).
    # We add others just to represent UCD data that contributes to
    # some functionality, although Unicode has not "blessed" them
    # as separate properties (e.g., Turkic_Case_Folding).

    # Turkic_Case_Folding: The 'T' mappings in CaseFolding.txt.
    name = "Turkic_Case_Folding"
    _null_values[name] = ""
    prop = ("String", [name, name], set(), {})
    _properties[name] = prop
    _properties[NormPropName(name)] = prop
    # Conditional_Case_Mappings: SpecialCasing.txt lines with conditions.
    name = "Conditional_Case_Mappings"
    _null_values[name] = ""
    prop = ("Miscellaneous", [name, name], set(), {})
    _properties[name] = prop
    _properties[NormPropName(name)] = prop
    # lccc = ccc of first cp in canonical decomposition.
    _null_values["lccc"] = 0
    ccc_prop = list(_properties["ccc"])
    ccc_prop[1] = ["lccc", "Lead_Canonical_Combining_Class"]
    prop = tuple(ccc_prop)
    _properties["lccc"] = prop
    _properties["Lead_Canonical_Combining_Class"] = prop
    _properties["leadcanonicalcombiningclass"] = prop
    # tccc = ccc of last cp in canonical decomposition.
    _null_values["tccc"] = 0
    ccc_prop[1] = ["tccc", "Trail_Canonical_Combining_Class"]
    prop = tuple(ccc_prop)
    _properties["tccc"] = prop
    _properties["Trail_Canonical_Combining_Class"] = prop
    _properties["trailcanonicalcombiningclass"] = prop
    # Script_Extensions
    if "scx" not in _properties:
        _null_values["scx"] = ""
        prop = ("Miscellaneous", ["scx", "Script_Extensions"], set(), {})
        _properties["scx"] = prop
        _properties["Script_Extensions"] = prop
        _properties["scriptextensions"] = prop
    # General Category as a bit mask.
    # Shares the gc property's value-name sets/dicts.
    _null_values["gcm"] = "??"
    gc_prop = _properties["gc"]
    prop = ("Bitmask", ["gcm", "General_Category_Mask"], gc_prop[2], gc_prop[3])
    _properties["gcm"] = prop
    _properties["General_Category_Mask"] = prop
    _properties["generalcategorymask"] = prop
    # Various binary properties.
    AddBinaryProperty("Sensitive", "Case_Sensitive")
    AddBinaryProperty("nfdinert", "NFD_Inert")
    AddBinaryProperty("nfkdinert", "NFKD_Inert")
    AddBinaryProperty("nfcinert", "NFC_Inert")
    AddBinaryProperty("nfkcinert", "NFKC_Inert")
    AddBinaryProperty("segstart", "Segment_Starter")
    # https://www.unicode.org/reports/tr51/#Emoji_Properties
    AddBinaryProperty("Emoji", "Emoji")
    AddBinaryProperty("EPres", "Emoji_Presentation")
    AddBinaryProperty("EMod", "Emoji_Modifier")
    AddBinaryProperty("EBase", "Emoji_Modifier_Base")
    AddBinaryProperty("EComp", "Emoji_Component")
    AddBinaryProperty("ExtPict", "Extended_Pictographic")
    # https://www.unicode.org/reports/tr51/#Emoji_Sets
    AddSingleNameBinaryProperty("Basic_Emoji")
    AddSingleNameBinaryProperty("Emoji_Keycap_Sequence")
    AddSingleNameBinaryProperty("RGI_Emoji_Modifier_Sequence")
    AddSingleNameBinaryProperty("RGI_Emoji_Flag_Sequence")
    AddSingleNameBinaryProperty("RGI_Emoji_Tag_Sequence")
    AddSingleNameBinaryProperty("RGI_Emoji_ZWJ_Sequence")
    AddSingleNameBinaryProperty("RGI_Emoji")
    # C/POSIX character classes that do not have Unicode property [value] aliases.
    # See uchar.h.
    AddPOSIXBinaryProperty("alnum")
    AddPOSIXBinaryProperty("blank")
    AddPOSIXBinaryProperty("graph")
    AddPOSIXBinaryProperty("print")
    AddPOSIXBinaryProperty("xdigit")


def ParsePropertyValueAliases(in_file):
    """Parses PropertyValueAliases.txt: value names & aliases per property,
    plus @missing default values."""
    global _binary_values
    for data in ReadUCDLines(in_file, want_ranges=False,
                             want_other=True, want_missing=True):
        if data[0] == "missing":
            SetDefaultValue(data[2][0], data[2][1])
        else:
            # type == "other"
            fields = data[2]
            pname = fields[0]
            prop = GetProperty(pname)
            if prop:
                del fields[0]  # Only the list of aliases remains.
                short_name = fields[0]
                if short_name == "n/a":  # no short name
                    fields[0] = ""
                    short_name = fields[1]
                prop[2].add(short_name)
                values = prop[3]
                for alias in fields:
                    if alias:
                        values[alias] = fields
                        values[NormPropName(alias)] = fields
                if prop[0] == "Binary" and not _binary_values:
                    _binary_values = values
    # Some of the @missing lines with non-null default property values
    # are in files that we do not parse;
    # either because the data for that property is easily
    # (i.e., the @missing line would be the only reason to parse such a file)
    # or because we compute the property at runtime,
    # such as the Hangul_Syllable_Type.
    if "dt" not in _defaults:  # DerivedDecompositionType.txt
        _defaults["dt"] = "None"
    if "nt" not in _defaults:  # DerivedNumericType.txt
        _defaults["nt"] = "None"
    if "hst" not in _defaults:  # HangulSyllableType.txt
        _defaults["hst"] = "NA"
    if "gc" not in _defaults:  # No @missing line in any .txt file?
        _defaults["gc"] = "Cn"
    # Copy the gc default value to gcm.
    _defaults["gcm"] = _defaults["gc"]
    # Add ISO 15924-only script codes.
    # Only for the ICU script code API, not necessary for parsing the UCD.
    script_prop = _properties["sc"]
    short_script_names = script_prop[2]  # set
    script_values = script_prop[3]  # dict
    remove_scripts = []
    for script in _scripts_only_in_iso15924:
        if script in short_script_names:
            remove_scripts.append(script)
        else:
            short_script_names.add(script)
            # Do not invent a Unicode long script name before the UCD adds the script.
            script_list = [script, script]  # [short, long]
            script_values[script] = script_list
            # Probably not necessary because
            # we will not parse these scripts from the UCD:
            script_values[NormPropName(script)] = script_list
    if remove_scripts:
        raise ValueError(
            "remove %s from _scripts_only_in_iso15924" % remove_scripts)


def ParseBlocks(in_file):
    """Parses Blocks.txt into _blocks and the blk property;
    checks that blocks do not overlap."""
    for data in ReadUCDLines(in_file, want_missing=True):
        if data[0] == "missing":
            SetDefaultValue("blk", data[2][0])
        else:
            # type == "range"
            (start, end, name) = (data[2], data[3], data[4][1])
            _blocks.append((start, end, {"blk": name}))
            SetPropertyValue("blk", name, start, end)
    _blocks.sort()
    # Check for overlapping blocks.
    prev_end = -1
    for b in _blocks:
        start = b[0]
        end = b[1]
        if prev_end >= start:
            # NOTE(review): this error message is malformed -- "%" binds
            # tighter than "+", so only the second literal is formatted,
            # and there are 5 placeholders for 4 args; formatting would
            # raise TypeError. Unreachable with valid Blocks.txt data,
            # but fix when this function is next touched.
            raise ValueError(
                "block %04lX..%04lX %s overlaps with another " +
                "ending at %04lX\n  %s\n" %
                (start, end, b[2]["blk"], prev_end))
        prev_end = end


def ParseUnicodeData(in_file):
    """Parses UnicodeData.txt: name, gc, ccc, bc, decomposition,
    numeric type/value, Bidi_Mirrored and simple case mappings.
    Merges <..., First>/<..., Last> range pairs into single ranges."""
    dt_prop = GetProperty("dt")
    range_first_line = ""
    range_first = -1  # Start of a pending First/Last range, or -1.
    for data in ReadUCDLines(in_file, want_missing=True):
        # type == "range"
        (line, c, end, fields) = (data[1], data[2], data[3], data[4])
        assert c == end
        name = fields[1]
        if name.startswith("<"):
            if name.endswith(", First>"):
                if range_first >= 0:
                    raise SyntaxError(
                        "error: unterminated range started at\n  %s\n" %
                        range_first_line)
                range_first = c
                range_first_line = line
                continue
            elif name.endswith(", Last>"):
                if range_first < 0:
                    raise SyntaxError(
                        "error: range end without start at\n  %s\n" %
                        line)
                elif range_first > c:
                    raise SyntaxError(
                        "error: range start/end out of order at\n  %s\n  %s\n" %
                        (range_first_line, line))
                first_name = range_first_line.split(";")[1][1:-8]
                name = name[1:-7]
                if first_name != name:
                    raise SyntaxError(
                        "error: range start/end name mismatch at\n  %s\n  %s\n" %
                        (range_first_line, line))
                end = c
                c = range_first
                range_first = -1
                # Remember algorithmic name ranges.
                if "Ideograph" in name:
                    prefix = "CJK UNIFIED IDEOGRAPH-"
                    if c == 0x17000 or c == 0x18D00: prefix = "TANGUT IDEOGRAPH-"
                    _alg_names_ranges.append([c, end, "han", prefix])
                elif name == "Hangul Syllable":
                    _alg_names_ranges.append([c, end, "hangul"])
                name = ""
            else:
                # Ignore non-names like <control>.
                name = ""
        props = {}
        if name: props["na"] = name
        props["gc"] = fields[2]
        ccc = int(fields[3])
        if ccc: props["ccc"] = ccc
        props["bc"] = fields[4]
        # Decomposition type & mapping.
        dm = fields[5]
        if dm:
            if dm.startswith("<"):
                dt_limit = dm.index(">")
                dt = NormalizePropertyValue(dt_prop, dm[1:dt_limit])
                dm = dm[dt_limit + 1:].lstrip()
            else:
                dt = "Can"
            props["dt"] = dt
            props["dm"] = dm
        # Numeric type & value.
        decimal = fields[6]
        digit = fields[7]
        nv = fields[8]
        if (decimal and decimal != nv) or (digit and digit != nv):
            raise SyntaxError("error: numeric values differ at\n  %s\n" % line)
        if nv:
            # Map improper fractions to proper ones.
            # U+109F7 MEROITIC CURSIVE FRACTION TWO TWELFTHS
            # .. U+109FF MEROITIC CURSIVE FRACTION TEN TWELFTHS
            if nv == "2/12":
                nv = "1/6"
            elif nv == "3/12":
                nv = "1/4"
            elif nv == "4/12":
                nv = "1/3"
            elif nv == "6/12":
                nv = "1/2"
            elif nv == "8/12":
                nv = "2/3"
            elif nv == "9/12":
                nv = "3/4"
            elif nv == "10/12":
                nv = "5/6"
            props["nv"] = nv
            props["nt"] = "De" if decimal else "Di" if digit else "Nu"
        if fields[9] == "Y": props["Bidi_M"] = True
        # ICU 49 and above does not support Unicode_1_Name any more.
        # See ticket #9013.
        # na1 = fields[10]
        # if na1: props["na1"] = na1
        # ISO_Comment is deprecated and has no values.
750 # isc = fields[11] 751 # if isc: props["isc"] = isc 752 # Simple case mappings. 753 suc = fields[12] 754 slc = fields[13] 755 stc = fields[14] 756 if suc: props["suc"] = suc 757 if slc: props["slc"] = slc 758 if stc: props["stc"] = stc 759 SetProps(c, end, props) 760 if range_first >= 0: 761 raise SyntaxError( 762 "error: unterminated range started at\n %s\n" % 763 range_first_line) 764 # Hangul syllables have canonical decompositions which are not listed in UnicodeData.txt. 765 SetPropertyValue("dt", "Can", 0xac00, 0xd7a3) 766 _alg_names_ranges.sort() 767 768 769_names_h1_re = re.compile("@@\t([0-9a-fA-F]+)\t(.+?)\t([0-9a-fA-F]+)$") 770_names_h2_re = re.compile("@\t\t(.+)") 771_names_char_re = re.compile("([0-9a-fA-F]+)\t.+") 772 773def ParseNamesList(in_file): 774 pending_h2 = "" 775 for line in in_file: 776 line = line.strip() 777 if not line: continue 778 match = _names_h1_re.match(line) 779 if match: 780 pending_h2 = "" # Drop a pending h2 when we get to an h1. 781 start = int(match.group(1), 16) 782 end = int(match.group(3), 16) 783 comment = match.group(2).replace(u"\xa0", " ") 784 _h1.append((start, end, comment)) 785 continue 786 match = _names_h2_re.match(line) 787 if match: 788 pending_h2 = match.group(1).replace(u"\xa0", " ") 789 continue 790 if pending_h2: 791 match = _names_char_re.match(line) 792 if match: 793 c = int(match.group(1), 16) 794 _h2.append((c, pending_h2)) 795 pending_h2 = "" 796 _h1.sort() 797 _h2.sort() 798 799 800def ParseNamedProperties(in_file): 801 """Parses a .txt file where the first column is a code point range 802 and the second column is a property name. 
    Sets binary properties to True,
    and other properties to the values in the third column."""
    for data in ReadUCDLines(in_file, want_missing=True):
        if data[0] == "missing":
            SetDefaultValue(data[2][0], data[2][1])
        else:
            # type == "range"
            if len(data[4]) == 2:
                SetBinaryPropertyToTrue(data[4][1], data[2], data[3])
            else:
                SetPropertyValue(data[4][1], data[4][2], data[2], data[3])


def ParseOneProperty(in_file, pname):
    """Parses a .txt file where the first column is a code point range
    and the second column is the value of a known property."""
    prop = GetProperty(pname)
    for data in ReadUCDLines(in_file, want_missing=True):
        if data[0] == "missing":
            SetDefaultValue(pname, data[2][0])
        else:
            # type == "range"
            SetPropValue(prop, data[4][1], data[2], data[3])


# One thin wrapper per single-property UCD file.
def ParseBidiMirroring(in_file): ParseOneProperty(in_file, "bmg")
def ParseDerivedAge(in_file): ParseOneProperty(in_file, "age")
def ParseDerivedBidiClass(in_file): ParseOneProperty(in_file, "bc")
def ParseDerivedJoiningGroup(in_file): ParseOneProperty(in_file, "jg")
def ParseDerivedJoiningType(in_file): ParseOneProperty(in_file, "jt")
def ParseEastAsianWidth(in_file): ParseOneProperty(in_file, "ea")
def ParseGraphemeBreakProperty(in_file): ParseOneProperty(in_file, "GCB")
def ParseIndicPositionalCategory(in_file): ParseOneProperty(in_file, "InPC")
def ParseIndicSyllabicCategory(in_file): ParseOneProperty(in_file, "InSC")
def ParseLineBreak(in_file): ParseOneProperty(in_file, "lb")
def ParseScripts(in_file): ParseOneProperty(in_file, "sc")
def ParseScriptExtensions(in_file): ParseOneProperty(in_file, "scx")
def ParseSentenceBreak(in_file): ParseOneProperty(in_file, "SB")
def ParseVerticalOrientation(in_file): ParseOneProperty(in_file, "vo")
def ParseWordBreak(in_file): ParseOneProperty(in_file, "WB")


def DoSetNameAlias(alias, start, end, c_props):
    # Appends to the comma-separated list of Name_Alias values.
    if "Name_Alias" in c_props:
        c_props["Name_Alias"] += ',' + alias
    else:
        c_props["Name_Alias"] = alias


def ParseNameAliases(in_file):
    """Parses Name_Alias from NameAliases.txt.
    A character can have multiple aliases.

    In Unicode 6.0, there are two columns,
    with a name correction in the second column.

    In Unicode 6.1, there are three columns.
    The second contains an alias, the third its type.
    The documented types are:
    correction, control, alternate, figment, abbreviation

    This function does not sort the types, assuming they appear in this order."""
    for data in ReadUCDLines(in_file):
        start = data[2]
        end = data[3]
        if start != end:
            raise ValueError("NameAliases.txt has an alias for a range %04lX..%04lX" %
                             (start, end))
        fields = data[4]
        if len(fields) == 2:
            alias = "correction=" + fields[1]
        else:
            alias = fields[2] + '=' + fields[1]
        update = (NeedToSetAlways, DoSetNameAlias, alias)
        UpdateProps(start, end, update)


def NeedToSetNumericValue(nv, start, end, c_props):
    """Returns True if the range has no nv yet;
    raises if it has a conflicting nv from UnicodeData.txt."""
    c_nv = c_props.get("nv")
    if c_nv == None:
        # DerivedNumericValues.txt adds a Numeric_Value.
        assert "nt" not in c_props
        return True
    if nv != c_nv:
        raise ValueError(("UnicodeData.txt has nv=%s for %04lX..%04lX " +
                          "but DerivedNumericValues.txt has nv=%s") %
                         (c_nv, start, end, nv))
    return False


def DoSetNumericValue(nv, start, end, c_props):
    c_props.update({"nt": "Nu", "nv": nv})


def ParseDerivedNumericValues(in_file):
    """Parses DerivedNumericValues.txt.
    For most characters, the numeric type & value were parsed previously
    from UnicodeData.txt but that does not show the values for Han characters.
    Here we check that values match those from UnicodeData.txt
    and add new ones."""
    # Ignore the @missing line which has an incorrect number of fields,
    # and the "NaN" in the wrong field (at least in Unicode 5.1..6.1).
905 # Also, "NaN" is just the Numeric null value anyway. 906 for data in ReadUCDLines(in_file): 907 # Conditional update to the numeric value in the 4th field. 908 update = (NeedToSetNumericValue, DoSetNumericValue, data[4][3]) 909 UpdateProps(data[2], data[3], update) 910 911 912def ParseCaseFolding(in_file): 913 for data in ReadUCDLines(in_file, want_missing=True): 914 if data[0] == "missing": 915 assert data[2][0] == "C" # common to scf & cf 916 SetDefaultValue("scf", data[2][1]) 917 SetDefaultValue("cf", data[2][1]) 918 else: 919 # type == "range" 920 start = data[2] 921 end = data[3] 922 status = data[4][1] 923 mapping = data[4][2] 924 assert status in "CSFT" 925 if status == "C": 926 SetProps(start, end, {"scf": mapping, "cf": mapping}) 927 elif status == "S": 928 SetPropertyValue("scf", mapping, start, end) 929 elif status == "F": 930 SetPropertyValue("cf", mapping, start, end) 931 else: # status == "T" 932 SetPropertyValue("Turkic_Case_Folding", mapping, start, end) 933 934 935def DoSetConditionalCaseMappings(ccm, start, end, c_props): 936 if "Conditional_Case_Mappings" in c_props: 937 c_props["Conditional_Case_Mappings"] += ',' + ccm 938 else: 939 c_props["Conditional_Case_Mappings"] = ccm 940 941 942def ParseSpecialCasing(in_file): 943 for data in ReadUCDLines(in_file, want_missing=True): 944 if data[0] == "missing": 945 SetDefaultValue("lc", data[2][0]) 946 SetDefaultValue("tc", data[2][1]) 947 SetDefaultValue("uc", data[2][2]) 948 else: 949 # type == "range" 950 start = data[2] 951 end = data[3] 952 fields = data[4] 953 if len(fields) < 5 or not fields[4]: 954 # Unconditional mappings. 
                SetProps(start, end, {"lc": fields[1], "tc": fields[2], "uc": fields[3]})
            else:
                # Conditional_Case_Mappings
                ccm = (fields[4] + ":lc=" + fields[1] +
                       "&tc=" + fields[2] + "&uc=" + fields[3])
                update = (NeedToSetAlways, DoSetConditionalCaseMappings, ccm)
                UpdateProps(start, end, update)


def ParseBidiBrackets(in_file):
    """Parses BidiBrackets.txt into the bpb (paired bracket)
    and bpt (paired bracket type) properties."""
    for data in ReadUCDLines(in_file, want_missing=True):
        if data[0] == "missing":
            SetDefaultValue("bpt", data[2][1])
        else:
            # type == "range"
            start = data[2]
            end = data[3]
            assert start == end  # brackets are single code points, not ranges
            mapping = data[4][1]
            bracket_type = data[4][2]
            SetProps(start, end, {"bpb": mapping, "bpt": bracket_type})

# Postprocessing ----------------------------------------------------------- ***

def PrintedSize(pname, value):
    """Returns the number of characters that ";pname=value"
    (or ";pname"/";-pname" for a binary property) occupies in ppucd.txt."""
    if isinstance(value, bool):
        if value:
            return len(pname) + 1  # ";pname"
        else:
            return len(pname) + 2  # ";-pname"
    else:
        return len(pname) + len(str(value)) + 2  # ";pname=value"


def CompactBlock(b, i):
    """Minimizes the output size for block b by moving the most
    output-size-saving property values onto the block itself and
    removing now-redundant values from the per-range properties.
    Returns the _starts index of the first range after this block."""
    assert b[0] == _starts[i]
    b_props = b[2]  # Normally just blk from Blocks.txt.
    # b_props["blk"] has not been canonicalized yet.
    b_props["blk"] = _props[i]["blk"]
    orig_i = i
    # Count the number of occurrences of each property's value in this block.
    # To minimize the output, count the number of assigned ranges,
    # not the number of code points.
    num_ranges = 0
    prop_counters = {}
    if "gc" in b_props:
        b_is_unassigned = b_props["gc"] == "Cn"  # Unreachable with normal data.
    else:
        b_is_unassigned = _defaults["gc"] == "Cn"  # This is expected to be true.
    while True:
        start = _starts[i]
        if start > b[1]: break
        props = _props[i]
        if "gc" in props:
            is_unassigned = props["gc"] == "Cn"
        else:
            is_unassigned = b_is_unassigned
        if is_unassigned:
            # Compact an unassigned range inside the block and
            # mark it to be written with "unassigned".
            # It falls back to default properties, not block properties,
            # except for the blk=Block property.
            assert props["blk"] == b_props["blk"]
            del props["blk"]
            for pname in list(props.keys()):  # .keys() is a copy so we can del props[pname].
                if props[pname] == _null_or_defaults[pname]: del props[pname]
            # What remains are unusual default values for unassigned code points.
            # For example, bc=R or lb=ID.
            # See http://www.unicode.org/reports/tr44/#Default_Values_Table
            props["unassigned"] = True
        else:
            for (pname, value) in props.items():
                if pname in prop_counters:
                    counter = prop_counters[pname]
                else:
                    # New counter: earlier ranges implicitly had the default value.
                    counter = {_null_or_defaults[pname]: num_ranges}
                    prop_counters[pname] = counter
                if value in counter:
                    counter[value] += 1
                else:
                    counter[value] = 1
            # Also count default values for properties that do not occur in a range.
            for pname in prop_counters:
                if pname not in props:
                    counter = prop_counters[pname]
                    value = _null_or_defaults[pname]
                    counter[value] += 1
            num_ranges += 1
            # Invariant: For each counter, the sum of counts must equal num_ranges.
        i += 1
    # For each property that occurs within this block,
    # set the value that reduces the file size the most as a block property value.
    # This is usually the most common value.
    for (pname, counter) in prop_counters.items():
        default_value = _null_or_defaults[pname]
        default_size = PrintedSize(pname, default_value) * counter[default_value]
        max_value = None
        max_count = 0
        max_savings = 0
        for (value, count) in counter.items():
            if value != default_value and count > 1:
                # Does the file get smaller by setting the block default?
                # We save writing the block value as often as it occurs,
                # minus once for writing it for the block,
                # minus writing the default value instead.
                savings = PrintedSize(pname, value) * (count - 1) - default_size
                # For two values with the same savings, pick the one that compares lower,
                # to make this deterministic (avoid flip-flopping).
                if (savings > max_savings or
                        (savings > 0 and savings == max_savings and value < max_value)):
                    max_value = value
                    max_count = count
                    max_savings = savings
        # Do not compress uncompressible properties,
        # with an exception for many empty-string values in a block
        # (NFKC_CF='' for tags and variation selectors).
        if (max_savings > 0 and
                ((pname not in _uncompressible_props) or
                 (max_value == '' and max_count >= 12))):
            b_props[pname] = max_value
    # For each range and property, remove the default+block value
    # but set the default value if that property was not set
    # (i.e., it used to inherit the default value).
    b_defaults = _null_or_defaults.copy()
    b_defaults.update(b_props)
    i = orig_i
    while True:
        start = _starts[i]
        if start > b[1]: break
        props = _props[i]
        if "unassigned" not in props:
            # Compact an assigned range inside the block.
            for pname in prop_counters:
                if pname in props:
                    if props[pname] == b_defaults[pname]: del props[pname]
                elif pname in b_props:
                    # b_props only has non-default values.
                    # Set the default value if it used to be inherited.
                    props[pname] = _null_or_defaults[pname]
            # If there is only one assigned range, then move all of its properties
            # to the block.
            if num_ranges == 1:
                b_props.update(props)
                props.clear()
        i += 1
    # Return the _starts index of the first range after this block.
    return i


def CompactNonBlock(limit, i):
    """Remove default property values from between-block ranges."""
    default_is_unassigned = _defaults["gc"] == "Cn"  # This is expected to be true.
    while True:
        start = _starts[i]
        if start >= limit: break
        props = _props[i]
        if "gc" in props:
            is_unassigned = props["gc"] == "Cn"
        else:
            is_unassigned = default_is_unassigned
        for pname in list(props.keys()):  # .keys() is a copy so we can del props[pname].
            if props[pname] == _null_or_defaults[pname]: del props[pname]
        assert "blk" not in props  # between-block ranges have no block property
        # If there are no props left, then nothing will be printed.
        # Otherwise, add "unassigned" for more obvious output.
        if props and is_unassigned:
            props["unassigned"] = True
        i += 1
    # Return the _starts index of the first range after this block.
    return i


def CompactBlocks():
    """Optimizes block properties.
    Sets properties on blocks to the most commonly used values,
    and removes default+block values from code point properties."""
    # Ensure that there is a boundary in _starts for each block
    # so that the simple mixing method below works.
    for b in _blocks:
        AddBoundary(b[0])
        limit = b[1] + 1
        if limit <= 0x10ffff: AddBoundary(limit)
    # Walk through ranges and blocks together.
    i = 0
    for b in _blocks:
        b_start = b[0]
        if _starts[i] < b_start:
            i = CompactNonBlock(b_start, i)
        i = CompactBlock(b, i)
    CompactNonBlock(0x110000, i)

# Output ------------------------------------------------------------------- ***

def AppendRange(fields, start, end):
    """Appends "start" or "start..end" (hex) to the output fields."""
    if start == end:
        fields.append("%04lX" % start)
    else:
        fields.append("%04lX..%04lX" % (start, end))


def AppendProps(fields, props):
    """Appends "pname"/"-pname"/"pname=value" strings to the output fields."""
    # Sort property names (props keys) by their normalized forms
    # and output properties in that order.
    for pname in sorted(props, key=NormPropName):
        value = props[pname]
        if isinstance(value, bool):
            if not value: pname = "-" + pname
            fields.append(pname)
        else:
            fields.append("%s=%s" % (pname, value))


def WriteFieldsRangeProps(fields, start, end, props, out_file):
    """Writes one ppucd.txt line: fields + range + properties, ;-separated."""
    AppendRange(fields, start, end)
    AppendProps(fields, props)
    out_file.write(";".join(fields))
    out_file.write("\n")


def EscapeNonASCII(s):
    """Returns s with non-ASCII characters replaced by \\uhhhh or \\Uhhhhhhhh escapes."""
    i = 0
    while i < len(s):
        c = ord(s[i])
        if c <= 0x7f:
            i = i + 1
        else:
            if c <= 0xffff:
                esc = u"\\u%04X" % c
            else:
                esc = u"\\U%08X" % c
            s = s[:i] + esc + s[i+1:]
            i = i + len(esc)  # skip over the escape we just inserted
    return s


def WritePreparsedUCD(out_file):
    """Writes the ppucd.txt (preparsed UCD) data to out_file."""
    out_file.write("""# Preparsed UCD generated by ICU preparseucd.py
# Copyright (C) 1991 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
""");
    out_file.write("ucd;%s\n\n" % _ucd_version)
    # Sort property names (props keys) by their normalized forms
    # and output properties in that order.
    pnames = sorted(_null_values, key=NormPropName)
    for pname in pnames:
        prop = _properties[pname]
        out_file.write(";".join(["property", prop[0]] + prop[1]))
        out_file.write("\n")
    out_file.write("\n")
    out_file.write(";".join(["binary"] + _binary_values["N"]))
    out_file.write("\n")
    out_file.write(";".join(["binary"] + _binary_values["Y"]))
    out_file.write("\n")
    for pname in pnames:
        prop = _properties[pname]
        short_names = prop[2]
        if short_names and prop[0] != "Binary":
            for name in sorted(short_names):
                out_file.write(";".join(["value", prop[1][0]] + prop[3][name]))
                out_file.write("\n")
    out_file.write("\n")
    # Ensure that there is a boundary in _starts for each
    # range of data we mix into the output,
    # so that the simple mixing method below works.
    for b in _blocks: AddBoundary(b[0])
    for r in _alg_names_ranges: AddBoundary(r[0])
    for h in _h1: AddBoundary(h[0])
    for h in _h2: AddBoundary(h[0])
    # Write the preparsed data. ppucd.txt = preparsed UCD
    # Syntax: http://site.icu-project.org/design/props/ppucd
    WriteFieldsRangeProps(["defaults"], 0, 0x10ffff, _defaults, out_file)
    i_blocks = 0
    i_alg = 0
    i_h1 = 0
    i_h2 = 0
    b_end = -1
    for i in range(len(_starts) - 1):
        start = _starts[i]
        end = _starts[i + 1] - 1
        # Block with default properties.
        if i_blocks < len(_blocks) and start == _blocks[i_blocks][0]:
            b = _blocks[i_blocks]
            b_end = b[1]
            WriteFieldsRangeProps(["\nblock"], b[0], b_end, b[2], out_file)
            i_blocks += 1
        # NamesList h1 heading (for [most of] a block).
        if i_h1 < len(_h1) and start == _h1[i_h1][0]:
            h = _h1[i_h1]
            out_file.write("# %04lX..%04lX %s\n" % (h[0], h[1], EscapeNonASCII(h[2])))
            i_h1 += 1
        # Algorithmic-names range.
        if i_alg < len(_alg_names_ranges) and start == _alg_names_ranges[i_alg][0]:
            r = _alg_names_ranges[i_alg]
            fields = ["algnamesrange"]
            AppendRange(fields, r[0], r[1])
            fields.extend(r[2:])
            out_file.write(";".join(fields))
            out_file.write("\n")
            i_alg += 1
        # NamesList h2 heading.
        if i_h2 < len(_h2) and start == _h2[i_h2][0]:
            out_file.write("# %s\n" % EscapeNonASCII(_h2[i_h2][1]))
            i_h2 += 1
        # Code point/range data.
        props = _props[i]
        # Omit ranges with only default+block properties.
        if props:
            if start > b_end and b_end >= 0:
                # First range with values after the last block.
                # Separate it visually from the block lines.
                out_file.write("\n# No block\n")
                b_end = -1
            if "unassigned" in props:
                # Do not output "unassigned" as a property.
                del props["unassigned"]
                line_type = "unassigned"
            else:
                line_type = "cp"
            WriteFieldsRangeProps([line_type], start, end, props, out_file)

# Write Normalizer2 input files -------------------------------------------- ***
# Ported from gennorm/store.c.

def WriteAllCC(out_file):
    """Writes all ccc values as gennorm2 "cp:ccc" / "start..end:ccc" lines."""
    out_file.write("# Canonical_Combining_Class (ccc) values\n");
    prev_start = 0
    prev_cc = 0
    for i in range(len(_starts)):
        start = _starts[i]
        props = _props[i]
        cc = props.get("ccc")
        if not cc: cc = 0
        if prev_cc != cc:
            # End of a run of equal ccc values; write the previous run if nonzero.
            if prev_cc != 0:
                last_code_point = start - 1
                if prev_start == last_code_point:
                    out_file.write("%04X:%d\n" % (last_code_point, prev_cc))
                else:
                    out_file.write("%04X..%04X:%d\n" %
                                   (prev_start, last_code_point, prev_cc))
            prev_start = start
            prev_cc = cc


def HasMapping(c):
    """Returns True if code point c has a decomposition mapping (dt not None)."""
    props = GetProps(c)
    dt = props.get("dt")
    return dt and dt != "None"


def HasOneWayMapping(c):
    """Returns True if code point c has a one-way (non-round-trip)
    normalization mapping."""
    while True:
        props = GetProps(c)
        dt = props.get("dt")
        if not dt or dt == "None":
            return False  # no mapping
        elif dt == "Can":
            # The canonical decomposition is a one-way mapping if
            # - it does not map to exactly two code points
            # - c has ccc!=0
            # - c has the Composition_Exclusion property
            # - its starter has a one-way mapping (loop for this)
            # - its non-starter decomposes
            nfd = props["dm"].split()
            if (len(nfd) != 2 or
                    props.get("ccc") or
                    props.get("Comp_Ex") or
                    HasMapping(int(nfd[1], 16))):
                return True
            c = int(nfd[0], 16)  # continue
        else:
            # c has a compatibility mapping.
            return True


_data_file_copyright = """# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (C) 1999-2016, International Business Machines
# Corporation and others.  All Rights Reserved.
#
"""

def WriteNorm2NFCTextFile(path):
    """Writes the gennorm2 input file nfc.txt with complete NFC data."""
    global _data_file_copyright
    with open(os.path.join(path, "nfc.txt"), "w") as out_file:
        out_file.write(
            _data_file_copyright + """# file name: nfc.txt
#
# machine-generated by ICU preparseucd.py
#
# Complete data for Unicode NFC normalization.

* Unicode """ + _ucd_version + """

""")
        WriteAllCC(out_file)
        out_file.write("\n# Canonical decomposition mappings\n")
        for i in range(len(_starts) - 1):
            start = _starts[i]
            end = _starts[i + 1] - 1
            props = _props[i]
            dm = props.get("dm")
            # dm starting with '<' marks a compatibility-tagged mapping; skip those.
            if dm and dm[0] != '<' and props["dt"] == "Can":
                assert start == end
                # The Comp_Ex=Full_Composition_Exclusion property tells us
                # whether the canonical decomposition round-trips.
                separator = '>' if props.get("Comp_Ex") else '='
                out_file.write("%04X%s%s\n" % (start, separator, dm))


def WriteNorm2NFKCTextFile(path):
    """Writes the gennorm2 input file nfkc.txt (delta on top of nfc.txt)."""
    global _data_file_copyright
    with open(os.path.join(path, "nfkc.txt"), "w") as out_file:
        out_file.write(
            _data_file_copyright + """# file name: nfkc.txt
#
# machine-generated by ICU preparseucd.py
#
# Data for Unicode NFKC normalization.
# This file contains only compatibility decomposition mappings,
# plus those canonical decompositions that change from NFC round-trip mappings
# to NFKC one-way mappings.
# Use this file as the second gennorm2 input file after nfc.txt.

* Unicode """ + _ucd_version + """

""")
        for i in range(len(_starts) - 1):
            start = _starts[i]
            end = _starts[i + 1] - 1
            props = _props[i]
            dm = props.get("dm")
            if dm and dm[0] != '<':
                assert start == end
                if props["dt"] != "Can":
                    # Compatibility decomposition.
                    out_file.write("%04X>%s\n" % (start, dm))
                elif not props.get("Comp_Ex") and HasOneWayMapping(start):
                    # NFC round-trip mapping turns into NFKC one-way mapping.
                    out_file.write("%04X>%s # NFC round-trip, NFKC one-way\n" %
                                   (start, dm))


def WriteNorm2NFKC_CFTextFile(path):
    """Writes the gennorm2 input file nfkc_cf.txt from the NFKC_CF property."""
    global _data_file_copyright
    with open(os.path.join(path, "nfkc_cf.txt"), "w") as out_file:
        out_file.write(
            _data_file_copyright + """# file name: nfkc_cf.txt
#
# machine-generated by ICU preparseucd.py
#
# This file contains the Unicode NFKC_CF mappings,
# extracted from the UCD file DerivedNormalizationProps.txt,
# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.

""")
        out_file.write("* Unicode " + _ucd_version + "\n\n")
        prev_start = 0
        prev_end = 0
        prev_nfkc_cf = None
        for i in range(len(_starts) - 1):
            start = _starts[i]
            end = _starts[i + 1] - 1
            props = _props[i]
            nfkc_cf = props.get("NFKC_CF")
            # Merge with the previous range if possible,
            # or remember this range for merging.
1423 if nfkc_cf == prev_nfkc_cf and (prev_end + 1) == start: 1424 prev_end = end 1425 else: 1426 if prev_nfkc_cf != None and (not prev_nfkc_cf or prev_nfkc_cf[0] != '<'): 1427 if prev_start == prev_end: 1428 out_file.write("%04X>%s\n" % (prev_start, prev_nfkc_cf)) 1429 else: 1430 out_file.write("%04X..%04X>%s\n" % 1431 (prev_start, prev_end, prev_nfkc_cf)) 1432 prev_start = start 1433 prev_end = end 1434 prev_nfkc_cf = nfkc_cf 1435 1436 1437def WriteNorm2(path): 1438 WriteNorm2NFCTextFile(path) 1439 WriteNorm2NFKCTextFile(path) 1440 WriteNorm2NFKC_CFTextFile(path) 1441 1442# UTS #46 Normalizer2 input file ------------------------------------------- *** 1443 1444_idna_replacements = [ 1445 # Several versions of avoiding circular FFFD>FFFD mappings, 1446 # depending on the version of the input file. 1447 (re.compile(r"FFFD ; disallowed"), "# FFFD (avoid circular mapping)"), 1448 (re.compile(r"\.\.FFFD"), "..FFFC"), 1449 (re.compile(r"(FFF[^E])\.\.FFFF"), "\1..FFFC"), 1450 # Since we switch between checking and not checking for STD3 character 1451 # restrictions at runtime, checking the non-LDH ASCII characters in code, 1452 # we treat these values here like their regular siblings. 1453 (re.compile(r"^([^;]+) ; disallowed_STD3_valid"), r"# \1disallowed_STD3_valid"), 1454 (re.compile(r"; disallowed_STD3_mapped +; "), ">"), 1455 # For UTS #46, we do not care about "not valid in IDNA2008". 1456 (re.compile(r"; *; NV8 +"), ""), 1457 # ICU 63+ normalization no longer allows mappings for surrogate code points, 1458 # and the UTS #46 code handles them instead. 1459 (re.compile(r"^D800..DFFF ; disallowed"), r"# D800..DFFF disallowed in code"), 1460 # Normal transformations. 
1461 (re.compile(r"; disallowed"), ">FFFD"), 1462 (re.compile(r"; ignored"), ">"), 1463 (re.compile(r"^([^;]+) ; valid"), r"# \1valid"), 1464 (re.compile(r"; mapped +; "), ">"), 1465 (re.compile(r"^([^;]+) ; deviation +; "), r"# \1deviation >") 1466] 1467 1468def IdnaToUTS46TextFile(s, t): 1469 """Turn Unicode IdnaMappingTable.txt into ICU gennorm2 source file format.""" 1470 # Different input/output file names. 1471 dest_path = os.path.dirname(t) 1472 t = os.path.join(dest_path, "uts46.txt") 1473 # TODO: With Python 2.7+, combine the two with statements into one. 1474 with open(s, "r") as in_file: 1475 with open(t, "w") as out_file: 1476 out_file.write("# Original file:\n") 1477 for line in in_file: 1478 orig_line = line 1479 if line.startswith("# For documentation"): 1480 out_file.write(line) 1481 out_file.write(r""" 1482# ================================================ 1483# This file has been reformatted into syntax for the 1484# gennorm2 Normalizer2 data generator tool. 1485# 1486# "valid", "disallowed_STD3_valid" and "deviation" lines are commented out. 1487# "mapped" and "disallowed_STD3_mapped" are changed to use the ">" mapping syntax. 1488# "disallowed" lines map to U+FFFD. 1489# "ignored" lines map to an empty string. 1490# 1491# Characters disallowed under STD3 rules are treated as valid or mapped; 1492# they are handled in code. 1493# Deviation characters are also handled in code. 1494# 1495# Use this file as the second gennorm2 input file after nfc.txt. 1496# ================================================ 1497""") 1498 continue 1499 if line[0] in "#\r\n": 1500 out_file.write(line) 1501 continue 1502 for rep in _idna_replacements: line = rep[0].sub(rep[1], line) 1503 # Align inline comments at column 40. 
1504 comment_pos = line.find("#", 1) 1505 if comment_pos < 40: 1506 line = (line[:comment_pos] + ((40 - comment_pos) * ' ') + 1507 line[comment_pos:]) 1508 elif comment_pos > 40: 1509 space_pos = comment_pos 1510 while space_pos > 0 and line[space_pos - 1] == ' ': 1511 space_pos = space_pos - 1 1512 if space_pos < 40: 1513 # Fewer than 40 characters before the comment: 1514 # Align comments at column 40. 1515 line = line[:40] + line[comment_pos:] 1516 else: 1517 # 40 or more characters before the comment: 1518 # Keep one space between contents and comment. 1519 line = line[:space_pos] + " " + line[comment_pos:] 1520 # Write the modified line. 1521 out_file.write(line) 1522 if "..FFFF" in orig_line and "..FFFC" in line: 1523 out_file.write("FFFE..FFFF >FFFD\n"); 1524 return t 1525 1526# Preprocessing ------------------------------------------------------------ *** 1527 1528_strip_re = re.compile("([0-9a-fA-F]+.+?) *#.*") 1529_code_point_re = re.compile("\s*([0-9a-fA-F]+)\s*;") 1530 1531def CopyAndStripWithOptionalMerge(s, t, do_merge): 1532 # TODO: We do not seem to need the do_merge argument and logic any more. 1533 with open(s, "r") as in_file, open(t, "w") as out_file: 1534 first = -1 # First code point with first_data. 1535 last = -1 # Last code point with first_data. 1536 first_data = "" # Common data for code points [first..last]. 
        for line in in_file:
            match = _strip_re.match(line)
            if match:
                line = match.group(1)  # data line: drop the trailing comment
            else:
                line = line.rstrip()
            if do_merge:
                match = _code_point_re.match(line)
                if match:
                    c = int(match.group(1), 16)
                    data = line[match.end() - 1:]
                else:
                    c = -1
                    data = ""
                if last >= 0 and (c != (last + 1) or data != first_data):
                    # output the current range
                    if first == last:
                        out_file.write("%04X%s\n" % (first, first_data))
                    else:
                        out_file.write("%04X..%04X%s\n" % (first, last, first_data))
                    first = -1
                    last = -1
                    first_data = ""
                if c < 0:
                    # no data on this line, output as is
                    out_file.write(line)
                    out_file.write("\n")
                else:
                    # data on this line, store for possible range compaction
                    if last < 0:
                        # set as the first line in a possible range
                        first = c
                        last = c
                        first_data = data
                    else:
                        # must be c == (last + 1) and data == first_data
                        # because of previous conditions
                        # continue with the current range
                        last = c
            else:
                # Only strip, don't merge: just output the stripped line.
                out_file.write(line)
                out_file.write("\n")
        if do_merge and last >= 0:
            # output the last range in the file
            if first == last:
                out_file.write("%04X%s\n" % (first, first_data))
            else:
                out_file.write("%04X..%04X%s\n" % (first, last, first_data))
            first = -1
            last = -1
            first_data = ""
        out_file.flush()
    return t


def CopyAndStrip(s, t):
    """Copies a file and removes comments behind data lines but not in others."""
    return CopyAndStripWithOptionalMerge(s, t, False)


def CopyAndStripAndMerge(s, t):
    """Copies and strips a file and merges lines.

    Copies a file, removes comments, and
    merges lines with adjacent code point ranges and identical per-code point
    data lines into one line with range syntax.
    """
    return CopyAndStripWithOptionalMerge(s, t, True)


def CopyOnly(s, t):
    """Preprocessor: copies s to t unchanged."""
    shutil.copy(s, t)
    return t


def DontCopy(s, t):
    """Preprocessor: does not copy; the file is parsed from its source location."""
    return s


# Each _files value is a
# (preprocessor, dest_folder, parser, order) tuple
# where all fields except the preprocessor are optional.
# After the initial preprocessing (copy/strip/merge),
# if a parser is specified, then a tuple is added to _files_to_parse
# at index "order" (default order 9).
# An explicit order number is set only for files that must be parsed
# before others.
_files = {
    "BidiBrackets.txt": (DontCopy, ParseBidiBrackets),
    "BidiMirroring.txt": (DontCopy, ParseBidiMirroring),
    "BidiTest.txt": (CopyOnly, "testdata"),
    "Blocks.txt": (DontCopy, ParseBlocks),
    "CaseFolding.txt": (CopyOnly, ParseCaseFolding),
    "DerivedAge.txt": (DontCopy, ParseDerivedAge),
    "DerivedBidiClass.txt": (DontCopy, ParseDerivedBidiClass),
    "DerivedCoreProperties.txt": (CopyAndStrip, ParseNamedProperties),
    "DerivedJoiningGroup.txt": (DontCopy, ParseDerivedJoiningGroup),
    "DerivedJoiningType.txt": (DontCopy, ParseDerivedJoiningType),
    "DerivedNormalizationProps.txt": (CopyAndStrip, ParseNamedProperties),
    "DerivedNumericValues.txt": (DontCopy, ParseDerivedNumericValues),
    "EastAsianWidth.txt": (DontCopy, ParseEastAsianWidth),
    "emoji-data.txt": (DontCopy, ParseNamedProperties),
    "emoji-sequences.txt": (CopyOnly,),
    "emoji-zwj-sequences.txt": (CopyOnly,),
    "GraphemeBreakProperty.txt": (DontCopy, ParseGraphemeBreakProperty),
    "GraphemeBreakTest-cldr.txt": (CopyOnly, "testdata"),
    "IdnaTestV2.txt": (CopyOnly, "testdata"),
    "IndicPositionalCategory.txt": (DontCopy, ParseIndicPositionalCategory),
    "IndicSyllabicCategory.txt": (DontCopy, ParseIndicSyllabicCategory),
    "LineBreak.txt": (DontCopy, ParseLineBreak),
    "LineBreakTest.txt": (CopyOnly, "testdata"),
    "NameAliases.txt": (DontCopy, ParseNameAliases),
    "NamesList.txt": (DontCopy, ParseNamesList),
    "NormalizationCorrections.txt": (CopyOnly,),  # Only used in gensprep.
    "NormalizationTest.txt": (CopyAndStrip,),
    "PropertyAliases.txt": (DontCopy, ParsePropertyAliases, 0),
    "PropertyValueAliases.txt": (DontCopy, ParsePropertyValueAliases, 1),
    "PropList.txt": (DontCopy, ParseNamedProperties),
    "SentenceBreakProperty.txt": (DontCopy, ParseSentenceBreak),
    "SentenceBreakTest.txt": (CopyOnly, "testdata"),
    "Scripts.txt": (DontCopy, ParseScripts),
    "ScriptExtensions.txt": (DontCopy, ParseScriptExtensions),
    "SpecialCasing.txt": (CopyOnly, ParseSpecialCasing),
    "UnicodeData.txt": (CopyOnly, ParseUnicodeData, 2),
    "VerticalOrientation.txt": (DontCopy, ParseVerticalOrientation),
    "WordBreakProperty.txt": (DontCopy, ParseWordBreak),
    "WordBreakTest.txt": (CopyOnly, "testdata"),
    # From www.unicode.org/Public/idna/<version>/
    "IdnaMappingTable.txt": (IdnaToUTS46TextFile, "norm2")
}

# List of lists of files to be parsed in order.
# Inner lists contain (basename, path, parser) tuples.
_files_to_parse = [[], [], [], [], [], [], [], [], [], []]

# Get the standard basename from a versioned filename.
# For example, match "UnicodeData-6.1.0d8.txt"
# so we can turn it into "UnicodeData.txt".
_file_version_re = re.compile("([a-zA-Z0-9_-]+)" +
                              "-[0-9]+(?:\\.[0-9]+)*(?:d[0-9]+)?" +
                              "(\\.[a-z]+)$")

def PreprocessFiles(source_files, icu4c_src_root):
    """Preprocesses the downloaded UCD files:
    strips version suffixes from their names, runs each known file's
    preprocessor (copy/strip/merge), and registers parsers in
    _files_to_parse in parse order."""
    unidata_path = os.path.join(icu4c_src_root, "source", "data", "unidata")
    norm2_path = os.path.join(unidata_path, "norm2")
    testdata_path = os.path.join(icu4c_src_root, "source", "test", "testdata")
    folder_to_path = {
        "unidata": unidata_path,
        "norm2": norm2_path,
        "testdata": testdata_path
    }
    files_processed = set()
    for source_file in source_files:
        (folder, basename) = os.path.split(source_file)
        match = _file_version_re.match(basename)
        if match:
            new_basename = match.group(1) + match.group(2)
            if new_basename != basename:
                print("Removing version suffix from " + source_file)
                # ... so that we can easily compare UCD files.
                new_source_file = os.path.join(folder, new_basename)
                shutil.move(source_file, new_source_file)
                basename = new_basename
                source_file = new_source_file
        if basename in _files:
            print("Preprocessing %s" % basename)
            if basename in files_processed:
                raise Exception("duplicate file basename %s!" % basename)
            files_processed.add(basename)
            value = _files[basename]
            preprocessor = value[0]
            if len(value) >= 2 and isinstance(value[1], (str)):
                # The value was [preprocessor, dest_folder, ...], leave [...].
                dest_folder = value[1]
                value = value[2:]
            else:
                # The value was [preprocessor, ...], leave [...].
                dest_folder = "unidata"
                value = value[1:]
            dest_path = folder_to_path[dest_folder]
            if not os.path.exists(dest_path): os.makedirs(dest_path)
            dest_basename = basename
            # Source GraphemeBreakTest-cldr.txt --> destination GraphemeBreakTest.txt.
            if basename.endswith("-cldr.txt"):
                dest_basename = basename[:-9] + basename[-4:]
            dest_file = os.path.join(dest_path, dest_basename)
            parse_file = preprocessor(source_file, dest_file)
            if value:
                order = 9 if len(value) < 2 else value[1]
                _files_to_parse[order].append((basename, parse_file, value[0]))

# Character names ---------------------------------------------------------- ***

# TODO: Turn this script into a module that
# a) gives access to the parsed data
# b) has a PreparseUCD(ucd_root, icu4c_src_root) function
# c) has a ParsePreparsedUCD(filename) function
# d) has a WritePreparsedUCD(filename) function
# and then use it from a new script for names.
# Some more API:
# - generator GetRangesAndProps() -> (start, end, props)*

def IncCounter(counters, key, inc=1):
    """Adds inc to counters[key], creating the entry if necessary."""
    if key in counters:
        counters[key] += inc
    else:
        counters[key] = inc


endings = (
    # List PHASE- before LETTER for BAMUM LETTER PHASE-xyz.
    "PHASE-",
    "LETTER ", "LIGATURE ", "CHARACTER ", "SYLLABLE ",
    "CHOSEONG ", "JUNGSEONG ", "JONGSEONG ",
    "SYLLABICS ", "IDEOGRAPH ", "IDEOGRAPH-", "IDEOGRAM ", "MONOGRAM ",
    "ACROPHONIC ", "HIEROGLYPH ",
    "DIGIT ", "NUMBER ", "NUMERAL ", "FRACTION ",
    "PUNCTUATION ", "SIGN ", "SYMBOL ",
    "TILE ", "CARD ", "FACE ",
    "ACCENT ", "POINT ",
    # List SIGN before VOWEL to catch "vowel sign".
1759 "VOWEL ", "TONE ", "RADICAL ", 1760 # For names of math symbols, 1761 # e.g., MATHEMATICAL BOLD ITALIC CAPITAL A 1762 "SCRIPT ", "FRAKTUR ", "MONOSPACE ", 1763 "ITALIC ", "BOLD ", "DOUBLE-STRUCK ", "SANS-SERIF ", 1764 "INITIAL ", "TAILED ", "STRETCHED ", "LOOPED ", 1765 # BRAILLE PATTERN DOTS-xyz 1766 "DOTS-", 1767 "SELECTOR ", "SELECTOR-" 1768) 1769 1770def SplitName(name, tokens): 1771 start = 0 1772 for e in endings: 1773 i = name.find(e) 1774 if i >= 0: 1775 start = i + len(e) 1776 token = name[:start] 1777 IncCounter(tokens, token) 1778 break 1779 for i in range(start, len(name)): 1780 c = name[i] 1781 if c == ' ' or c == '-': 1782 token = name[start:i + 1] 1783 IncCounter(tokens, token) 1784 start = i + 1 1785 IncCounter(tokens, name[start:]) 1786 1787 1788def PrintNameStats(): 1789 # TODO: This name analysis code is out of date. 1790 # It needs to consider the multi-type Name_Alias values. 1791 name_pnames = ("na", "na1", "Name_Alias") 1792 counts = {} 1793 for pname in name_pnames: 1794 counts[pname] = 0 1795 total_lengths = counts.copy() 1796 max_length = 0 1797 max_per_cp = 0 1798 name_chars = set() 1799 num_digits = 0 1800 token_counters = {} 1801 char_counters = {} 1802 for i in range(len(_starts) - 1): 1803 start = _starts[i] 1804 # end = _starts[i + 1] - 1 1805 props = _props[i] 1806 per_cp = 0 1807 for pname in name_pnames: 1808 if pname in props: 1809 counts[pname] += 1 1810 name = props[pname] 1811 total_lengths[pname] += len(name) 1812 name_chars |= set(name) 1813 if len(name) > max_length: max_length = len(name) 1814 per_cp += len(name) + 1 1815 if per_cp > max_per_cp: max_per_cp = per_cp 1816 tokens = SplitName(name, token_counters) 1817 for c in name: 1818 if c in "0123456789": num_digits += 1 1819 IncCounter(char_counters, c) 1820 print 1821 for pname in name_pnames: 1822 print("'%s' character names: %d / %d bytes" % 1823 (pname, counts[pname], total_lengths[pname])) 1824 print("%d total bytes in character names" % 
sum(total_lengths.itervalues())) 1825 print("%d name-characters: %s" % 1826 (len(name_chars), "".join(sorted(name_chars)))) 1827 print("%d digits 0-9" % num_digits) 1828 count_chars = [(count, c) for (c, count) in char_counters.items()] 1829 count_chars.sort(reverse=True) 1830 for cc in count_chars: 1831 print("name-chars: %6d * '%s'" % cc) 1832 print("max. name length: %d" % max_length) 1833 print("max. length of all (names+NUL) per cp: %d" % max_per_cp) 1834 1835 token_lengths = sum([len(t) + 1 for t in token_counters]) 1836 print("%d total tokens, %d bytes with NUL" % 1837 (len(token_counters), token_lengths)) 1838 1839 counts_tokens = [] 1840 for (token, count) in token_counters.items(): 1841 # If we encode a token with a 1-byte code, then we save len(t)-1 bytes each time 1842 # but have to store the token string itself with a length or terminator byte, 1843 # plus a 2-byte entry in an token index table. 1844 savings = count * (len(token) - 1) - (len(token) + 1 + 2) 1845 if savings > 0: 1846 counts_tokens.append((savings, count, token)) 1847 counts_tokens.sort(reverse=True) 1848 print("%d tokens might save space with 1-byte codes" % len(counts_tokens)) 1849 1850 # Codes=bytes, 40 byte values for name_chars. 1851 # That leaves 216 units for 1-byte tokens or lead bytes of 2-byte tokens. 1852 # Make each 2-byte token the token string index itself, rather than 1853 # and index into a string index table. 1854 # More lead bytes but also more savings. 
1855 num_units = 256 1856 max_lead = (token_lengths + 255) / 256 1857 max_token_units = num_units - len(name_chars) 1858 results = [] 1859 for num_lead in range(min(max_lead, max_token_units) + 1): 1860 max1 = max_token_units - num_lead 1861 ct = counts_tokens[:max1] 1862 tokens1 = set([t for (s, c, t) in ct]) 1863 for (token, count) in token_counters.items(): 1864 if token in tokens1: continue 1865 # If we encode a token with a 2-byte code, then we save len(t)-2 bytes each time 1866 # but have to store the token string itself with a length or terminator byte. 1867 savings = count * (len(token) - 2) - (len(token) + 1) 1868 if savings > 0: 1869 ct.append((savings, count, token)) 1870 ct.sort(reverse=True) 1871 # A 2-byte-code-token index cannot be limit_t_lengths or higher. 1872 limit_t_lengths = num_lead * 256 1873 token2_index = 0 1874 for i in range(max1, len(ct)): 1875 if token2_index >= limit_t_lengths: 1876 del ct[i:] 1877 break 1878 token2_index += len(ct[i][2]) + 1 1879 cumul_savings = sum([s for (s, c, t) in ct]) 1880 # print ("%2d 1-byte codes: %4d tokens might save %6d bytes" % 1881 # (max1, len(ct), cumul_savings)) 1882 results.append((cumul_savings, max1, ct)) 1883 best = max(results) # (cumul_savings, max1, ct) 1884 1885 max1 = best[1] 1886 print("maximum savings: %d bytes with %d 1-byte codes & %d lead bytes" % 1887 (best[0], max1, max_token_units - max1)) 1888 counts_tokens = best[2] 1889 cumul_savings = 0 1890 for i in range(len(counts_tokens)): 1891 n = 1 if i < max1 else 2 1892 i1 = i + 1 1893 t = counts_tokens[i] 1894 cumul_savings += t[0] 1895 if i1 <= 250 or (i1 % 100) == 0 or i1 == len(counts_tokens): 1896 print(("%04d. cumul. 
%6d bytes save %6d bytes from " + 1897 "%5d * %d-byte token for %2d='%s'") % 1898 (i1, cumul_savings, t[0], t[1], n, len(t[2]), t[2])) 1899 1900# ICU API ------------------------------------------------------------------ *** 1901 1902# Sample line to match: 1903# UCHAR_UNIFIED_IDEOGRAPH=29, 1904_uchar_re = re.compile( 1905 " *(UCHAR_[0-9A-Z_]+) *= *(?:[0-9]+|0x[0-9a-fA-F]+),") 1906 1907# Sample line to match: 1908# /** Zs @stable ICU 2.0 */ 1909_gc_comment_re = re.compile(" */\*\* *([A-Z][a-z]) ") 1910 1911# Sample line to match: 1912# U_SPACE_SEPARATOR = 12, 1913_gc_re = re.compile(" *(U_[A-Z_]+) *= *[0-9]+,") 1914 1915# Sample line to match: 1916# /** L @stable ICU 2.0 */ 1917_bc_comment_re = re.compile(" */\*\* *([A-Z]{1,3}) ") 1918 1919# Sample line to match: 1920# U_LEFT_TO_RIGHT = 0, 1921_bc_re = re.compile(" *(U_[A-Z_]+) *= *[0-9]+,") 1922 1923# Sample line to match: 1924# UBLOCK_CYRILLIC =9, 1925_ublock_re = re.compile(" *(UBLOCK_[0-9A-Z_]+) *= *[0-9]+,") 1926 1927# Sample line to match: 1928# U_EA_AMBIGUOUS, 1929_prop_and_value_re = re.compile( 1930 " *(U_(BPT|DT|EA|GCB|HST|INPC|INSC|LB|JG|JT|NT|SB|VO|WB)_([0-9A-Z_]+))") 1931 1932# Sample line to match if it has matched _prop_and_value_re 1933# (we want to exclude aliases): 1934# U_JG_HAMZA_ON_HEH_GOAL=U_JG_TEH_MARBUTA_GOAL, 1935_prop_and_alias_re = re.compile(" *U_[0-9A-Z_]+ *= *U") 1936 1937def ParseUCharHeader(icu4c_src_root): 1938 uchar_path = os.path.join(icu4c_src_root, "source", 1939 "common", "unicode", "uchar.h") 1940 with open(uchar_path, "r") as uchar_file: 1941 mode = "" # Mode string (=pname) during context-sensitive parsing. 1942 comment_value = "" # Property value from a comment preceding an enum. 1943 # Note: The enum UProperty is first in uchar.h, before the enums for values. 1944 for line in uchar_file: 1945 # Parse some enums via context-sensitive "modes". 1946 # Necessary because the enum constant names do not contain 1947 # enough information. 
1948 if "enum UCharCategory" in line: 1949 mode = "gc" 1950 comment_value = "" 1951 continue 1952 if mode == "gc": 1953 if line.startswith("}"): 1954 mode = "" 1955 continue 1956 match = _gc_comment_re.match(line) 1957 if match: 1958 comment_value = match.group(1) 1959 continue 1960 match = _gc_re.match(line) 1961 if match and comment_value: 1962 gc_enum = match.group(1) 1963 prop = _properties["gc"] 1964 vname = GetShortPropertyValueName(prop, comment_value) 1965 icu_values = _pname_to_icu_prop["gc"][2] 1966 icu_values.append((gc_enum, vname)) 1967 comment_value = "" 1968 continue 1969 if "enum UCharDirection {" in line: 1970 mode = "bc" 1971 comment_value = "" 1972 continue 1973 if mode == "bc": 1974 if line.startswith("}"): 1975 mode = "" 1976 continue 1977 match = _bc_comment_re.match(line) 1978 if match: 1979 comment_value = match.group(1) 1980 continue 1981 match = _bc_re.match(line) 1982 if match and comment_value: 1983 bc_enum = match.group(1) 1984 prop = _properties["bc"] 1985 vname = GetShortPropertyValueName(prop, comment_value) 1986 icu_values = _pname_to_icu_prop["bc"][2] 1987 icu_values.append((bc_enum, vname)) 1988 comment_value = "" 1989 continue 1990 # No mode, parse enum constants whose names contain 1991 # enough information to parse without requiring context. 1992 match = _uchar_re.match(line) 1993 if match: 1994 prop_enum = match.group(1) 1995 if prop_enum.endswith("_LIMIT"): 1996 # Ignore "UCHAR_BINARY_LIMIT=57," etc. 
1997 continue 1998 pname = GetShortPropertyName(prop_enum[6:]) 1999 icu_prop = (prop_enum, pname, []) 2000 _icu_properties.append(icu_prop) 2001 _pname_to_icu_prop[pname] = icu_prop 2002 continue 2003 match = _ublock_re.match(line) 2004 if match: 2005 prop_enum = match.group(1) 2006 if prop_enum == "UBLOCK_COUNT": 2007 continue 2008 prop = _properties["blk"] 2009 vname = GetShortPropertyValueName(prop, prop_enum[7:]) 2010 icu_values = _pname_to_icu_prop["blk"][2] 2011 icu_values.append((prop_enum, vname)) 2012 continue 2013 match = _prop_and_value_re.match(line) 2014 if match: 2015 (prop_enum, vname) = match.group(1, 3) 2016 if vname == "COUNT" or _prop_and_alias_re.match(line): 2017 continue 2018 pname = GetShortPropertyName(match.group(2)) 2019 prop = _properties[pname] 2020 vname = GetShortPropertyValueName(prop, vname) 2021 icu_values = _pname_to_icu_prop[pname][2] 2022 icu_values.append((prop_enum, vname)) 2023 # ccc, lccc, tccc use their numeric values as "enum" values. 2024 # In the UCD data, these numeric values are the first value names, 2025 # followed by the short & long value names. 2026 # List the ccc values in numeric order. 2027 prop = _properties["ccc"] 2028 icu_values = _pname_to_icu_prop["ccc"][2] 2029 for ccc in sorted([int(name) for name in prop[2]]): 2030 icu_values.append((ccc, str(ccc))) 2031 _pname_to_icu_prop["lccc"][2].extend(icu_values) # Copy ccc -> lccc. 2032 _pname_to_icu_prop["tccc"][2].extend(icu_values) # Copy ccc -> tccc. 2033 2034 # No need to parse predictable General_Category_Mask enum constants. 2035 # Just define them in ASCII order. 2036 prop = _properties["gcm"] 2037 icu_values = _pname_to_icu_prop["gcm"][2] 2038 for vname in sorted(prop[2]): 2039 icu_values.append(("U_GC_" + vname.upper() + "_MASK", vname)) 2040 # Hardcode known values for the normalization quick check properties, 2041 # see unorm2.h for the UNormalizationCheckResult enum. 
2042 icu_values = _pname_to_icu_prop["NFC_QC"][2] 2043 icu_values.append(("UNORM_NO", "N")) 2044 icu_values.append(("UNORM_YES", "Y")) 2045 icu_values.append(("UNORM_MAYBE", "M")) 2046 _pname_to_icu_prop["NFKC_QC"][2].extend(icu_values) # Copy NFC -> NFKC. 2047 # No "maybe" values for NF[K]D. 2048 icu_values = _pname_to_icu_prop["NFD_QC"][2] 2049 icu_values.append(("UNORM_NO", "N")) 2050 icu_values.append(("UNORM_YES", "Y")) 2051 _pname_to_icu_prop["NFKD_QC"][2].extend(icu_values) # Copy NFD -> NFKD. 2052 2053 2054# Sample line to match: 2055# USCRIPT_LOMA = 139,/* Loma */ 2056_uscript_re = re.compile( 2057 " *(USCRIPT_[A-Z_]+) *= *[0-9]+ *, */\* *([A-Z][a-z]{3}) *\*/") 2058 2059def ParseUScriptHeader(icu4c_src_root): 2060 uscript_path = os.path.join(icu4c_src_root, "source", 2061 "common", "unicode", "uscript.h") 2062 icu_values = _pname_to_icu_prop["sc"][2] 2063 with open(uscript_path, "r") as uscript_file: 2064 for line in uscript_file: 2065 match = _uscript_re.match(line) 2066 if match: 2067 (script_enum, script_code) = match.group(1, 2) 2068 icu_values.append((script_enum, script_code)) 2069 2070 2071def CheckPNamesData(): 2072 """Checks that every ICU property has a full set of value enum constants, 2073 and that the _icu_properties value names map back to the UCD.""" 2074 missing_enums = [] 2075 for (p_enum, pname, values) in _icu_properties: 2076 prop = _properties[pname] 2077 vnames = set(prop[2]) # Modifiable copy of the set of short value names. 2078 for (v_enum, vname) in values: 2079 if vname not in vnames: 2080 raise ValueError("%s = %s (uchar.h %s) not in the UCD\n" % 2081 (pname, vname, v_enum)) 2082 vnames.remove(vname) 2083 # Exceptions to the all-values check: 2084 # - ICU does not have specific enum values for binary No/Yes. 2085 # - ICU represents Age values via UVersionInfo rather than enum constants. 2086 # - gc: ICU enum UCharCategory only has the single-category values. 2087 # (ICU's gcm property has all of the UCD gc property values.) 
2088 if vnames and not (prop[0] == "Binary" or pname in ("age", "gc")): 2089 missing_enums.append((pname, vnames)) 2090 if missing_enums: 2091 raise ValueError( 2092 "missing uchar.h enum constants for some property values: %s" % 2093 missing_enums) 2094 2095 2096def WritePNamesDataHeader(out_path): 2097 with open(out_path, "w") as out_file: 2098 out_file.write("""// © 2016 and later: Unicode, Inc. and others. 2099// License & terms of use: http://www.unicode.org/copyright.html 2100/** 2101 * Copyright (C) 2002-2016, International Business Machines Corporation and 2102 * others. All Rights Reserved. 2103 * 2104 * machine-generated by: icu/tools/unicode/py/preparseucd.py 2105 */ 2106 2107""") 2108 2109 # Note: The uchar.h & uscript.h parsers store the ICU Unicode properties 2110 # and values in the order of their definition, 2111 # and this function writes them in that order. 2112 # Since the ICU API constants are stable and new values are only 2113 # appended at the end 2114 # (new properties are added at the end of each binary/enum/... range), 2115 # the output is stable as well. 2116 # When a property or value constant is renamed, 2117 # it only changes the name itself in the output; 2118 # it does not move in the output since there is no sorting. 2119 # This minimizes diffs and assists with reviewing and evaluating updates. 2120 2121 version = _ucd_version.split('.') 2122 while len(version) < 4: version.append("0") 2123 out_file.write("#define UNICODE_VERSION { %s }\n\n" % ", ".join(version)) 2124 2125 # Count the maximum number of aliases for any property or value. 2126 # We write the final value at the end. 2127 max_aliases = max(len(_binary_values["N"]), len(_binary_values["Y"])) 2128 2129 # Write an array of "binprop" Value object initializers 2130 # with the value aliases shared among all binary properties. 
2131 out_file.write("static const Value VALUES_binprop[2] = {\n") 2132 out_file.write(' Value(0, "%s"),\n' % " ".join(_binary_values["N"])) 2133 out_file.write(' Value(1, "%s"),\n' % " ".join(_binary_values["Y"])) 2134 out_file.write("};\n\n") 2135 2136 # For each property with named values, write an array of 2137 # Value object initializers with the value enum and the aliases. 2138 for (p_enum, pname, values) in _icu_properties: 2139 prop = _properties[pname] 2140 aliases = prop[1] 2141 if len(aliases) > max_aliases: max_aliases = len(aliases) 2142 if not values: continue 2143 out_file.write("static const Value VALUES_%s[%d] = {\n" % 2144 (pname, len(values))) 2145 for (v_enum, vname) in values: 2146 aliases = _properties[pname][3][vname] 2147 # ccc, lccc, tccc: Omit the numeric strings from the aliases. 2148 # (See the comment about ccc in the PropertyValueAliases.txt header.) 2149 if pname.endswith("ccc"): aliases = aliases[1:] 2150 if len(aliases) > max_aliases: max_aliases = len(aliases) 2151 cast = "(int32_t)" if pname == "gcm" else "" 2152 out_file.write(' Value(%s%s, "%s"),\n' % 2153 (cast, v_enum, " ".join(aliases))) 2154 out_file.write("};\n\n") 2155 2156 # For each property, write a Property object initializer 2157 # with the property enum, its aliases, and a reference to its values. 2158 out_file.write("static const Property PROPERTIES[%d] = {\n" % 2159 len(_icu_properties)) 2160 for (enum, pname, values) in _icu_properties: 2161 prop = _properties[pname] 2162 aliases = " ".join(prop[1]) 2163 if prop[0] == "Binary": 2164 out_file.write(' Property(%s, "%s"),\n' % (enum, aliases)) 2165 elif values: # Property with named values. 
2166 out_file.write(' Property(%s, "%s", VALUES_%s, %d),\n' % 2167 (enum, aliases, pname, len(values))) 2168 else: 2169 out_file.write(' Property(%s, "%s"),\n' % (enum, aliases)) 2170 out_file.write("};\n\n") 2171 2172 out_file.write("const int32_t MAX_ALIASES = %d;\n" % max_aliases) 2173 2174# main() ------------------------------------------------------------------- *** 2175 2176def main(): 2177 global _null_or_defaults 2178 only_ppucd = False 2179 if len(sys.argv) == 3: 2180 (ucd_root, icu_src_root) = sys.argv[1:3] 2181 ppucd_path = None 2182 elif len(sys.argv) == 4 and sys.argv[2] == "--only_ppucd": 2183 # For debugging: 2184 # preparseucd.py path/to/UCD/root --only_ppucd path/to/ppucd/outputfile 2185 ucd_root = sys.argv[1] 2186 ppucd_path = sys.argv[3] 2187 only_ppucd = True 2188 icu_src_root = "/tmp/ppucd" 2189 else: 2190 print("Usage: %s path/to/UCD/root path/to/ICU/src/root" % sys.argv[0]) 2191 return 2192 icu4c_src_root = os.path.join(icu_src_root, "icu4c") 2193 icu_tools_root = os.path.join(icu_src_root, "tools") 2194 source_files = [] 2195 for root, dirs, files in os.walk(ucd_root): 2196 for file in files: 2197 source_files.append(os.path.join(root, file)) 2198 PreprocessFiles(source_files, icu4c_src_root) 2199 # Parse the processed files in a particular order. 2200 for files in _files_to_parse: 2201 for (basename, path, parser) in files: 2202 print("Parsing %s" % basename) 2203 value = _files[basename] 2204 # Unicode data files are in UTF-8. 2205 charset = "UTF-8" 2206 if basename == "NamesList.txt": 2207 # The NamesList used to be in Latin-1 before Unicode 6.2. 
2208 numeric_ucd_version = [int(field) for field in _ucd_version.split('.')] 2209 if numeric_ucd_version < [6, 2]: charset = "ISO-8859-1" 2210 in_file = codecs.open(path, "r", charset) 2211 with in_file: 2212 parser(in_file) 2213 _null_or_defaults = _null_values.copy() 2214 _null_or_defaults.update(_defaults) 2215 # Every Catalog and Enumerated property must have a default value, 2216 # from a @missing line. "nv" = "null value". 2217 pnv = [pname for (pname, nv) in _null_or_defaults.items() if nv == "??"] 2218 if pnv: 2219 raise Exception("no default values (@missing lines) for " + 2220 "some Catalog or Enumerated properties: %s " % pnv) 2221 unidata_path = os.path.join(icu4c_src_root, "source", "data", "unidata") 2222 if not only_ppucd: 2223 # Write Normalizer2 input text files. 2224 # Do this before compacting the data so that we need not handle fallbacks. 2225 norm2_path = os.path.join(unidata_path, "norm2") 2226 if not os.path.exists(norm2_path): os.makedirs(norm2_path) 2227 WriteNorm2(norm2_path) 2228 # Optimize block vs. cp properties. 2229 CompactBlocks() 2230 # Write the ppucd.txt output file. 2231 # Use US-ASCII so that ICU tests can parse it in the platform charset, 2232 # which may be EBCDIC. 2233 # Fix up non-ASCII data (NamesList.txt headings) to fit. 
2234 if not ppucd_path: 2235 ppucd_path = os.path.join(unidata_path, "ppucd.txt") 2236 with codecs.open(ppucd_path, "w", "US-ASCII") as out_file: 2237 WritePreparsedUCD(out_file) 2238 out_file.flush() 2239 2240 # TODO: PrintNameStats() 2241 2242 if only_ppucd: return 2243 2244 # ICU data for property & value names API 2245 ParseUCharHeader(icu4c_src_root) 2246 ParseUScriptHeader(icu4c_src_root) 2247 CheckPNamesData() 2248 genprops_path = os.path.join(icu_tools_root, "unicode", "c", "genprops") 2249 if not os.path.exists(genprops_path): os.makedirs(genprops_path) 2250 out_path = os.path.join(genprops_path, "pnames_data.h") 2251 WritePNamesDataHeader(out_path) 2252 2253 2254if __name__ == "__main__": 2255 main() 2256