#!/usr/bin/python -B
# -*- coding: utf-8 -*-
# © 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2009-2016 International Business Machines
# Corporation and others. All Rights Reserved.
#
# file name: preparseucd.py
# encoding: US-ASCII
# tab size: 8 (not used)
# indentation:4
#
# created on: 2011nov03 (forked from ucdcopy.py)
# created by: Markus W. Scherer
#
# Copies Unicode Character Database (UCD) files from a tree
# of files downloaded from (for example) ftp://www.unicode.org/Public/6.1.0/
# to ICU's source/data/unidata/ and source/test/testdata/
# and modifies some of the files to make them more compact.
# Parses them and writes unidata/ppucd.txt (PreParsed UCD) with simple syntax.
#
# Invoke with two command-line parameters:
# 1. source folder with UCD & idna files
# 2. ICU source root folder (ICU 59+ combined trunk with icu4c, icu4j, tools)
#
# Sample invocation:
#   ~/svn.icu/tools/trunk/src/unicode$ py/preparseucd.py ~/uni61/20120118 ~/svn.icu/trunk/src

import array
import bisect
import codecs
import os
import os.path
import re
import shutil
import sys

# Unicode version ---------------------------------------------------------- ***

_ucd_version = "?"

# ISO 15924 script codes --------------------------------------------------- ***

# Script codes from ISO 15924 http://www.unicode.org/iso15924/codechanges.html
# that are not yet in the UCD.
_scripts_only_in_iso15924 = (
    "Afak", "Blis", "Cirt", "Cyrs",
    "Egyd", "Egyh", "Geok",
    "Hanb", "Hans", "Hant",
    "Inds", "Jamo", "Jpan", "Jurc", "Kore", "Kpel", "Latf", "Latg", "Loma",
    "Maya", "Moon", "Nkgb", "Phlv", "Roro",
    "Sara", "Syre", "Syrj", "Syrn",
    "Teng", "Visp", "Wole", "Zmth", "Zsye", "Zsym", "Zxxx"
)

# Properties --------------------------------------------------------------- ***

# Properties that we do not want to store in ppucd.txt.
# Not a frozenset so that we can add aliases for simpler subsequent testing.
_ignored_properties = set((
    # Other_Xyz only contribute to Xyz, store only the latter.
    "OAlpha",
    "ODI",
    "OGr_Ext",
    "OIDC",
    "OIDS",
    "OLower",
    "OMath",
    "OUpper",
    # Further properties that just contribute to others.
    "CE",  # Composition_Exclusion just contributes to Full_Composition_Exclusion.
    "JSN",
    # These properties just don't seem useful.
    # They are deprecated since Unicode 6.0.
    "XO_NFC",
    "XO_NFD",
    "XO_NFKC",
    "XO_NFKD",
    # ICU does not use Unihan properties.
    "cjkAccountingNumeric",
    "cjkOtherNumeric",
    "cjkPrimaryNumeric",
    "cjkCompatibilityVariant",
    "cjkIICore",
    "cjkIRG_GSource",
    "cjkIRG_HSource",
    "cjkIRG_JSource",
    "cjkIRG_KPSource",
    "cjkIRG_KSource",
    "cjkIRG_MSource",
    "cjkIRG_SSource",
    "cjkIRG_TSource",
    "cjkIRG_UKSource",
    "cjkIRG_USource",
    "cjkIRG_VSource",
    "cjkRSUnicode"
))

# These properties (short names) map code points to
# strings or other unusual values (property types String or Miscellaneous)
# that cannot be block-compressed (or would be confusing).
_uncompressible_props = frozenset((
    "bmg", "bpb", "cf", "Conditional_Case_Mappings", "dm", "FC_NFKC",
    "isc", "lc", "na", "na1", "Name_Alias", "NFKC_CF",
    # scx is block-compressible.
    "scf", "slc", "stc", "suc", "tc", "Turkic_Case_Folding", "uc"
))

# Dictionary of properties.
# Keyed by normalized property names and aliases.
# Each value is a tuple with
# 0: Type of property (binary, enum, ...)
# 1: List of aliases; short & long name followed by other aliases.
#    The short name is "" if it is listed as "n/a" in PropertyValueAliases.txt.
# 2: Set of short property value names.
# 3: Dictionary of property values.
#    For Catalog & Enumerated properties,
#    maps each value name to a list of aliases.
#    Empty for other types of properties.
_properties = {}

# Dictionary of binary-property values which we store as False/True.
# Same as the values dictionary of one of the binary properties.
_binary_values = {}

# Dictionary of null values.
# Keyed by short property names.
# These are type-specific values for properties that occur in the data.
# They are overridden by _defaults, block and code point properties.
_null_values = {}

# Property value names for null values.
# We do not store these in _defaults.
_null_names = frozenset(("<none>", "NaN"))

# Dictionary of explicit default property values.
# Keyed by short property names.
_defaults = {"gc": "Cn"}

# _null_values overridden by explicit _defaults.
# Initialized after parsing is done.
_null_or_defaults = {}

# List of properties with an ICU UProperty enum.
# Each item is an (enum, pname, values) tuple.
# - enum: the ICU enum UProperty constant string
# - pname: the UCD short property name
# - values: list of (enum, vname) pairs per property value
#   - enum: the ICU property value's enum constant string
#   - vname: the UCD short property value name
_icu_properties = []

# Dictionary of short property names mapped to _icu_properties items.
_pname_to_icu_prop = {}

_non_alnum_re = re.compile("[^a-zA-Z0-9]")

def NormPropName(pname):
    """Returns a normalized form of pname.
    Removes non-ASCII-alphanumeric characters and lowercases letters."""
    return _non_alnum_re.sub("", pname).lower()


def GetProperty(pname):
    """Returns the _properties value for the pname.
    Returns None if the property is ignored.
    Caches alternate spellings of the property name."""
    # Try the input name.
    prop = _properties.get(pname)
    if prop != None: return prop
    if pname in _ignored_properties: return None
    # Try the normalized input name.
    norm_name = NormPropName(pname)
    prop = _properties.get(norm_name)
    if prop != None:
        _properties[pname] = prop  # Cache prop under this new name spelling.
        return prop
    elif norm_name in _ignored_properties:
        _ignored_properties.add(pname)  # Remember to ignore this new name spelling.
        return None
    else:
        raise NameError("unknown property %s\n" % pname)


def GetShortPropertyName(pname):
    if pname in _null_values: return pname  # pname is already the short name.
    prop = GetProperty(pname)
    if not prop: return ""  # For ignored properties.
    return prop[1][0] or prop[1][1]  # Long name if no short name.


def GetShortPropertyValueName(prop, vname):
    if vname in prop[2]: return vname
    values = prop[3]
    aliases = values.get(vname)
    if aliases == None:
        norm_name = NormPropName(vname)
        aliases = values.get(norm_name)
        if aliases == None:
            raise NameError("unknown value name %s for property %s\n" %
                            (vname, prop[1][0]))
        values[vname] = aliases
    return aliases[0] or aliases[1]  # Long name if no short name.
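
# Example (illustrative, not part of the parsing logic): NormPropName maps
# "Line_Break" to "linebreak", so once ParsePropertyAliases has registered
# the lb property, GetProperty("lb"), GetProperty("Line_Break") and
# GetProperty("line break") all return the same tuple, with the last
# spelling cached on first use.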

def NormalizePropertyValue(prop, vname):
    if prop[2]:  # Binary/Catalog/Enumerated property.
        value = GetShortPropertyValueName(prop, vname)
        if prop[0] == "Binary":
            value = value == "Y"
        if prop[1][0].endswith("ccc"):
            value = int(value)
    else:
        value = vname
    return value

# Character data ----------------------------------------------------------- ***

# Lists of NamesList h1 and h2 headings.
# Each h1 value is a (start, end, comment) tuple.
# Each h2 value is a (cp, comment) tuple.
_h1 = []
_h2 = []

# List of Unicode blocks.
# Each item is a tuple of start & end code point integers
# and a dictionary of default property values.
_blocks = []

# List of ranges with algorithmic names.
# Each value is a list of [start, end, type, prefix]
# where prefix is optional.
_alg_names_ranges = []

# List of Unicode character ranges and their properties,
# stored as an inversion map with range_start & props dictionary.
# Starts with one range for all of Unicode without any properties.
# Setting values subdivides ranges.
_starts = array.array('l', [0, 0x110000])  # array of int32_t
_props = [{}, {}]  # props for 0 and 110000

def FindRange(x):
    """Binary search for x in the inversion map.
    Returns the index i of the range that contains x,
    i.e., the largest i where _starts[i] <= x."""
    return bisect.bisect(_starts, x) - 1


def GetProps(c):
    i = FindRange(c)
    return _props[i]


def UpdateProps(start, end, update):
    assert 0 <= start <= end <= 0x10ffff
    (need_to_update, do_update, u) = (update[0], update[1], update[2])
    # Find the index i of the range in _starts that contains start.
    i = FindRange(start)
    limit = end + 1
    # Intersect [start, limit[ with ranges in _starts.
    c_start = _starts[i]
    c_limit = _starts[i + 1]
    c_props = _props[i]
    # c_start <= start < c_limit
    if c_start < start:
        update_limit = c_limit if c_limit <= limit else limit
        if need_to_update(u, start, update_limit - 1, c_props):
            # Split off [c_start, start[ with a copy of c_props.
            i += 1
            c_props = c_props.copy()
            _starts.insert(i, start)
            _props.insert(i, c_props)
            c_start = start
    # Modify all ranges that are fully inside [start, limit[.
    while c_limit <= limit:
        # start <= c_start < c_limit <= limit
        if need_to_update(u, c_start, c_limit - 1, c_props):
            do_update(u, c_start, c_limit - 1, c_props)
        if c_limit == 0x110000: return
        i += 1
        c_start = c_limit
        c_limit = _starts[i + 1]
        c_props = _props[i]
    if c_start < limit and need_to_update(u, c_start, limit - 1, c_props):
        # Split off [limit, c_limit[ with a copy of c_props.
        _starts.insert(i + 1, limit)
        _props.insert(i + 1, c_props.copy())
        # Modify [c_start, limit[ c_props.
        do_update(u, c_start, limit - 1, c_props)


def NeedToSetProps(props, start, end, c_props):
    """Returns True if props is not a sub-dict of c_props."""
    for (pname, value) in props.items():
        if pname not in c_props or value != c_props[pname]: return True
    return False


def DoSetProps(props, start, end, c_props):
    c_props.update(props)


def SetProps(start, end, props):
    UpdateProps(start, end, (NeedToSetProps, DoSetProps, props))


def NeedToSetAlways(nv, start, end, c_props):
    return True
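
# Illustrative sketch of the inversion map (not executed; assumes the
# pristine initial state):
#     SetProps(0x41, 0x5A, {"gc": "Lu"})
#     # _starts == array('l', [0, 0x41, 0x5B, 0x110000])
#     # _props  == [{}, {"gc": "Lu"}, {}, {}]
# The range containing 0x41 is split twice so that exactly [0x41..0x5A]
# carries gc=Lu.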

# For restoring boundaries after merging adjacent same-props ranges.
def AddBoundary(x):
    """Ensure that there is a range start/limit at x."""
    assert 0 <= x <= 0x10ffff
    i = FindRange(x)
    if _starts[i] == x: return
    # Split the range at x.
    c_start = _starts[i]
    c_limit = _starts[i + 1]
    c_props = _props[i]
    # c_start < x < c_limit
    i += 1
    _starts.insert(i, x)
    _props.insert(i, c_props.copy())


def SetDefaultValue(pname, value):
    """Sets the property's default value. Ignores null values."""
    prop = GetProperty(pname)
    if prop and value not in _null_names:
        value = NormalizePropertyValue(prop, value)
        if value != _null_values[prop[1][0]]:
            _defaults[prop[1][0]] = value
            SetProps(0, 0x10ffff, {prop[1][0]: value})


def SetBinaryPropertyToTrue(pname, start, end):
    prop = GetProperty(pname)
    if prop:
        assert prop[0] == "Binary"
        SetProps(start, end, {prop[1][0]: True})


def SetPropValue(prop, vname, start, end):
    value = NormalizePropertyValue(prop, vname)
    SetProps(start, end, {prop[1][0]: value})


def SetPropertyValue(pname, vname, start, end):
    prop = GetProperty(pname)
    if prop: SetPropValue(prop, vname, start, end)

# Parsing ------------------------------------------------------------------ ***

_stripped_cp_re = re.compile(r"([0-9a-fA-F]+)$")
_stripped_range_re = re.compile(r"([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)$")
_missing_re = re.compile(r"# *@missing: *0000\.\.10FFFF *; *(.+)$")

def ReadUCDLines(in_file, want_ranges=True, want_other=False,
                 want_comments=False, want_missing=False):
    """Parses lines from a semicolon-delimited UCD text file.
    Strips comments, ignores empty and all-comment lines.
    Yields tuples (type, line, ...)."""
    for line in in_file:
        line = line.strip()
        if not line: continue
        if line.startswith("#"):  # whole-line comment
            if want_missing:
                match = _missing_re.match(line)
                if match:
                    fields = match.group(1).split(";")
                    for i in range(len(fields)): fields[i] = fields[i].strip()
                    yield ("missing", line, fields)
                    continue
            if want_comments: yield ("comment", line)
            continue
        comment_start = line.find("#")  # inline comment
        if comment_start >= 0:
            line = line[:comment_start].rstrip()
            if not line: continue
        fields = line.split(";")
        for i in range(len(fields)): fields[i] = fields[i].strip()
        if want_ranges:
            first = fields[0]
            match = _stripped_range_re.match(first)
            if match:
                start = int(match.group(1), 16)
                end = int(match.group(2), 16)
                yield ("range", line, start, end, fields)
                continue
            match = _stripped_cp_re.match(first)
            if match:
                c = int(match.group(1), 16)
                yield ("range", line, c, c, fields)
                continue
        if want_other:
            yield ("other", line, fields)
        else:
            raise SyntaxError("unable to parse line\n    %s\n" % line)
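
# Example (illustrative): for a stripped PropList.txt line such as
#     0041..005A    ; Uppercase
# ReadUCDLines yields
#     ("range", "0041..005A    ; Uppercase", 0x41, 0x5A,
#      ["0041..005A", "Uppercase"])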

def AddBinaryProperty(short_name, long_name):
    _null_values[short_name] = False
    bin_prop = _properties["Math"]
    prop = ("Binary", [short_name, long_name], bin_prop[2], bin_prop[3])
    _properties[short_name] = prop
    _properties[long_name] = prop
    _properties[NormPropName(short_name)] = prop
    _properties[NormPropName(long_name)] = prop


def AddPOSIXBinaryProperty(name):
    # We only define a long name for ICU-specific (non-UCD) POSIX properties.
    _null_values[name] = False
    bin_prop = _properties["Math"]
    prop = ("Binary", ["", name], bin_prop[2], bin_prop[3])
    _properties[name] = prop
    _properties[NormPropName(name)] = prop
    # This is to match UProperty UCHAR_POSIX_ALNUM etc.
    _properties["posix" + NormPropName(name)] = prop


# Match a comment line like
# PropertyAliases-6.1.0.txt
# and extract the Unicode version.
_ucd_version_re = re.compile("# *PropertyAliases" +
                             "-([0-9]+(?:\\.[0-9]+)*)(?:d[0-9]+)?" +
                             "\\.txt")

def ParsePropertyAliases(in_file):
    global _ucd_version
    prop_type_nulls = {
        "Binary": False,
        "Catalog": "??",  # Must be specified, e.g., in @missing line.
        "Enumerated": "??",  # Must be specified.
        "Numeric": "NaN",
        "String": "",
        "Miscellaneous": ""
    }
    for data in ReadUCDLines(in_file, want_ranges=False,
                             want_other=True, want_comments=True):
        if data[0] == "comment":
            line = data[1]
            match = _ucd_version_re.match(line)
            if match:
                _ucd_version = match.group(1)
            else:
                words = line[1:].lstrip().split()
                if len(words) == 2 and words[1] == "Properties":
                    prop_type = words[0]
                    null_value = prop_type_nulls[prop_type]
        else:
            # type == "other"
            aliases = data[2]
            name = aliases[0]
            if name in _ignored_properties:
                for alias in aliases:
                    _ignored_properties.add(alias)
                    _ignored_properties.add(NormPropName(alias))
            else:
                if name.endswith("ccc"):
                    _null_values[name] = 0
                else:
                    _null_values[name] = null_value
                prop = (prop_type, aliases, set(), {})
                for alias in aliases:
                    _properties[alias] = prop
                    _properties[NormPropName(alias)] = prop
    # Add provisional and ICU-specific properties we need.
    # We add some in support of runtime API, even if we do not write
    # data for them to ppucd.txt (e.g., lccc & tccc).
    # We add others just to represent UCD data that contributes to
    # some functionality, although Unicode has not "blessed" them
    # as separate properties (e.g., Turkic_Case_Folding).

    # Turkic_Case_Folding: The 'T' mappings in CaseFolding.txt.
    name = "Turkic_Case_Folding"
    _null_values[name] = ""
    prop = ("String", [name, name], set(), {})
    _properties[name] = prop
    _properties[NormPropName(name)] = prop
    # Conditional_Case_Mappings: SpecialCasing.txt lines with conditions.
    name = "Conditional_Case_Mappings"
    _null_values[name] = ""
    prop = ("Miscellaneous", [name, name], set(), {})
    _properties[name] = prop
    _properties[NormPropName(name)] = prop
    # lccc = ccc of first cp in canonical decomposition.
    _null_values["lccc"] = 0
    ccc_prop = list(_properties["ccc"])
    ccc_prop[1] = ["lccc", "Lead_Canonical_Combining_Class"]
    prop = tuple(ccc_prop)
    _properties["lccc"] = prop
    _properties["Lead_Canonical_Combining_Class"] = prop
    _properties["leadcanonicalcombiningclass"] = prop
    # tccc = ccc of last cp in canonical decomposition.
    _null_values["tccc"] = 0
    ccc_prop[1] = ["tccc", "Trail_Canonical_Combining_Class"]
    prop = tuple(ccc_prop)
    _properties["tccc"] = prop
    _properties["Trail_Canonical_Combining_Class"] = prop
    _properties["trailcanonicalcombiningclass"] = prop
    # Script_Extensions
    if "scx" not in _properties:
        _null_values["scx"] = ""
        prop = ("Miscellaneous", ["scx", "Script_Extensions"], set(), {})
        _properties["scx"] = prop
        _properties["Script_Extensions"] = prop
        _properties["scriptextensions"] = prop
    # General Category as a bit mask.
    _null_values["gcm"] = "??"
    gc_prop = _properties["gc"]
    prop = ("Bitmask", ["gcm", "General_Category_Mask"], gc_prop[2], gc_prop[3])
    _properties["gcm"] = prop
    _properties["General_Category_Mask"] = prop
    _properties["generalcategorymask"] = prop
    # Various binary properties.
    AddBinaryProperty("Sensitive", "Case_Sensitive")
    AddBinaryProperty("nfdinert", "NFD_Inert")
    AddBinaryProperty("nfkdinert", "NFKD_Inert")
    AddBinaryProperty("nfcinert", "NFC_Inert")
    AddBinaryProperty("nfkcinert", "NFKC_Inert")
    AddBinaryProperty("segstart", "Segment_Starter")
    # http://www.unicode.org/reports/tr51/#Emoji_Properties
    AddBinaryProperty("Emoji", "Emoji")
    AddBinaryProperty("EPres", "Emoji_Presentation")
    AddBinaryProperty("EMod", "Emoji_Modifier")
    AddBinaryProperty("EBase", "Emoji_Modifier_Base")
    AddBinaryProperty("EComp", "Emoji_Component")
    AddBinaryProperty("ExtPict", "Extended_Pictographic")
    # C/POSIX character classes that do not have Unicode property [value] aliases.
    # See uchar.h.
    AddPOSIXBinaryProperty("alnum")
    AddPOSIXBinaryProperty("blank")
    AddPOSIXBinaryProperty("graph")
    AddPOSIXBinaryProperty("print")
    AddPOSIXBinaryProperty("xdigit")
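
# Example (illustrative): a PropertyAliases.txt data line such as
#     lb ; Line_Break
# under the "# Enumerated Properties" heading creates
#     prop = ("Enumerated", ["lb", "Line_Break"], set(), {})
# and registers it under "lb", "Line_Break", and "linebreak".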

def ParsePropertyValueAliases(in_file):
    global _binary_values
    for data in ReadUCDLines(in_file, want_ranges=False,
                             want_other=True, want_missing=True):
        if data[0] == "missing":
            SetDefaultValue(data[2][0], data[2][1])
        else:
            # type == "other"
            fields = data[2]
            pname = fields[0]
            prop = GetProperty(pname)
            if prop:
                del fields[0]  # Only the list of aliases remains.
                short_name = fields[0]
                if short_name == "n/a":  # no short name
                    fields[0] = ""
                    short_name = fields[1]
                prop[2].add(short_name)
                values = prop[3]
                for alias in fields:
                    if alias:
                        values[alias] = fields
                        values[NormPropName(alias)] = fields
                if prop[0] == "Binary" and not _binary_values:
                    _binary_values = values
    # Some of the @missing lines with non-null default property values
    # are in files that we do not parse;
    # either because the data for that property is easily derived
    # (i.e., the @missing line would be the only reason to parse such a file)
    # or because we compute the property at runtime,
    # such as the Hangul_Syllable_Type.
    if "dt" not in _defaults:  # DerivedDecompositionType.txt
        _defaults["dt"] = "None"
    if "nt" not in _defaults:  # DerivedNumericType.txt
        _defaults["nt"] = "None"
    if "hst" not in _defaults:  # HangulSyllableType.txt
        _defaults["hst"] = "NA"
    if "gc" not in _defaults:  # No @missing line in any .txt file?
        _defaults["gc"] = "Cn"
    # Copy the gc default value to gcm.
    _defaults["gcm"] = _defaults["gc"]
    # Add ISO 15924-only script codes.
    # Only for the ICU script code API, not necessary for parsing the UCD.
    script_prop = _properties["sc"]
    short_script_names = script_prop[2]  # set
    script_values = script_prop[3]  # dict
    remove_scripts = []
    for script in _scripts_only_in_iso15924:
        if script in short_script_names:
            remove_scripts.append(script)
        else:
            short_script_names.add(script)
            # Do not invent a Unicode long script name before the UCD adds the script.
            script_list = [script, script]  # [short, long]
            script_values[script] = script_list
            # Probably not necessary because
            # we will not parse these scripts from the UCD:
            script_values[NormPropName(script)] = script_list
    if remove_scripts:
        raise ValueError(
            "remove %s from _scripts_only_in_iso15924" % remove_scripts)
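
# Example (illustrative): a PropertyValueAliases.txt line such as
#     lb ; BA ; Break_After
# adds "BA" to the lb property's set of short value names and maps
# "BA", "Break_After", "ba" and "breakafter" to ["BA", "Break_After"].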

def ParseBlocks(in_file):
    for data in ReadUCDLines(in_file, want_missing=True):
        if data[0] == "missing":
            SetDefaultValue("blk", data[2][0])
        else:
            # type == "range"
            (start, end, name) = (data[2], data[3], data[4][1])
            _blocks.append((start, end, {"blk": name}))
            SetPropertyValue("blk", name, start, end)
    _blocks.sort()
    # Check for overlapping blocks.
    prev_end = -1
    for b in _blocks:
        start = b[0]
        end = b[1]
        if prev_end >= start:
            raise ValueError(
                "block %04lX..%04lX %s overlaps with another ending at %04lX" %
                (start, end, b[2]["blk"], prev_end))
        prev_end = end
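
# Example (illustrative): the Blocks.txt line
#     0000..007F; Basic Latin
# appends (0, 0x7F, {"blk": "Basic Latin"}) to _blocks and sets
# blk=ASCII (the canonical short value name) on that range.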

def ParseUnicodeData(in_file):
    dt_prop = GetProperty("dt")
    range_first_line = ""
    range_first = -1
    for data in ReadUCDLines(in_file, want_missing=True):
        # type == "range"
        (line, c, end, fields) = (data[1], data[2], data[3], data[4])
        assert c == end
        name = fields[1]
        if name.startswith("<"):
            if name.endswith(", First>"):
                if range_first >= 0:
                    raise SyntaxError(
                        "error: unterminated range started at\n    %s\n" %
                        range_first_line)
                range_first = c
                range_first_line = line
                continue
            elif name.endswith(", Last>"):
                if range_first < 0:
                    raise SyntaxError(
                        "error: range end without start at\n    %s\n" %
                        line)
                elif range_first > c:
                    raise SyntaxError(
                        "error: range start/end out of order at\n    %s\n    %s\n" %
                        (range_first_line, line))
                first_name = range_first_line.split(";")[1][1:-8]
                name = name[1:-7]
                if first_name != name:
                    raise SyntaxError(
                        "error: range start/end name mismatch at\n    %s\n    %s\n" %
                        (range_first_line, line))
                end = c
                c = range_first
                range_first = -1
                # Remember algorithmic name ranges.
                if "Ideograph" in name:
                    prefix = "CJK UNIFIED IDEOGRAPH-"
                    if c == 0x17000 or c == 0x18D00: prefix = "TANGUT IDEOGRAPH-"
                    _alg_names_ranges.append([c, end, "han", prefix])
                elif name == "Hangul Syllable":
                    _alg_names_ranges.append([c, end, "hangul"])
                name = ""
            else:
                # Ignore non-names like <control>.
                name = ""
        props = {}
        if name: props["na"] = name
        props["gc"] = fields[2]
        ccc = int(fields[3])
        if ccc: props["ccc"] = ccc
        props["bc"] = fields[4]
        # Decomposition type & mapping.
        dm = fields[5]
        if dm:
            if dm.startswith("<"):
                dt_limit = dm.index(">")
                dt = NormalizePropertyValue(dt_prop, dm[1:dt_limit])
                dm = dm[dt_limit + 1:].lstrip()
            else:
                dt = "Can"
            props["dt"] = dt
            props["dm"] = dm
        # Numeric type & value.
        decimal = fields[6]
        digit = fields[7]
        nv = fields[8]
        if (decimal and decimal != nv) or (digit and digit != nv):
            raise SyntaxError("error: numeric values differ at\n    %s\n" % line)
        if nv:
            # Map improper fractions to proper ones.
            # U+109F7 MEROITIC CURSIVE FRACTION TWO TWELFTHS
            # .. U+109FF MEROITIC CURSIVE FRACTION TEN TWELFTHS
            if nv == "2/12":
                nv = "1/6"
            elif nv == "3/12":
                nv = "1/4"
            elif nv == "4/12":
                nv = "1/3"
            elif nv == "6/12":
                nv = "1/2"
            elif nv == "8/12":
                nv = "2/3"
            elif nv == "9/12":
                nv = "3/4"
            elif nv == "10/12":
                nv = "5/6"
            props["nv"] = nv
            props["nt"] = "De" if decimal else "Di" if digit else "Nu"
        if fields[9] == "Y": props["Bidi_M"] = True
        # ICU 49 and above does not support Unicode_1_Name any more.
        # See ticket #9013.
        # na1 = fields[10]
        # if na1: props["na1"] = na1
        # ISO_Comment is deprecated and has no values.
        # isc = fields[11]
        # if isc: props["isc"] = isc
        # Simple case mappings.
        suc = fields[12]
        slc = fields[13]
        stc = fields[14]
        if suc: props["suc"] = suc
        if slc: props["slc"] = slc
        if stc: props["stc"] = stc
        SetProps(c, end, props)
    if range_first >= 0:
        raise SyntaxError(
            "error: unterminated range started at\n    %s\n" %
            range_first_line)
    # Hangul syllables have canonical decompositions which are not listed in UnicodeData.txt.
    SetPropertyValue("dt", "Can", 0xac00, 0xd7a3)
    _alg_names_ranges.sort()
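
# Example (illustrative): the UnicodeData.txt line
#     0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
# yields props == {"na": "LATIN CAPITAL LETTER A", "gc": "Lu",
#                  "bc": "L", "slc": "0061"} for the single code point U+0041.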

_names_h1_re = re.compile("@@\t([0-9a-fA-F]+)\t(.+?)\t([0-9a-fA-F]+)$")
_names_h2_re = re.compile("@\t\t(.+)")
_names_char_re = re.compile("([0-9a-fA-F]+)\t.+")

def ParseNamesList(in_file):
    pending_h2 = ""
    for line in in_file:
        line = line.strip()
        if not line: continue
        match = _names_h1_re.match(line)
        if match:
            pending_h2 = ""  # Drop a pending h2 when we get to an h1.
            start = int(match.group(1), 16)
            end = int(match.group(3), 16)
            comment = match.group(2).replace(u"\xa0", " ")
            _h1.append((start, end, comment))
            continue
        match = _names_h2_re.match(line)
        if match:
            pending_h2 = match.group(1).replace(u"\xa0", " ")
            continue
        if pending_h2:
            match = _names_char_re.match(line)
            if match:
                c = int(match.group(1), 16)
                _h2.append((c, pending_h2))
                pending_h2 = ""
    _h1.sort()
    _h2.sort()


def ParseNamedProperties(in_file):
    """Parses a .txt file where the first column is a code point range
    and the second column is a property name.
    Sets binary properties to True,
    and other properties to the values in the third column."""
    for data in ReadUCDLines(in_file, want_missing=True):
        if data[0] == "missing":
            SetDefaultValue(data[2][0], data[2][1])
        else:
            # type == "range"
            if len(data[4]) == 2:
                SetBinaryPropertyToTrue(data[4][1], data[2], data[3])
            else:
                SetPropertyValue(data[4][1], data[4][2], data[2], data[3])


def ParseOneProperty(in_file, pname):
    """Parses a .txt file where the first column is a code point range
    and the second column is the value of a known property."""
    prop = GetProperty(pname)
    for data in ReadUCDLines(in_file, want_missing=True):
        if data[0] == "missing":
            SetDefaultValue(pname, data[2][0])
        else:
            # type == "range"
            SetPropValue(prop, data[4][1], data[2], data[3])


def ParseBidiMirroring(in_file): ParseOneProperty(in_file, "bmg")
def ParseDerivedAge(in_file): ParseOneProperty(in_file, "age")
def ParseDerivedBidiClass(in_file): ParseOneProperty(in_file, "bc")
def ParseDerivedJoiningGroup(in_file): ParseOneProperty(in_file, "jg")
def ParseDerivedJoiningType(in_file): ParseOneProperty(in_file, "jt")
def ParseEastAsianWidth(in_file): ParseOneProperty(in_file, "ea")
def ParseGraphemeBreakProperty(in_file): ParseOneProperty(in_file, "GCB")
def ParseIndicPositionalCategory(in_file): ParseOneProperty(in_file, "InPC")
def ParseIndicSyllabicCategory(in_file): ParseOneProperty(in_file, "InSC")
def ParseLineBreak(in_file): ParseOneProperty(in_file, "lb")
def ParseScripts(in_file): ParseOneProperty(in_file, "sc")
def ParseScriptExtensions(in_file): ParseOneProperty(in_file, "scx")
def ParseSentenceBreak(in_file): ParseOneProperty(in_file, "SB")
def ParseVerticalOrientation(in_file): ParseOneProperty(in_file, "vo")
def ParseWordBreak(in_file): ParseOneProperty(in_file, "WB")
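
# Example (illustrative): ParseNamedProperties turns a
# DerivedCoreProperties.txt line such as
#     0041..005A    ; Alphabetic
# into SetBinaryPropertyToTrue("Alphabetic", 0x41, 0x5A).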

def DoSetNameAlias(alias, start, end, c_props):
    if "Name_Alias" in c_props:
        c_props["Name_Alias"] += ',' + alias
    else:
        c_props["Name_Alias"] = alias


def ParseNameAliases(in_file):
    """Parses Name_Alias from NameAliases.txt.
    A character can have multiple aliases.

    In Unicode 6.0, there are two columns,
    with a name correction in the second column.

    In Unicode 6.1, there are three columns.
    The second contains an alias, the third its type.
    The documented types are:
    correction, control, alternate, figment, abbreviation

    This function does not sort the types, assuming they appear in this order."""
    for data in ReadUCDLines(in_file):
        start = data[2]
        end = data[3]
        if start != end:
            raise ValueError("NameAliases.txt has an alias for a range %04lX..%04lX" %
                             (start, end))
        fields = data[4]
        if len(fields) == 2:
            alias = "correction=" + fields[1]
        else:
            alias = fields[2] + '=' + fields[1]
        update = (NeedToSetAlways, DoSetNameAlias, alias)
        UpdateProps(start, end, update)
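
# Example (illustrative): the NameAliases.txt lines
#     0000;NULL;control
#     0000;NUL;abbreviation
# accumulate Name_Alias == "control=NULL,abbreviation=NUL" for U+0000.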

def NeedToSetNumericValue(nv, start, end, c_props):
    c_nv = c_props.get("nv")
    if c_nv == None:
        # DerivedNumericValues.txt adds a Numeric_Value.
        assert "nt" not in c_props
        return True
    if nv != c_nv:
        raise ValueError(("UnicodeData.txt has nv=%s for %04lX..%04lX " +
                          "but DerivedNumericValues.txt has nv=%s") %
                         (c_nv, start, end, nv))
    return False


def DoSetNumericValue(nv, start, end, c_props):
    c_props.update({"nt": "Nu", "nv": nv})


def ParseDerivedNumericValues(in_file):
    """Parses DerivedNumericValues.txt.
    For most characters, the numeric type & value were parsed previously
    from UnicodeData.txt but that does not show the values for Han characters.
    Here we check that values match those from UnicodeData.txt
    and add new ones."""
    # Ignore the @missing line which has an incorrect number of fields,
    # and the "NaN" in the wrong field (at least in Unicode 5.1..6.1).
    # Also, "NaN" is just the Numeric null value anyway.
    for data in ReadUCDLines(in_file):
        # Conditional update to the numeric value in the 4th field.
        update = (NeedToSetNumericValue, DoSetNumericValue, data[4][3])
        UpdateProps(data[2], data[3], update)


def ParseCaseFolding(in_file):
    for data in ReadUCDLines(in_file, want_missing=True):
        if data[0] == "missing":
            assert data[2][0] == "C"  # common to scf & cf
            SetDefaultValue("scf", data[2][1])
            SetDefaultValue("cf", data[2][1])
        else:
            # type == "range"
            start = data[2]
            end = data[3]
            status = data[4][1]
            mapping = data[4][2]
            assert status in "CSFT"
            if status == "C":
                SetProps(start, end, {"scf": mapping, "cf": mapping})
            elif status == "S":
                SetPropertyValue("scf", mapping, start, end)
            elif status == "F":
                SetPropertyValue("cf", mapping, start, end)
            else:  # status == "T"
                SetPropertyValue("Turkic_Case_Folding", mapping, start, end)


def DoSetConditionalCaseMappings(ccm, start, end, c_props):
    if "Conditional_Case_Mappings" in c_props:
        c_props["Conditional_Case_Mappings"] += ',' + ccm
    else:
        c_props["Conditional_Case_Mappings"] = ccm


def ParseSpecialCasing(in_file):
    for data in ReadUCDLines(in_file, want_missing=True):
        if data[0] == "missing":
            SetDefaultValue("lc", data[2][0])
            SetDefaultValue("tc", data[2][1])
            SetDefaultValue("uc", data[2][2])
        else:
            # type == "range"
            start = data[2]
            end = data[3]
            fields = data[4]
            if len(fields) < 5 or not fields[4]:
                # Unconditional mappings.
                SetProps(start, end, {"lc": fields[1], "tc": fields[2], "uc": fields[3]})
            else:
                # Conditional_Case_Mappings
                ccm = (fields[4] + ":lc=" + fields[1] +
                       "&tc=" + fields[2] + "&uc=" + fields[3])
                update = (NeedToSetAlways, DoSetConditionalCaseMappings, ccm)
                UpdateProps(start, end, update)


def ParseBidiBrackets(in_file):
    for data in ReadUCDLines(in_file, want_missing=True):
        if data[0] == "missing":
            SetDefaultValue("bpt", data[2][1])
        else:
            # type == "range"
            start = data[2]
            end = data[3]
            assert start == end
            mapping = data[4][1]
            bracket_type = data[4][2]
            SetProps(start, end, {"bpb": mapping, "bpt": bracket_type})
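
# Example (illustrative): the BidiBrackets.txt line
#     0028; 0029; o # LEFT PARENTHESIS
# sets bpb=0029 and bpt=o for U+0028, while the conditional
# SpecialCasing.txt line
#     0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
# yields Conditional_Case_Mappings == "tr:lc=0069&tc=0130&uc=0130".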

# Postprocessing ----------------------------------------------------------- ***

def PrintedSize(pname, value):
    if isinstance(value, bool):
        if value:
            return len(pname) + 1  # ";pname"
        else:
            return len(pname) + 2  # ";-pname"
    else:
        return len(pname) + len(str(value)) + 2  # ";pname=value"


def CompactBlock(b, i):
    assert b[0] == _starts[i]
    b_props = b[2]  # Normally just blk from Blocks.txt.
    # b_props["blk"] has not been canonicalized yet.
    b_props["blk"] = _props[i]["blk"]
    orig_i = i
    # Count the number of occurrences of each property's value in this block.
    # To minimize the output, count the number of assigned ranges,
    # not the number of code points.
    num_ranges = 0
    prop_counters = {}
    if "gc" in b_props:
        b_is_unassigned = b_props["gc"] == "Cn"  # Unreachable with normal data.
    else:
        b_is_unassigned = _defaults["gc"] == "Cn"  # This is expected to be true.
    while True:
        start = _starts[i]
        if start > b[1]: break
        props = _props[i]
        if "gc" in props:
            is_unassigned = props["gc"] == "Cn"
        else:
            is_unassigned = b_is_unassigned
        if is_unassigned:
            # Compact an unassigned range inside the block and
            # mark it to be written with "unassigned".
            # It falls back to default properties, not block properties,
            # except for the blk=Block property.
            assert props["blk"] == b_props["blk"]
            del props["blk"]
            for pname in list(props.keys()):  # list() copies the keys so we can del props[pname].
                if props[pname] == _null_or_defaults[pname]: del props[pname]
            # What remains are unusual default values for unassigned code points.
            # For example, bc=R or lb=ID.
            # See http://www.unicode.org/reports/tr44/#Default_Values_Table
            props["unassigned"] = True
        else:
            for (pname, value) in props.items():
                if pname in prop_counters:
                    counter = prop_counters[pname]
                else:
                    counter = {_null_or_defaults[pname]: num_ranges}
                    prop_counters[pname] = counter
                if value in counter:
                    counter[value] += 1
                else:
                    counter[value] = 1
            # Also count default values for properties that do not occur in a range.
            for pname in prop_counters:
                if pname not in props:
                    counter = prop_counters[pname]
                    value = _null_or_defaults[pname]
                    counter[value] += 1
            num_ranges += 1
            # Invariant: For each counter, the sum of counts must equal num_ranges.
        i += 1
    # For each property that occurs within this block,
    # set the value that reduces the file size the most as a block property value.
    # This is usually the most common value.
    for (pname, counter) in prop_counters.items():
        default_value = _null_or_defaults[pname]
        default_size = PrintedSize(pname, default_value) * counter[default_value]
        max_value = None
        max_count = 0
        max_savings = 0
        for (value, count) in counter.items():
            if value != default_value and count > 1:
                # Does the file get smaller by setting the block default?
                # We save writing the block value as often as it occurs,
                # minus once for writing it for the block,
                # minus writing the default value instead.
                savings = PrintedSize(pname, value) * (count - 1) - default_size
                if savings > max_savings:
                    max_value = value
                    max_count = count
                    max_savings = savings
        # Do not compress uncompressible properties,
        # with an exception for many empty-string values in a block
        # (NFKC_CF='' for tags and variation selectors).
        if (max_savings > 0 and
                ((pname not in _uncompressible_props) or
                 (max_value == '' and max_count >= 12))):
            b_props[pname] = max_value
    # For each range and property, remove the default+block value
    # but set the default value if that property was not set
    # (i.e., it used to inherit the default value).
    b_defaults = _null_or_defaults.copy()
    b_defaults.update(b_props)
    i = orig_i
    while True:
        start = _starts[i]
        if start > b[1]: break
        props = _props[i]
        if "unassigned" not in props:
            # Compact an assigned range inside the block.
            for pname in prop_counters:
                if pname in props:
                    if props[pname] == b_defaults[pname]: del props[pname]
                elif pname in b_props:
                    # b_props only has non-default values.
                    # Set the default value if it used to be inherited.
                    props[pname] = _null_or_defaults[pname]
            # If there is only one assigned range, then move all of its properties
            # to the block.
            if num_ranges == 1:
                b_props.update(props)
                props.clear()
        i += 1
    # Return the _starts index of the first range after this block.
    return i
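
# Example (illustrative): if lb=BA occurs in 7 of a block's assigned ranges
# and the default lb value in the remaining 2, then setting lb=BA on the
# block saves
#     PrintedSize("lb", "BA") * (7 - 1) - PrintedSize("lb", default) * 2
# bytes: six per-range ";lb=BA" fields disappear, while the two
# previously-default ranges must now write the default explicitly.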

def CompactNonBlock(limit, i):
    """Remove default property values from between-block ranges."""
    default_is_unassigned = _defaults["gc"] == "Cn"  # This is expected to be true.
    while True:
        start = _starts[i]
        if start >= limit: break
        props = _props[i]
        if "gc" in props:
            is_unassigned = props["gc"] == "Cn"
        else:
            is_unassigned = default_is_unassigned
        for pname in list(props.keys()):  # list() copies the keys so we can del props[pname].
            if props[pname] == _null_or_defaults[pname]: del props[pname]
        assert "blk" not in props
        # If there are no props left, then nothing will be printed.
        # Otherwise, add "unassigned" for more obvious output.
        if props and is_unassigned:
            props["unassigned"] = True
        i += 1
    # Return the _starts index of the first range after this block.
    return i


def CompactBlocks():
    """Optimizes block properties.
    Sets properties on blocks to the most commonly used values,
    and removes default+block values from code point properties."""
    # Ensure that there is a boundary in _starts for each block
    # so that the simple mixing method below works.
    for b in _blocks:
        AddBoundary(b[0])
        limit = b[1] + 1
        if limit <= 0x10ffff: AddBoundary(limit)
    # Walk through ranges and blocks together.
    i = 0
    for b in _blocks:
        b_start = b[0]
        if _starts[i] < b_start:
            i = CompactNonBlock(b_start, i)
        i = CompactBlock(b, i)
    CompactNonBlock(0x110000, i)

# Output ------------------------------------------------------------------- ***

def AppendRange(fields, start, end):
    if start == end:
        fields.append("%04lX" % start)
    else:
        fields.append("%04lX..%04lX" % (start, end))


def AppendProps(fields, props):
    # Sort property names (props keys) by their normalized forms
    # and output properties in that order.
    for pname in sorted(props, key=NormPropName):
        value = props[pname]
        if isinstance(value, bool):
            if not value: pname = "-" + pname
            fields.append(pname)
        else:
            fields.append("%s=%s" % (pname, value))


def WriteFieldsRangeProps(fields, start, end, props, out_file):
    AppendRange(fields, start, end)
    AppendProps(fields, props)
    out_file.write(";".join(fields))
    out_file.write("\n")


def EscapeNonASCII(s):
    i = 0
    while i < len(s):
        c = ord(s[i])
        if c <= 0x7f:
            i = i + 1
        else:
            if c <= 0xffff:
                esc = u"\\u%04X" % c
            else:
                esc = u"\\U%08X" % c
            s = s[:i] + esc + s[i+1:]
            i = i + len(esc)
    return s
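
# Example (illustrative):
#     WriteFieldsRangeProps(["cp"], 0x41, 0x5A,
#                           {"gc": "Lu", "Upper": True}, out_file)
# writes the line
#     cp;0041..005A;gc=Lu;Upper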

def WritePreparsedUCD(out_file):
    out_file.write("""# Preparsed UCD generated by ICU preparseucd.py
# Copyright (C) 1991 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
""")
    out_file.write("ucd;%s\n\n" % _ucd_version)
    # Sort property names (props keys) by their normalized forms
    # and output properties in that order.
    pnames = sorted(_null_values, key=NormPropName)
    for pname in pnames:
        prop = _properties[pname]
        out_file.write(";".join(["property", prop[0]] + prop[1]))
        out_file.write("\n")
    out_file.write("\n")
    out_file.write(";".join(["binary"] + _binary_values["N"]))
    out_file.write("\n")
    out_file.write(";".join(["binary"] + _binary_values["Y"]))
    out_file.write("\n")
    for pname in pnames:
        prop = _properties[pname]
        short_names = prop[2]
        if short_names and prop[0] != "Binary":
            for name in sorted(short_names):
                out_file.write(";".join(["value", prop[1][0]] + prop[3][name]))
                out_file.write("\n")
    out_file.write("\n")
    # Ensure that there is a boundary in _starts for each
    # range of data we mix into the output,
    # so that the simple mixing method below works.
    for b in _blocks: AddBoundary(b[0])
    for r in _alg_names_ranges: AddBoundary(r[0])
    for h in _h1: AddBoundary(h[0])
    for h in _h2: AddBoundary(h[0])
    # Write the preparsed data. ppucd.txt = preparsed UCD
    # Syntax: http://site.icu-project.org/design/props/ppucd
    WriteFieldsRangeProps(["defaults"], 0, 0x10ffff, _defaults, out_file)
    i_blocks = 0
    i_alg = 0
    i_h1 = 0
    i_h2 = 0
    b_end = -1
    for i in range(len(_starts) - 1):
        start = _starts[i]
        end = _starts[i + 1] - 1
        # Block with default properties.
        if i_blocks < len(_blocks) and start == _blocks[i_blocks][0]:
            b = _blocks[i_blocks]
            b_end = b[1]
            WriteFieldsRangeProps(["\nblock"], b[0], b_end, b[2], out_file)
            i_blocks += 1
        # NamesList h1 heading (for [most of] a block).
        if i_h1 < len(_h1) and start == _h1[i_h1][0]:
            h = _h1[i_h1]
            out_file.write("# %04lX..%04lX %s\n" % (h[0], h[1], EscapeNonASCII(h[2])))
            i_h1 += 1
        # Algorithmic-names range.
        if i_alg < len(_alg_names_ranges) and start == _alg_names_ranges[i_alg][0]:
            r = _alg_names_ranges[i_alg]
            fields = ["algnamesrange"]
            AppendRange(fields, r[0], r[1])
            fields.extend(r[2:])
            out_file.write(";".join(fields))
            out_file.write("\n")
            i_alg += 1
        # NamesList h2 heading.
        if i_h2 < len(_h2) and start == _h2[i_h2][0]:
            out_file.write("# %s\n" % EscapeNonASCII(_h2[i_h2][1]))
            i_h2 += 1
        # Code point/range data.
        props = _props[i]
        # Omit ranges with only default+block properties.
        if props:
            if start > b_end and b_end >= 0:
                # First range with values after the last block.
                # Separate it visually from the block lines.
                out_file.write("\n# No block\n")
                b_end = -1
            if "unassigned" in props:
                # Do not output "unassigned" as a property.
                del props["unassigned"]
                line_type = "unassigned"
            else:
                line_type = "cp"
            WriteFieldsRangeProps([line_type], start, end, props, out_file)

# Write Normalizer2 input files -------------------------------------------- ***
# Ported from gennorm/store.c.

def WriteAllCC(out_file):
    out_file.write("# Canonical_Combining_Class (ccc) values\n")
    prev_start = 0
    prev_cc = 0
    for i in range(len(_starts)):
        start = _starts[i]
        props = _props[i]
        cc = props.get("ccc")
        if not cc: cc = 0
        if prev_cc != cc:
            if prev_cc != 0:
                last_code_point = start - 1
                if prev_start == last_code_point:
                    out_file.write("%04X:%d\n" % (last_code_point, prev_cc))
                else:
                    out_file.write("%04X..%04X:%d\n" %
                                   (prev_start, last_code_point, prev_cc))
            prev_start = start
            prev_cc = cc


def HasMapping(c):
    props = GetProps(c)
    dt = props.get("dt")
    return dt and dt != "None"


def HasOneWayMapping(c):
    while True:
        props = GetProps(c)
        dt = props.get("dt")
        if not dt or dt == "None":
            return False  # no mapping
        elif dt == "Can":
            # The canonical decomposition is a one-way mapping if
            # - it does not map to exactly two code points
            # - c has ccc!=0
            # - c has the Composition_Exclusion property
            # - its starter has a one-way mapping (loop for this)
            # - its non-starter decomposes
            nfd = props["dm"].split()
            if (len(nfd) != 2 or
                    props.get("ccc") or
                    props.get("Comp_Ex") or
                    HasMapping(int(nfd[1], 16))):
                return True
            c = int(nfd[0], 16)  # continue
        else:
            # c has a compatibility mapping.
            return True
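
# Example (illustrative): U+00C5 (Å) decomposes canonically to "0041 030A";
# it has ccc=0, is not composition-excluded, and U+030A has no decomposition,
# so HasOneWayMapping follows the starter U+0041, finds no mapping there,
# and returns False: the mapping round-trips.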

_data_file_copyright = """# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (C) 1999-2016, International Business Machines
# Corporation and others. All Rights Reserved.
#
"""

def WriteNorm2NFCTextFile(path):
    global _data_file_copyright
    with open(os.path.join(path, "nfc.txt"), "w") as out_file:
        out_file.write(
            _data_file_copyright + """# file name: nfc.txt
#
# machine-generated by ICU preparseucd.py
#
# Complete data for Unicode NFC normalization.

* Unicode """ + _ucd_version + """

""")
        WriteAllCC(out_file)
        out_file.write("\n# Canonical decomposition mappings\n")
        for i in range(len(_starts) - 1):
            start = _starts[i]
            end = _starts[i + 1] - 1
            props = _props[i]
            dm = props.get("dm")
            if dm and dm[0] != '<' and props["dt"] == "Can":
                assert start == end
                # The Comp_Ex=Full_Composition_Exclusion property tells us
                # whether the canonical decomposition round-trips.
                separator = '>' if props.get("Comp_Ex") else '='
                out_file.write("%04X%s%s\n" % (start, separator, dm))


def WriteNorm2NFKCTextFile(path):
    global _data_file_copyright
    with open(os.path.join(path, "nfkc.txt"), "w") as out_file:
        out_file.write(
            _data_file_copyright + """# file name: nfkc.txt
#
# machine-generated by ICU preparseucd.py
#
# Data for Unicode NFKC normalization.
# This file contains only compatibility decomposition mappings,
# plus those canonical decompositions that change from NFC round-trip mappings
# to NFKC one-way mappings.
# Use this file as the second gennorm2 input file after nfc.txt.

* Unicode """ + _ucd_version + """

""")
        for i in range(len(_starts) - 1):
            start = _starts[i]
            end = _starts[i + 1] - 1
            props = _props[i]
            dm = props.get("dm")
            if dm and dm[0] != '<':
                assert start == end
                if props["dt"] != "Can":
                    # Compatibility decomposition.
                    out_file.write("%04X>%s\n" % (start, dm))
                elif not props.get("Comp_Ex") and HasOneWayMapping(start):
                    # NFC round-trip mapping turns into NFKC one-way mapping.
                    out_file.write("%04X>%s # NFC round-trip, NFKC one-way\n" %
                                   (start, dm))


def WriteNorm2NFKC_CFTextFile(path):
    global _data_file_copyright
    with open(os.path.join(path, "nfkc_cf.txt"), "w") as out_file:
        out_file.write(
            _data_file_copyright + """# file name: nfkc_cf.txt
#
# machine-generated by ICU preparseucd.py
#
# This file contains the Unicode NFKC_CF mappings,
# extracted from the UCD file DerivedNormalizationProps.txt,
# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.

""")
        out_file.write("* Unicode " + _ucd_version + "\n\n")
        prev_start = 0
        prev_end = 0
        prev_nfkc_cf = None
        for i in range(len(_starts) - 1):
            start = _starts[i]
            end = _starts[i + 1] - 1
            props = _props[i]
            nfkc_cf = props.get("NFKC_CF")
            # Merge with the previous range if possible,
            # or remember this range for merging.
            if nfkc_cf == prev_nfkc_cf and (prev_end + 1) == start:
                prev_end = end
            else:
                if prev_nfkc_cf != None and (not prev_nfkc_cf or prev_nfkc_cf[0] != '<'):
                    if prev_start == prev_end:
                        out_file.write("%04X>%s\n" % (prev_start, prev_nfkc_cf))
                    else:
                        out_file.write("%04X..%04X>%s\n" %
                                       (prev_start, prev_end, prev_nfkc_cf))
                prev_start = start
                prev_end = end
                prev_nfkc_cf = nfkc_cf


def WriteNorm2(path):
    WriteNorm2NFCTextFile(path)
    WriteNorm2NFKCTextFile(path)
    WriteNorm2NFKC_CFTextFile(path)
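
# Example (illustrative): the resulting gennorm2 syntax looks like
#     0300..0314:230        (ccc values from WriteAllCC)
#     00C0=0041 0300        (round-trip canonical mapping)
#     212B>00C5             (one-way mapping; composition-excluded)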

# UTS #46 Normalizer2 input file ------------------------------------------- ***

_idna_replacements = [
    # Several versions of avoiding circular FFFD>FFFD mappings,
    # depending on the version of the input file.
    (re.compile(r"FFFD ; disallowed"), "# FFFD (avoid circular mapping)"),
    (re.compile(r"\.\.FFFD"), "..FFFC"),
    (re.compile(r"(FFF[^E])\.\.FFFF"), r"\1..FFFC"),
    # Since we switch between checking and not checking for STD3 character
    # restrictions at runtime, checking the non-LDH ASCII characters in code,
    # we treat these values here like their regular siblings.
    (re.compile(r"^([^;]+) ; disallowed_STD3_valid"), r"# \1disallowed_STD3_valid"),
    (re.compile(r"; disallowed_STD3_mapped +; "), ">"),
    # For UTS #46, we do not care about "not valid in IDNA2008".
    (re.compile(r"; *; NV8 +"), ""),
    # ICU 63+ normalization no longer allows mappings for surrogate code points,
    # and the UTS #46 code handles them instead.
    (re.compile(r"^D800..DFFF ; disallowed"), r"# D800..DFFF disallowed in code"),
    # Normal transformations.
    (re.compile(r"; disallowed"), ">FFFD"),
    (re.compile(r"; ignored"), ">"),
    (re.compile(r"^([^;]+) ; valid"), r"# \1valid"),
    (re.compile(r"; mapped +; "), ">"),
    (re.compile(r"^([^;]+) ; deviation +; "), r"# \1deviation >")
]

def IdnaToUTS46TextFile(s, t):
    """Turns Unicode IdnaMappingTable.txt into ICU gennorm2 source file format."""
    # Different input/output file names.
    dest_path = os.path.dirname(t)
    t = os.path.join(dest_path, "uts46.txt")
    with open(s, "r") as in_file, open(t, "w") as out_file:
        out_file.write("# Original file:\n")
        for line in in_file:
            orig_line = line
            if line.startswith("# For documentation"):
                out_file.write(line)
                out_file.write(r"""
# ================================================
# This file has been reformatted into syntax for the
# gennorm2 Normalizer2 data generator tool.
#
# "valid", "disallowed_STD3_valid" and "deviation" lines are commented out.
# "mapped" and "disallowed_STD3_mapped" are changed to use the ">" mapping syntax.
# "disallowed" lines map to U+FFFD.
# "ignored" lines map to an empty string.
#
# Characters disallowed under STD3 rules are treated as valid or mapped;
# they are handled in code.
# Deviation characters are also handled in code.
#
# Use this file as the second gennorm2 input file after nfc.txt.
# ================================================
""")
                continue
            if line[0] in "#\r\n":
                out_file.write(line)
                continue
            for rep in _idna_replacements: line = rep[0].sub(rep[1], line)
            # Align inline comments at column 40.
            comment_pos = line.find("#", 1)
            if comment_pos < 40:
                line = (line[:comment_pos] + ((40 - comment_pos) * ' ') +
                        line[comment_pos:])
            elif comment_pos > 40:
                space_pos = comment_pos
                while space_pos > 0 and line[space_pos - 1] == ' ':
                    space_pos = space_pos - 1
                if space_pos < 40:
                    # Fewer than 40 characters before the comment:
                    # Align comments at column 40.
                    line = line[:40] + line[comment_pos:]
                else:
                    # 40 or more characters before the comment:
                    # Keep one space between contents and comment.
                    line = line[:space_pos] + " " + line[comment_pos:]
            # Write the modified line.
            out_file.write(line)
            if "..FFFF" in orig_line and "..FFFC" in line:
                out_file.write("FFFE..FFFF >FFFD\n")
    return t
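
# Example (illustrative): the IdnaMappingTable.txt line
#     00A0          ; disallowed_STD3_mapped ; 0020
# becomes "00A0          >0020", while a deviation line such as
#     00DF          ; deviation  ; 0073 0073
# is commented out, because deviation characters are handled in code.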

# Preprocessing ------------------------------------------------------------ ***

_strip_re = re.compile("([0-9a-fA-F]+.+?) *#.*")
_code_point_re = re.compile(r"\s*([0-9a-fA-F]+)\s*;")

def CopyAndStripWithOptionalMerge(s, t, do_merge):
    # TODO: We do not seem to need the do_merge argument and logic any more.
    with open(s, "r") as in_file, open(t, "w") as out_file:
        first = -1  # First code point with first_data.
        last = -1  # Last code point with first_data.
        first_data = ""  # Common data for code points [first..last].
        for line in in_file:
            match = _strip_re.match(line)
            if match:
                line = match.group(1)
            else:
                line = line.rstrip()
            if do_merge:
                match = _code_point_re.match(line)
                if match:
                    c = int(match.group(1), 16)
                    data = line[match.end() - 1:]
                else:
                    c = -1
                    data = ""
                if last >= 0 and (c != (last + 1) or data != first_data):
                    # output the current range
                    if first == last:
                        out_file.write("%04X%s\n" % (first, first_data))
                    else:
                        out_file.write("%04X..%04X%s\n" % (first, last, first_data))
                    first = -1
                    last = -1
                    first_data = ""
                if c < 0:
                    # no data on this line, output as is
                    out_file.write(line)
                    out_file.write("\n")
                else:
                    # data on this line, store for possible range compaction
                    if last < 0:
                        # set as the first line in a possible range
                        first = c
                        last = c
                        first_data = data
                    else:
                        # must be c == (last + 1) and data == first_data
                        # because of previous conditions
                        # continue with the current range
                        last = c
            else:
                # Only strip, don't merge: just output the stripped line.
                out_file.write(line)
                out_file.write("\n")
        if do_merge and last >= 0:
            # output the last range in the file
            if first == last:
                out_file.write("%04X%s\n" % (first, first_data))
            else:
                out_file.write("%04X..%04X%s\n" % (first, last, first_data))
            first = -1
            last = -1
            first_data = ""
        out_file.flush()
    return t


def CopyAndStrip(s, t):
    """Copies a file and removes comments behind data lines but not in others."""
    return CopyAndStripWithOptionalMerge(s, t, False)


def CopyAndStripAndMerge(s, t):
    """Copies and strips a file and merges lines.

    Copies a file, removes comments, and
    merges lines with adjacent code point ranges and identical per-code point
    data lines into one line with range syntax.
    """
    return CopyAndStripWithOptionalMerge(s, t, True)


def CopyOnly(s, t):
    shutil.copy(s, t)
    return t


def DontCopy(s, t):
    return s
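
# Example (illustrative): with do_merge=True, adjacent stripped lines
#     0041;DATA
#     0042;DATA
# are merged into the single output line "0041..0042;DATA".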

# Each _files value is a
# (preprocessor, dest_folder, parser, order) tuple
# where all fields except the preprocessor are optional.
# After the initial preprocessing (copy/strip/merge),
# if a parser is specified, then a tuple is added to _files_to_parse
# at index "order" (default order 9).
# An explicit order number is set only for files that must be parsed
# before others.
_files = {
    "BidiBrackets.txt": (DontCopy, ParseBidiBrackets),
    "BidiMirroring.txt": (DontCopy, ParseBidiMirroring),
    "BidiTest.txt": (CopyOnly, "testdata"),
    "Blocks.txt": (DontCopy, ParseBlocks),
    "CaseFolding.txt": (CopyOnly, ParseCaseFolding),
    "DerivedAge.txt": (DontCopy, ParseDerivedAge),
    "DerivedBidiClass.txt": (DontCopy, ParseDerivedBidiClass),
    "DerivedCoreProperties.txt": (CopyAndStrip, ParseNamedProperties),
    "DerivedJoiningGroup.txt": (DontCopy, ParseDerivedJoiningGroup),
    "DerivedJoiningType.txt": (DontCopy, ParseDerivedJoiningType),
    "DerivedNormalizationProps.txt": (CopyAndStrip, ParseNamedProperties),
    "DerivedNumericValues.txt": (DontCopy, ParseDerivedNumericValues),
    "EastAsianWidth.txt": (DontCopy, ParseEastAsianWidth),
    "emoji-data.txt": (DontCopy, ParseNamedProperties),
    "GraphemeBreakProperty.txt": (DontCopy, ParseGraphemeBreakProperty),
    "GraphemeBreakTest-cldr.txt": (CopyOnly, "testdata"),
    "IdnaTestV2.txt": (CopyOnly, "testdata"),
    "IndicPositionalCategory.txt": (DontCopy, ParseIndicPositionalCategory),
    "IndicSyllabicCategory.txt": (DontCopy, ParseIndicSyllabicCategory),
    "LineBreak.txt": (DontCopy, ParseLineBreak),
    "LineBreakTest.txt": (CopyOnly, "testdata"),
    "NameAliases.txt": (DontCopy, ParseNameAliases),
    "NamesList.txt": (DontCopy, ParseNamesList),
    "NormalizationCorrections.txt": (CopyOnly,),  # Only used in gensprep.
    "NormalizationTest.txt": (CopyAndStrip,),
    "PropertyAliases.txt": (DontCopy, ParsePropertyAliases, 0),
    "PropertyValueAliases.txt": (DontCopy, ParsePropertyValueAliases, 1),
    "PropList.txt": (DontCopy, ParseNamedProperties),
    "SentenceBreakProperty.txt": (DontCopy, ParseSentenceBreak),
    "SentenceBreakTest.txt": (CopyOnly, "testdata"),
    "Scripts.txt": (DontCopy, ParseScripts),
    "ScriptExtensions.txt": (DontCopy, ParseScriptExtensions),
    "SpecialCasing.txt": (CopyOnly, ParseSpecialCasing),
    "UnicodeData.txt": (CopyOnly, ParseUnicodeData, 2),
    "VerticalOrientation.txt": (DontCopy, ParseVerticalOrientation),
    "WordBreakProperty.txt": (DontCopy, ParseWordBreak),
    "WordBreakTest.txt": (CopyOnly, "testdata"),
    # From www.unicode.org/Public/idna/<version>/
    "IdnaMappingTable.txt": (IdnaToUTS46TextFile, "norm2")
}

# List of lists of files to be parsed in order.
# Inner lists contain (basename, path, parser) tuples.
_files_to_parse = [[], [], [], [], [], [], [], [], [], []]

# Get the standard basename from a versioned filename.
# For example, match "UnicodeData-6.1.0d8.txt"
# so we can turn it into "UnicodeData.txt".
_file_version_re = re.compile("([a-zA-Z0-9_-]+)" +
                              "-[0-9]+(?:\\.[0-9]+)*(?:d[0-9]+)?" +
                              "(\\.[a-z]+)$")
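
# Example (illustrative): _file_version_re matches "UnicodeData-6.1.0d8.txt"
# with groups ("UnicodeData", ".txt"), so PreprocessFiles below renames the
# downloaded file to plain "UnicodeData.txt".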
def PreprocessFiles(source_files, icu4c_src_root):
    unidata_path = os.path.join(icu4c_src_root, "source", "data", "unidata")
    norm2_path = os.path.join(unidata_path, "norm2")
    testdata_path = os.path.join(icu4c_src_root, "source", "test", "testdata")
    folder_to_path = {
        "unidata": unidata_path,
        "norm2": norm2_path,
        "testdata": testdata_path
    }
    files_processed = set()
    for source_file in source_files:
        (folder, basename) = os.path.split(source_file)
        match = _file_version_re.match(basename)
        if match:
            new_basename = match.group(1) + match.group(2)
            if new_basename != basename:
                print("Removing version suffix from " + source_file)
                # ... so that we can easily compare UCD files.
                new_source_file = os.path.join(folder, new_basename)
                shutil.move(source_file, new_source_file)
                basename = new_basename
                source_file = new_source_file
        if basename in _files:
            print("Preprocessing %s" % basename)
            if basename in files_processed:
                raise Exception("duplicate file basename %s!" % basename)
            files_processed.add(basename)
            value = _files[basename]
            preprocessor = value[0]
            if len(value) >= 2 and isinstance(value[1], str):
                # The value was (preprocessor, dest_folder, ...); keep (...).
                dest_folder = value[1]
                value = value[2:]
            else:
                # The value was (preprocessor, ...); keep (...).
                dest_folder = "unidata"
                value = value[1:]
            dest_path = folder_to_path[dest_folder]
            if not os.path.exists(dest_path): os.makedirs(dest_path)
            dest_basename = basename
            # Source GraphemeBreakTest-cldr.txt --> destination GraphemeBreakTest.txt.
            if basename.endswith("-cldr.txt"):
                dest_basename = basename[:-9] + basename[-4:]
            dest_file = os.path.join(dest_path, dest_basename)
            parse_file = preprocessor(source_file, dest_file)
            if value:
                order = 9 if len(value) < 2 else value[1]
                _files_to_parse[order].append((basename, parse_file, value[0]))

# Character names ---------------------------------------------------------- ***

# TODO: Turn this script into a module that
# a) gives access to the parsed data
# b) has a PreparseUCD(ucd_root, icu4c_src_root) function
# c) has a ParsePreparsedUCD(filename) function
# d) has a WritePreparsedUCD(filename) function
# and then use it from a new script for names.
# Some more API:
# - generator GetRangesAndProps() -> (start, end, props)*

def IncCounter(counters, key, inc=1):
    if key in counters:
        counters[key] += inc
    else:
        counters[key] = inc


endings = (
    # List PHASE- before LETTER for BAMUM LETTER PHASE-xyz.
    "PHASE-",
    "LETTER ", "LIGATURE ", "CHARACTER ", "SYLLABLE ",
    "CHOSEONG ", "JUNGSEONG ", "JONGSEONG ",
    "SYLLABICS ", "IDEOGRAPH ", "IDEOGRAPH-", "IDEOGRAM ", "MONOGRAM ",
    "ACROPHONIC ", "HIEROGLYPH ",
    "DIGIT ", "NUMBER ", "NUMERAL ", "FRACTION ",
    "PUNCTUATION ", "SIGN ", "SYMBOL ",
    "TILE ", "CARD ", "FACE ",
    "ACCENT ", "POINT ",
    # List SIGN before VOWEL to catch "vowel sign".
    "VOWEL ", "TONE ", "RADICAL ",
    # For names of math symbols,
    # e.g., MATHEMATICAL BOLD ITALIC CAPITAL A.
    "SCRIPT ", "FRAKTUR ", "MONOSPACE ",
    "ITALIC ", "BOLD ", "DOUBLE-STRUCK ", "SANS-SERIF ",
    "INITIAL ", "TAILED ", "STRETCHED ", "LOOPED ",
    # BRAILLE PATTERN DOTS-xyz
    "DOTS-",
    "SELECTOR ", "SELECTOR-"
)

def SplitName(name, tokens):
    start = 0
    for e in endings:
        i = name.find(e)
        if i >= 0:
            start = i + len(e)
            token = name[:start]
            IncCounter(tokens, token)
            break
    for i in range(start, len(name)):
        c = name[i]
        if c == ' ' or c == '-':
            token = name[start:i + 1]
            IncCounter(tokens, token)
            start = i + 1
    IncCounter(tokens, name[start:])
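
# For illustration (not executed): SplitName() counts one prefix token that
# ends with the first match from `endings`, plus one token per following
# word, assuming an initially empty counters dict:
#     tokens = {}
#     SplitName("ARABIC LETTER ALEF", tokens)
#     # tokens == {"ARABIC LETTER ": 1, "ALEF": 1}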
def PrintNameStats():
    # TODO: This name analysis code is out of date.
    # It needs to consider the multi-type Name_Alias values.
    name_pnames = ("na", "na1", "Name_Alias")
    counts = {}
    for pname in name_pnames:
        counts[pname] = 0
    total_lengths = counts.copy()
    max_length = 0
    max_per_cp = 0
    name_chars = set()
    num_digits = 0
    token_counters = {}
    char_counters = {}
    for i in range(len(_starts) - 1):
        start = _starts[i]
        # end = _starts[i + 1] - 1
        props = _props[i]
        per_cp = 0
        for pname in name_pnames:
            if pname in props:
                counts[pname] += 1
                name = props[pname]
                total_lengths[pname] += len(name)
                name_chars |= set(name)
                if len(name) > max_length: max_length = len(name)
                per_cp += len(name) + 1
                if per_cp > max_per_cp: max_per_cp = per_cp
                SplitName(name, token_counters)
                for c in name:
                    if c in "0123456789": num_digits += 1
                    IncCounter(char_counters, c)
    print()
    for pname in name_pnames:
        print("'%s' character names: %d / %d bytes" %
              (pname, counts[pname], total_lengths[pname]))
    print("%d total bytes in character names" % sum(total_lengths.values()))
    print("%d name-characters: %s" %
          (len(name_chars), "".join(sorted(name_chars))))
    print("%d digits 0-9" % num_digits)
    count_chars = [(count, c) for (c, count) in char_counters.items()]
    count_chars.sort(reverse=True)
    for cc in count_chars:
        print("name-chars: %6d * '%s'" % cc)
    print("max. name length: %d" % max_length)
    print("max. length of all (names+NUL) per cp: %d" % max_per_cp)

    token_lengths = sum([len(t) + 1 for t in token_counters])
    print("%d total tokens, %d bytes with NUL" %
          (len(token_counters), token_lengths))

    counts_tokens = []
    for (token, count) in token_counters.items():
        # If we encode a token with a 1-byte code, then we save len(token)-1
        # bytes each time, but have to store the token string itself with a
        # length or terminator byte, plus a 2-byte entry in a token index table.
        savings = count * (len(token) - 1) - (len(token) + 1 + 2)
        if savings > 0:
            counts_tokens.append((savings, count, token))
    counts_tokens.sort(reverse=True)
    print("%d tokens might save space with 1-byte codes" % len(counts_tokens))

    # Codes=bytes: 40 byte values are reserved for name_chars.
    # That leaves 216 units for 1-byte tokens or lead bytes of 2-byte tokens.
    # Make each 2-byte token the token string index itself, rather than
    # an index into a string index table.
    # More lead bytes, but also more savings.
    num_units = 256
    max_lead = (token_lengths + 255) // 256
    max_token_units = num_units - len(name_chars)
    results = []
    for num_lead in range(min(max_lead, max_token_units) + 1):
        max1 = max_token_units - num_lead
        ct = counts_tokens[:max1]
        tokens1 = set([t for (s, c, t) in ct])
        for (token, count) in token_counters.items():
            if token in tokens1: continue
            # If we encode a token with a 2-byte code, then we save len(token)-2
            # bytes each time, but have to store the token string itself with a
            # length or terminator byte.
            savings = count * (len(token) - 2) - (len(token) + 1)
            if savings > 0:
                ct.append((savings, count, token))
        ct.sort(reverse=True)
        # A 2-byte-code-token index cannot be limit_t_lengths or higher.
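        # Worked example: with num_lead = 2 lead bytes, 2-byte codes can
        # address token-string start indexes 0..511 (2 * 256), so candidates
        # are cut off once their cumulative string storage reaches 512 bytes.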
        limit_t_lengths = num_lead * 256
        token2_index = 0
        for i in range(max1, len(ct)):
            if token2_index >= limit_t_lengths:
                del ct[i:]
                break
            token2_index += len(ct[i][2]) + 1
        cumul_savings = sum([s for (s, c, t) in ct])
        # print("%2d 1-byte codes: %4d tokens might save %6d bytes" %
        #       (max1, len(ct), cumul_savings))
        results.append((cumul_savings, max1, ct))
    best = max(results)  # (cumul_savings, max1, ct)

    max1 = best[1]
    print("maximum savings: %d bytes with %d 1-byte codes & %d lead bytes" %
          (best[0], max1, max_token_units - max1))
    counts_tokens = best[2]
    cumul_savings = 0
    for i in range(len(counts_tokens)):
        n = 1 if i < max1 else 2
        i1 = i + 1
        t = counts_tokens[i]
        cumul_savings += t[0]
        if i1 <= 250 or (i1 % 100) == 0 or i1 == len(counts_tokens):
            print(("%04d. cumul. %6d bytes save %6d bytes from " +
                   "%5d * %d-byte token for %2d='%s'") %
                  (i1, cumul_savings, t[0], t[1], n, len(t[2]), t[2]))

# ICU API ------------------------------------------------------------------ ***

# Sample line to match:
#     UCHAR_UNIFIED_IDEOGRAPH=29,
_uchar_re = re.compile(
    " *(UCHAR_[0-9A-Z_]+) *= *(?:[0-9]+|0x[0-9a-fA-F]+),")

# Sample line to match:
#     /** Zs @stable ICU 2.0 */
_gc_comment_re = re.compile(r" */\*\* *([A-Z][a-z]) ")

# Sample line to match:
#     U_SPACE_SEPARATOR = 12,
_gc_re = re.compile(" *(U_[A-Z_]+) *= *[0-9]+,")

# Sample line to match:
#     /** L @stable ICU 2.0 */
_bc_comment_re = re.compile(r" */\*\* *([A-Z]{1,3}) ")

# Sample line to match:
#     U_LEFT_TO_RIGHT = 0,
_bc_re = re.compile(" *(U_[A-Z_]+) *= *[0-9]+,")

# Sample line to match:
#     UBLOCK_CYRILLIC =9,
_ublock_re = re.compile(" *(UBLOCK_[0-9A-Z_]+) *= *[0-9]+,")

# Sample line to match:
#     U_EA_AMBIGUOUS,
_prop_and_value_re = re.compile(
    " *(U_(BPT|DT|EA|GCB|HST|INPC|INSC|LB|JG|JT|NT|SB|VO|WB)_([0-9A-Z_]+))")

# Sample line to match if it has matched _prop_and_value_re
# (we want to exclude aliases):
#     U_JG_HAMZA_ON_HEH_GOAL=U_JG_TEH_MARBUTA_GOAL,
_prop_and_alias_re = re.compile(" *U_[0-9A-Z_]+ *= *U")
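
# For illustration (not executed): on the sample line "    U_EA_AMBIGUOUS,",
# _prop_and_value_re yields group(2) == "EA" (the property) and
# group(3) == "AMBIGUOUS" (the value), while a match of _prop_and_alias_re
# flags lines such as "U_JG_HAMZA_ON_HEH_GOAL=U_JG_TEH_MARBUTA_GOAL,"
# as aliases to be skipped.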
def ParseUCharHeader(icu4c_src_root):
    uchar_path = os.path.join(icu4c_src_root, "source",
                              "common", "unicode", "uchar.h")
    with open(uchar_path, "r") as uchar_file:
        mode = ""  # Mode string (=pname) during context-sensitive parsing.
        comment_value = ""  # Property value from a comment preceding an enum.
        # Note: The enum UProperty is first in uchar.h, before the enums for values.
        for line in uchar_file:
            # Parse some enums via context-sensitive "modes".
            # Necessary because those enum constant names do not contain
            # enough information.
            if "enum UCharCategory" in line:
                mode = "gc"
                comment_value = ""
                continue
            if mode == "gc":
                if line.startswith("}"):
                    mode = ""
                    continue
                match = _gc_comment_re.match(line)
                if match:
                    comment_value = match.group(1)
                    continue
                match = _gc_re.match(line)
                if match and comment_value:
                    gc_enum = match.group(1)
                    prop = _properties["gc"]
                    vname = GetShortPropertyValueName(prop, comment_value)
                    icu_values = _pname_to_icu_prop["gc"][2]
                    icu_values.append((gc_enum, vname))
                    comment_value = ""
                continue
            if "enum UCharDirection {" in line:
                mode = "bc"
                comment_value = ""
                continue
            if mode == "bc":
                if line.startswith("}"):
                    mode = ""
                    continue
                match = _bc_comment_re.match(line)
                if match:
                    comment_value = match.group(1)
                    continue
                match = _bc_re.match(line)
                if match and comment_value:
                    bc_enum = match.group(1)
                    prop = _properties["bc"]
                    vname = GetShortPropertyValueName(prop, comment_value)
                    icu_values = _pname_to_icu_prop["bc"][2]
                    icu_values.append((bc_enum, vname))
                    comment_value = ""
                continue
            # No mode: parse enum constants whose names contain
            # enough information to parse them without context.
            match = _uchar_re.match(line)
            if match:
                prop_enum = match.group(1)
                if prop_enum.endswith("_LIMIT"):
                    # Ignore "UCHAR_BINARY_LIMIT=57," etc.
                    continue
                pname = GetShortPropertyName(prop_enum[6:])
                icu_prop = (prop_enum, pname, [])
                _icu_properties.append(icu_prop)
                _pname_to_icu_prop[pname] = icu_prop
                continue
            match = _ublock_re.match(line)
            if match:
                prop_enum = match.group(1)
                if prop_enum == "UBLOCK_COUNT":
                    continue
                prop = _properties["blk"]
                vname = GetShortPropertyValueName(prop, prop_enum[7:])
                icu_values = _pname_to_icu_prop["blk"][2]
                icu_values.append((prop_enum, vname))
                continue
            match = _prop_and_value_re.match(line)
            if match:
                (prop_enum, vname) = match.group(1, 3)
                if vname == "COUNT" or _prop_and_alias_re.match(line):
                    continue
                pname = GetShortPropertyName(match.group(2))
                prop = _properties[pname]
                vname = GetShortPropertyValueName(prop, vname)
                icu_values = _pname_to_icu_prop[pname][2]
                icu_values.append((prop_enum, vname))
    # ccc, lccc, tccc use their numeric values as "enum" values.
    # In the UCD data, these numeric values are the first value names,
    # followed by the short & long value names.
    # List the ccc values in numeric order.
    prop = _properties["ccc"]
    icu_values = _pname_to_icu_prop["ccc"][2]
    for ccc in sorted([int(name) for name in prop[2]]):
        icu_values.append((ccc, str(ccc)))
    _pname_to_icu_prop["lccc"][2].extend(icu_values)  # Copy ccc -> lccc.
    _pname_to_icu_prop["tccc"][2].extend(icu_values)  # Copy ccc -> tccc.

    # No need to parse the predictable General_Category_Mask enum constants.
    # Just define them in ASCII order.
    prop = _properties["gcm"]
    icu_values = _pname_to_icu_prop["gcm"][2]
    for vname in sorted(prop[2]):
        icu_values.append(("U_GC_" + vname.upper() + "_MASK", vname))
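    # For example, the gc value "Lu" produces the pair ("U_GC_LU_MASK", "Lu");
    # the mask constant names are predictable from the value names alone.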
    # Hardcode known values for the normalization quick check properties;
    # see unorm2.h for the UNormalizationCheckResult enum.
    icu_values = _pname_to_icu_prop["NFC_QC"][2]
    icu_values.append(("UNORM_NO", "N"))
    icu_values.append(("UNORM_YES", "Y"))
    icu_values.append(("UNORM_MAYBE", "M"))
    _pname_to_icu_prop["NFKC_QC"][2].extend(icu_values)  # Copy NFC -> NFKC.
    # No "maybe" values for NF[K]D.
    icu_values = _pname_to_icu_prop["NFD_QC"][2]
    icu_values.append(("UNORM_NO", "N"))
    icu_values.append(("UNORM_YES", "Y"))
    _pname_to_icu_prop["NFKD_QC"][2].extend(icu_values)  # Copy NFD -> NFKD.


# Sample line to match:
#     USCRIPT_LOMA = 139,/* Loma */
_uscript_re = re.compile(
    r" *(USCRIPT_[A-Z_]+) *= *[0-9]+ *, */\* *([A-Z][a-z]{3}) *\*/")

def ParseUScriptHeader(icu4c_src_root):
    uscript_path = os.path.join(icu4c_src_root, "source",
                                "common", "unicode", "uscript.h")
    icu_values = _pname_to_icu_prop["sc"][2]
    with open(uscript_path, "r") as uscript_file:
        for line in uscript_file:
            match = _uscript_re.match(line)
            if match:
                (script_enum, script_code) = match.group(1, 2)
                icu_values.append((script_enum, script_code))


def CheckPNamesData():
    """Checks that every ICU property has a full set of value enum constants,
    and that the _icu_properties value names map back to the UCD."""
    missing_enums = []
    for (p_enum, pname, values) in _icu_properties:
        prop = _properties[pname]
        vnames = set(prop[2])  # Modifiable copy of the set of short value names.
        for (v_enum, vname) in values:
            if vname not in vnames:
                raise ValueError("%s = %s (uchar.h %s) not in the UCD\n" %
                                 (pname, vname, v_enum))
            vnames.remove(vname)
        # Exceptions to the all-values check:
        # - ICU does not have specific enum values for binary No/Yes.
        # - ICU represents Age values via UVersionInfo rather than enum constants.
        # - gc: The ICU enum UCharCategory only has the single-category values.
        #   (ICU's gcm property has all of the UCD gc property values.)
        if vnames and not (prop[0] == "Binary" or pname in ("age", "gc")):
            missing_enums.append((pname, vnames))
    if missing_enums:
        raise ValueError(
            "missing uchar.h enum constants for some property values: %s" %
            missing_enums)


def WritePNamesDataHeader(out_path):
    with open(out_path, "w") as out_file:
        out_file.write("""// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/**
 * Copyright (C) 2002-2016, International Business Machines Corporation and
 * others. All Rights Reserved.
 *
 * machine-generated by: icu/tools/unicode/py/preparseucd.py
 */

""")

        # Note: The uchar.h & uscript.h parsers store the ICU Unicode properties
        # and values in the order of their definition,
        # and this function writes them in that order.
        # Since the ICU API constants are stable and new values are only
        # appended at the end
        # (new properties are added at the end of each binary/enum/... range),
        # the output is stable as well.
        # When a property or value constant is renamed,
        # it only changes the name itself in the output;
        # it does not move in the output since there is no sorting.
        # This minimizes diffs and assists with reviewing and evaluating updates.
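        # For example, a _ucd_version of "6.1.0" is padded to four fields
        # below and written as "#define UNICODE_VERSION { 6, 1, 0, 0 }".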
        version = _ucd_version.split('.')
        while len(version) < 4: version.append("0")
        out_file.write("#define UNICODE_VERSION { %s }\n\n" % ", ".join(version))

        # Count the maximum number of aliases for any property or value.
        # We write the final value at the end.
        max_aliases = max(len(_binary_values["N"]), len(_binary_values["Y"]))

        # Write an array of "binprop" Value object initializers
        # with the value aliases shared among all binary properties.
        out_file.write("static const Value VALUES_binprop[2] = {\n")
        out_file.write(' Value(0, "%s"),\n' % " ".join(_binary_values["N"]))
        out_file.write(' Value(1, "%s"),\n' % " ".join(_binary_values["Y"]))
        out_file.write("};\n\n")

        # For each property with named values, write an array of
        # Value object initializers with the value enum and the aliases.
        for (p_enum, pname, values) in _icu_properties:
            prop = _properties[pname]
            aliases = prop[1]
            if len(aliases) > max_aliases: max_aliases = len(aliases)
            if not values: continue
            out_file.write("static const Value VALUES_%s[%d] = {\n" %
                           (pname, len(values)))
            for (v_enum, vname) in values:
                aliases = _properties[pname][3][vname]
                # ccc, lccc, tccc: Omit the numeric strings from the aliases.
                # (See the comment about ccc in the PropertyValueAliases.txt header.)
                if pname.endswith("ccc"): aliases = aliases[1:]
                if len(aliases) > max_aliases: max_aliases = len(aliases)
                cast = "(int32_t)" if pname == "gcm" else ""
                out_file.write(' Value(%s%s, "%s"),\n' %
                               (cast, v_enum, " ".join(aliases)))
            out_file.write("};\n\n")

        # For each property, write a Property object initializer
        # with the property enum, its aliases, and a reference to its values.
        out_file.write("static const Property PROPERTIES[%d] = {\n" %
                       len(_icu_properties))
        for (enum, pname, values) in _icu_properties:
            prop = _properties[pname]
            aliases = " ".join(prop[1])
            if prop[0] == "Binary":
                out_file.write(' Property(%s, "%s"),\n' % (enum, aliases))
            elif values:  # Property with named values.
                out_file.write(' Property(%s, "%s", VALUES_%s, %d),\n' %
                               (enum, aliases, pname, len(values)))
            else:
                out_file.write(' Property(%s, "%s"),\n' % (enum, aliases))
        out_file.write("};\n\n")

        out_file.write("const int32_t MAX_ALIASES = %d;\n" % max_aliases)

# main() ------------------------------------------------------------------- ***

def main():
    global _null_or_defaults
    only_ppucd = False
    if len(sys.argv) == 3:
        (ucd_root, icu_src_root) = sys.argv[1:3]
        ppucd_path = None
    elif len(sys.argv) == 4 and sys.argv[2] == "--only_ppucd":
        # For debugging:
        # preparseucd.py path/to/UCD/root --only_ppucd path/to/ppucd/outputfile
        ucd_root = sys.argv[1]
        ppucd_path = sys.argv[3]
        only_ppucd = True
        icu_src_root = "/tmp/ppucd"
    else:
        print("Usage: %s path/to/UCD/root path/to/ICU/src/root" % sys.argv[0])
        return
    icu4c_src_root = os.path.join(icu_src_root, "icu4c")
    icu_tools_root = os.path.join(icu_src_root, "tools")
    source_files = []
    for root, dirs, files in os.walk(ucd_root):
        for file in files:
            source_files.append(os.path.join(root, file))
    PreprocessFiles(source_files, icu4c_src_root)
    # Parse the preprocessed files in a particular order.
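    # PropertyAliases.txt (order 0) and PropertyValueAliases.txt (order 1)
    # define the property & value names that the other parsers look up,
    # UnicodeData.txt (order 2) is parsed next, and the remaining files
    # use the default order 9.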
    for files in _files_to_parse:
        for (basename, path, parser) in files:
            print("Parsing %s" % basename)
            value = _files[basename]
            # Unicode data files are in UTF-8.
            charset = "UTF-8"
            if basename == "NamesList.txt":
                # NamesList.txt was encoded in Latin-1 before Unicode 6.2.
                numeric_ucd_version = [int(field) for field in _ucd_version.split('.')]
                if numeric_ucd_version < [6, 2]: charset = "ISO-8859-1"
            in_file = codecs.open(path, "r", charset)
            with in_file:
                parser(in_file)
    _null_or_defaults = _null_values.copy()
    _null_or_defaults.update(_defaults)
    # Every Catalog and Enumerated property must have a default value
    # from an @missing line. "nv" = "null value".
    pnv = [pname for (pname, nv) in _null_or_defaults.items() if nv == "??"]
    if pnv:
        raise Exception("no default values (@missing lines) for " +
                        "some Catalog or Enumerated properties: %s" % pnv)
    unidata_path = os.path.join(icu4c_src_root, "source", "data", "unidata")
    if not only_ppucd:
        # Write Normalizer2 input text files.
        # Do this before compacting the data so that we need not handle fallbacks.
        norm2_path = os.path.join(unidata_path, "norm2")
        if not os.path.exists(norm2_path): os.makedirs(norm2_path)
        WriteNorm2(norm2_path)
    # Optimize block vs. cp properties.
    CompactBlocks()
    # Write the ppucd.txt output file.
    # Use US-ASCII so that ICU tests can parse it in the platform charset,
    # which may be EBCDIC.
    # Fix up non-ASCII data (NamesList.txt headings) to fit.
    if not ppucd_path:
        ppucd_path = os.path.join(unidata_path, "ppucd.txt")
    with codecs.open(ppucd_path, "w", "US-ASCII") as out_file:
        WritePreparsedUCD(out_file)
        out_file.flush()

    # TODO: PrintNameStats()

    if only_ppucd: return

    # ICU data for the property & value names API.
    ParseUCharHeader(icu4c_src_root)
    ParseUScriptHeader(icu4c_src_root)
    CheckPNamesData()
    genprops_path = os.path.join(icu_tools_root, "unicode", "c", "genprops")
    if not os.path.exists(genprops_path): os.makedirs(genprops_path)
    out_path = os.path.join(genprops_path, "pnames_data.h")
    WritePNamesDataHeader(out_path)


if __name__ == "__main__":
    main()