1#!/bin/perl -w 2#******************************************************************* 3# COPYRIGHT: 4# Copyright (c) 2002-2006, International Business Machines Corporation and 5# others. All Rights Reserved. 6#******************************************************************* 7 8# This script reads in UCD files PropertyAliases.txt and 9# PropertyValueAliases.txt and correlates them with ICU enums 10# defined in uchar.h and uscript.h. It then outputs a header 11# file which contains all names and enums. The header is included 12# by the genpname tool C++ source file, which produces the actual 13# binary data file. 14# 15# See usage note below. 16# 17# TODO: The Property[Value]Alias.txt files state that they can support 18# more than 2 names per property|value. Currently (Unicode 3.2) there 19# are always 1 or 2 names. If more names were supported, presumably 20# the format would be something like: 21# nv ; Numeric_Value 22# nv ; Value_Numerique 23# CURRENTLY, this script assumes that there are 1 or two names. Any 24# duplicates it sees are flagged as an error. If multiple aliases 25# appear in a future version of Unicode, modify this script to support 26# that. 27# 28# NOTE: As of ICU 2.6, this script has been modified to know about the 29# pseudo-property gcm/General_Category_Mask, which corresponds to the 30# uchar.h property UCHAR_GENERAL_CATEGORY_MASK. This property 31# corresponds to General_Category but is a bitmask value. It does not 32# exist in the UCD. Therefore, I special case it in several places 33# (search for General_Category_Mask and gcm). 34# 35# NOTE: As of ICU 2.6, this script reads an auxiliary data file, 36# SyntheticPropertyAliases.txt, containing property aliases not 37# present in the UCD but present in ICU. This file resides in the 38# same directory as this script. Its contents are merged into those 39# of PropertyAliases.txt as if the two files were appended. 40# 41# NOTE: The following names are handled specially. See script below 42# for details. 43# 44# T/True 45# F/False 46# No_Block 47# 48# Author: Alan Liu 49# Created: October 14 2002 50# Since: ICU 2.4 51 52use FileHandle; 53use strict; 54use Dumpvalue; 55 56my $DEBUG = 1; 57my $DUMPER = new Dumpvalue; 58 59my $count = @ARGV; 60my $ICU_DIR = shift() || ''; 61my $OUT_FILE = shift() || 'data.h'; 62my $HEADER_DIR = "$ICU_DIR/source/common/unicode"; 63my $UNIDATA_DIR = "$ICU_DIR/source/data/unidata"; 64 65# Get the current year from the system 66my $YEAR = 1900+@{[localtime]}[5]; # Get the current year 67 68# Used to make "n/a" property [value] aliases (Unicode or Synthetic) unique 69my $propNA = 0; 70my $valueNA = 0; 71 72#---------------------------------------------------------------------- 73# Top level property keys for binary, enumerated, string, and double props 74my @TOP = qw( _bp _ep _sp _dp _mp ); 75 76# This hash governs how top level properties are grouped into output arrays. 77#my %TOP_PROPS = ( "VALUED" => [ '_bp', '_ep' ], 78# "NO_VALUE" => [ '_sp', '_dp' ] );m 79#my %TOP_PROPS = ( "BINARY" => [ '_bp' ], 80# "ENUMERATED" => [ '_ep' ], 81# "STRING" => [ '_sp' ], 82# "DOUBLE" => [ '_dp' ] ); 83my %TOP_PROPS = ( "" => [ '_bp', '_ep', '_sp', '_dp', '_mp' ] ); 84 85my %PROP_TYPE = (Binary => "_bp", 86 String => "_sp", 87 Double => "_dp", 88 Enumerated => "_ep", 89 Bitmask => "_mp"); 90#---------------------------------------------------------------------- 91 92# Properties that are unsupported in ICU 93my %UNSUPPORTED = (Composition_Exclusion => 1, 94 Decomposition_Mapping => 1, 95 Expands_On_NFC => 1, 96 Expands_On_NFD => 1, 97 Expands_On_NFKC => 1, 98 Expands_On_NFKD => 1, 99 FC_NFKC_Closure => 1, 100 ID_Start_Exceptions => 1, 101 Special_Case_Condition => 1, 102 ); 103 104# Short names of properties that weren't seen in uchar.h. If the 105# properties weren't seen, don't complain about the property values 106# missing. 107my %MISSING_FROM_UCHAR; 108 109# Additional property aliases beyond short and long names, 110# like space in addition to WSpace and White_Space in Unicode 4.1. 111# Hashtable, maps long name to alias. 112# For example, maps White_Space->space. 113# 114# If multiple additional aliases are defined, 115# then they are separated in the value string with '|'. 116# For example, White_Space->space|outer_space 117my %additional_property_aliases; 118 119#---------------------------------------------------------------------- 120 121# Emitted class names 122my ($STRING_CLASS, $ALIAS_CLASS, $PROPERTY_CLASS) = qw(AliasName Alias Property); 123 124if ($count < 1 || $count > 2 || 125 !-d $HEADER_DIR || 126 !-d $UNIDATA_DIR) { 127 my $me = $0; 128 $me =~ s|.+[/\\]||; 129 my $lm = ' ' x length($me); 130 print <<"END"; 131 132$me: Reads ICU4C headers and Unicode data files and creates 133$lm a C header file that is included by genpname. The header 134$lm file matches constants defined in the ICU4C headers with 135$lm property|value aliases in the Unicode data files. 136 137Usage: $me <icu_dir> [<out_file>] 138 139<icu_dir> ICU4C root directory, containing 140 source/common/unicode/uchar.h 141 source/common/unicode/uscript.h 142 source/data/unidata/Blocks.txt 143 source/data/unidata/PropertyAliases.txt 144 source/data/unidata/PropertyValueAliases.txt 145<out_file> File name of header to be written; 146 default is 'data.h'. 147 148The Unicode versions of all input files must match. 149END 150 exit(1); 151} 152 153my ($h, $version) = readAndMerge($HEADER_DIR, $UNIDATA_DIR); 154 155if ($DEBUG) { 156 print "Merged hash:\n"; 157 for my $key (sort keys %$h) { 158 my $hh = $h->{$key}; 159 for my $subkey (sort keys %$hh) { 160 print "$key:$subkey:", $hh->{$subkey}, "\n"; 161 } 162 } 163} 164 165my $out = new FileHandle($OUT_FILE, 'w'); 166die "Error: Can't write to $OUT_FILE: $!" unless (defined $out); 167my $save = select($out); 168formatData($h, $version); 169select($save); 170$out->close(); 171 172exit(0); 173 174#---------------------------------------------------------------------- 175# From PropList.html: "The properties of the form Other_XXX 176# are used to generate properties in DerivedCoreProperties.txt. 177# They are not intended for general use, such as in APIs that 178# return property values. 179# Non_Break is not a valid property as of 3.2. 180sub isIgnoredProperty { 181 local $_ = shift; 182 /^Other_/i || /^Non_Break$/i; 183} 184 185# 'qc' is a pseudo-property matching any quick-check property 186# see PropertyValueAliases.txt file comments. 'binprop' is 187# a synthetic binary value alias "True"/"False", not present 188# in PropertyValueAliases.txt. 189sub isPseudoProperty { 190 $_[0] eq 'qc' || 191 $_[0] eq 'binprop'; 192} 193 194#---------------------------------------------------------------------- 195# Emit the combined data from headers and the Unicode database as a 196# C source code header file. 197# 198# @param ref to hash with the data 199# @param Unicode version, as a string 200sub formatData { 201 my $h = shift; 202 my $version = shift; 203 204 my $date = scalar localtime(); 205 print <<"END"; 206/** 207 * Copyright (C) 2002-$YEAR, International Business Machines Corporation and 208 * others. All Rights Reserved. 209 * 210 * MACHINE GENERATED FILE. !!! Do not edit manually !!! 211 * 212 * Generated from 213 * uchar.h 214 * uscript.h 215 * Blocks.txt 216 * PropertyAliases.txt 217 * PropertyValueAliases.txt 218 * 219 * Date: $date 220 * Unicode version: $version 221 * Script: $0 222 */ 223 224END 225 226 #------------------------------------------------------------ 227 # Emit Unicode version 228 print "/* Unicode version $version */\n"; 229 my @v = split(/\./, $version); 230 push @v, '0' while (@v < 4); 231 for (my $i=0; $i<@v; ++$i) { 232 print "const uint8_t VERSION_$i = $v[$i];\n"; 233 } 234 print "\n"; 235 236 #------------------------------------------------------------ 237 # Emit String table 238 # [A table of all identifiers, that is, all long or short property 239 # or value names. The list need NOT be sorted; it will be sorted 240 # by the C program. Strings are referenced by their index into 241 # this table. After sorting, a REMAP[] array is used to map the 242 # old position indices to the new positions.] 243 my %strings; 244 for my $prop (sort keys %$h) { 245 my $hh = $h->{$prop}; 246 for my $enum (sort keys %$hh) { 247 my @a = split(/\|/, $hh->{$enum}); 248 for (@a) { 249 $strings{$_} = 1 if (length($_)); 250 } 251 } 252 } 253 my @strings = sort keys %strings; 254 unshift @strings, ""; 255 256 print "const int32_t STRING_COUNT = ", scalar @strings, ";\n\n"; 257 258 # while printing, create a mapping hash from string table entry to index 259 my %stringToID; 260 print "/* to be sorted */\n"; 261 print "const $STRING_CLASS STRING_TABLE[] = {\n"; 262 for (my $i=0; $i<@strings; ++$i) { 263 print " $STRING_CLASS(\"$strings[$i]\", $i),\n"; 264 $stringToID{$strings[$i]} = $i; 265 } 266 print "};\n\n"; 267 268 # placeholder for the remapping index. this is used to map 269 # indices that we compute here to indices of the sorted 270 # STRING_TABLE. STRING_TABLE will be sorted by the C++ program 271 # using the uprv_comparePropertyNames() function. this will 272 # reshuffle the order. we then use the indices (passed to the 273 # String constructor) to create a REMAP[] array. 274 print "/* to be filled in */\n"; 275 print "int32_t REMAP[", scalar @strings, "];\n\n"; 276 277 #------------------------------------------------------------ 278 # Emit the name group table 279 # [A table of name groups. A name group is one or more names 280 # for a property or property value. The Unicode data files specify 281 # that there may be more than 2, although as of Unicode 3.2 there 282 # are at most 2. The name group table looks like this: 283 # 284 # 114, -115, 116, -117, 0, -118, 65, -64, ... 285 # [0] [2] [4] [6] 286 # 287 # The entry at [0] consists of 2 strings, 114 and 115. 288 # The entry at [2] consists of 116 and 117. The entry at 289 # [4] is one string, 118. There is always at least one 290 # string; typically there are two. If there are two, the first 291 # is the SHORT name and the second is the LONG. If there is 292 # one, then the missing entry (always the short name, in 3.2) 293 # is zero, which is by definition the index of "". The 294 # 'preferred' name will generally be the LONG name, if there are 295 # more than 2 entries. The last entry is negative. 296 297 # Build name group list and replace string refs with nameGroup indices 298 my @nameGroups; 299 300 # Check for duplicate name groups, and reuse them if possible 301 my %groupToInt; # Map group strings to ints 302 for my $prop (sort keys %$h) { 303 my $hh = $h->{$prop}; 304 for my $enum (sort keys %$hh) { 305 my $groupString = $hh->{$enum}; 306 my $i; 307 if (exists $groupToInt{$groupString}) { 308 $i = $groupToInt{$groupString}; 309 } else { 310 my @names = split(/\|/, $groupString); 311 die "Error: Wrong number of names in " . $groupString if (@names < 1); 312 $i = @nameGroups; # index of group we are making 313 $groupToInt{$groupString} = $i; # Cache for reuse 314 push @nameGroups, map { $stringToID{$_} } @names; 315 $nameGroups[$#nameGroups] = -$nameGroups[$#nameGroups]; # mark end 316 } 317 # now, replace string list with ref to name group 318 $hh->{$enum} = $i; 319 } 320 } 321 322 print "const int32_t NAME_GROUP_COUNT = ", 323 scalar @nameGroups, ";\n\n"; 324 325 print "int32_t NAME_GROUP[] = {\n"; 326 # emit one group per line, with annotations 327 my $max_names = 0; 328 for (my $i=0; $i<@nameGroups; ) { 329 my @a; 330 my $line; 331 my $start = $i; 332 for (;;) { 333 my $j = $nameGroups[$i++]; 334 $line .= "$j, "; 335 push @a, abs($j); 336 last if ($j < 0); 337 } 338 print " ", 339 $line, 340 ' 'x(20-length($line)), 341 "/* ", sprintf("%3d", $start), 342 ": \"", join("\", \"", map { $strings[$_] } @a), "\" */\n"; 343 $max_names = @a if(@a > $max_names); 344 345 } 346 print "};\n\n"; 347 348 # This is fixed for 3.2 at "2" but should be calculated dynamically 349 # when more than 2 names appear in Property[Value]Aliases.txt. 350 print "#define MAX_NAMES_PER_GROUP $max_names\n\n"; 351 352 #------------------------------------------------------------ 353 # Emit enumerated property values 354 for my $prop (sort keys %$h) { 355 next if ($prop =~ /^_/); 356 my $vh = $h->{$prop}; 357 my $count = scalar keys %$vh; 358 359 print "const int32_t VALUES_${prop}_COUNT = ", 360 $count, ";\n\n"; 361 362 print "const $ALIAS_CLASS VALUES_${prop}\[] = {\n"; 363 for my $enum (sort keys %$vh) { 364 #my @names = split(/\|/, $vh->{$enum}); 365 #die "Error: Wrong number of names for $prop:$enum in [" . join(",", @names) . "]" 366 # if (@names != 2); 367 print " $ALIAS_CLASS((int32_t) $enum, ", $vh->{$enum}, "),\n"; 368 #$stringToID{$names[0]}, ", ", 369 #$stringToID{$names[1]}, "),\n"; 370 # "\"", $names[0], "\", ", 371 # "\"", $names[1], "\"),\n"; 372 } 373 print "};\n\n"; 374 } 375 376 #------------------------------------------------------------ 377 # Emit top-level properties (binary, enumerated, etc.) 378 for my $topName (sort keys %TOP_PROPS) { 379 my $a = $TOP_PROPS{$topName}; 380 my $count = 0; 381 for my $type (@$a) { # "_bp", "_ep", etc. 382 $count += scalar keys %{$h->{$type}}; 383 } 384 385 print "const int32_t ${topName}PROPERTY_COUNT = $count;\n\n"; 386 387 print "const $PROPERTY_CLASS ${topName}PROPERTY[] = {\n"; 388 389 for my $type (@$a) { # "_bp", "_ep", etc. 390 my $p = $h->{$type}; 391 392 for my $enum (sort keys %$p) { 393 my $name = $strings[$nameGroups[$p->{$enum}]]; 394 395 my $valueRef = "0, NULL"; 396 if ($type eq '_bp') { 397 $valueRef = "VALUES_binprop_COUNT, VALUES_binprop"; 398 } 399 elsif (exists $h->{$name}) { 400 $valueRef = "VALUES_${name}_COUNT, VALUES_$name"; 401 } 402 403 print " $PROPERTY_CLASS((int32_t) $enum, ", 404 $p->{$enum}, ", $valueRef),\n"; 405 } 406 } 407 print "};\n\n"; 408 } 409 410 print "/*eof*/\n"; 411} 412 413#---------------------------------------------------------------------- 414# Read in the files uchar.h, uscript.h, Blocks.txt, 415# PropertyAliases.txt, and PropertyValueAliases.txt, 416# and combine them into one hash. 417# 418# @param directory containing headers 419# @param directory containin Unicode data files 420# 421# @return hash ref, Unicode version 422sub readAndMerge { 423 424 my ($headerDir, $unidataDir) = @_; 425 426 my $h = read_uchar("$headerDir/uchar.h"); 427 my $s = read_uscript("$headerDir/uscript.h"); 428 my $b = read_Blocks("$unidataDir/Blocks.txt"); 429 my $pa = {}; 430 read_PropertyAliases($pa, "$unidataDir/PropertyAliases.txt"); 431 read_PropertyAliases($pa, "SyntheticPropertyAliases.txt"); 432 my $va = {}; 433 read_PropertyValueAliases($va, "$unidataDir/PropertyValueAliases.txt"); 434 read_PropertyValueAliases($va, "SyntheticPropertyValueAliases.txt"); 435 436 # Extract property family hash 437 my $fam = $pa->{'_family'}; 438 delete $pa->{'_family'}; 439 440 # Note: uscript.h has no version string, so don't check it 441 my $version = check_versions([ 'uchar.h', $h ], 442 [ 'Blocks.txt', $b ], 443 [ 'PropertyAliases.txt', $pa ], 444 [ 'PropertyValueAliases.txt', $va ]); 445 446 # Do this BEFORE merging; merging modifies the hashes 447 check_PropertyValueAliases($pa, $va); 448 449 # Dump out the $va hash for debugging 450 if ($DEBUG) { 451 print "Property values hash:\n"; 452 for my $key (sort keys %$va) { 453 my $hh = $va->{$key}; 454 for my $subkey (sort keys %$hh) { 455 print "$key:$subkey:", $hh->{$subkey}, "\n"; 456 } 457 } 458 } 459 460 # Dump out the $s hash for debugging 461 if ($DEBUG) { 462 print "Script hash:\n"; 463 for my $key (sort keys %$s) { 464 print "$key:", $s->{$key}, "\n"; 465 } 466 } 467 468 # Link in the script data 469 $h->{'sc'} = $s; 470 471 merge_Blocks($h, $b); 472 473 merge_PropertyAliases($h, $pa, $fam); 474 475 merge_PropertyValueAliases($h, $va); 476 477 ($h, $version); 478} 479 480#---------------------------------------------------------------------- 481# Ensure that the version strings in the given hashes (under the key 482# '_version') are compatible. Currently this means they must be 483# identical, with the exception that "X.Y" will match "X.Y.0". 484# All hashes must define the key '_version'. 485# 486# @param a list of pairs of (file name, hash reference) 487# 488# @return the version of all the hashes. Upon return, the '_version' 489# will be removed from all hashes. 490sub check_versions { 491 my $version = ''; 492 my $msg = ''; 493 foreach my $a (@_) { 494 my $name = $a->[0]; 495 my $h = $a->[1]; 496 die "Error: No version found" unless (exists $h->{'_version'}); 497 my $v = $h->{'_version'}; 498 delete $h->{'_version'}; 499 500 # append ".0" if necessary, to standardize to X.Y.Z 501 $v .= '.0' unless ($v =~ /\.\d+\./); 502 $v .= '.0' unless ($v =~ /\.\d+\./); 503 $msg .= "$name = $v\n"; 504 if ($version) { 505 die "Error: Mismatched Unicode versions\n$msg" 506 unless ($version eq $v); 507 } else { 508 $version = $v; 509 } 510 } 511 $version; 512} 513 514#---------------------------------------------------------------------- 515# Make sure the property names in PropertyValueAliases.txt match those 516# in PropertyAliases.txt. 517# 518# @param a hash ref from read_PropertyAliases. 519# @param a hash ref from read_PropertyValueAliases. 520sub check_PropertyValueAliases { 521 my ($pa, $va) = @_; 522 523 # make a reverse hash of short->long 524 my %rev; 525 for (keys %$pa) { $rev{$pa->{$_}} = $_; } 526 527 for my $prop (keys %$va) { 528 if (!exists $rev{$prop} && !isPseudoProperty($prop)) { 529 print "Warning: Property $prop from PropertyValueAliases not listed in PropertyAliases\n"; 530 } 531 } 532} 533 534#---------------------------------------------------------------------- 535# Merge blocks data into uchar.h enum data. In the 'blk' subhash all 536# code point values, as returned from read_uchar, are replaced by 537# block names, as read from Blocks.txt and returned by read_Blocks. 538# The match must be 1-to-1. If there is any failure of 1-to-1 539# mapping, an error is signaled. Upon return, the read_Blocks hash 540# is emptied of all contents, except for those that failed to match. 541# 542# The mapping in the 'blk' subhash, after this function returns, is 543# from uchar.h enum name, e.g. "UBLOCK_BASIC_LATIN", to Blocks.h 544# pseudo-name, e.g. "Basic Latin". 545# 546# @param a hash ref from read_uchar. 547# @param a hash ref from read_Blocks. 548sub merge_Blocks { 549 my ($h, $b) = @_; 550 551 die "Error: No blocks data in uchar.h" 552 unless (exists $h->{'blk'}); 553 my $blk = $h->{'blk'}; 554 for my $enum (keys %$blk) { 555 my $cp = $blk->{$enum}; 556 if ($cp && !exists $b->{$cp}) { 557 die "Error: No block found at $cp in Blocks.txt"; 558 } 559 # Convert code point to pseudo-name: 560 $blk->{$enum} = $b->{$cp}; 561 delete $b->{$cp}; 562 } 563 my $err = ''; 564 for my $cp (keys %$b) { 565 $err .= "Error: Block " . $b->{$cp} . " not listed in uchar.h\n"; 566 } 567 die $err if ($err); 568} 569 570#---------------------------------------------------------------------- 571# Merge property alias names into the uchar.h hash. The subhashes 572# under the keys _* (b(inary, e(numerated, s(tring, d(ouble) are 573# examined and the values of those subhashes are assumed to be long 574# names in PropertyAliases.txt. They are validated and replaced by 575# "<short>|<long>". Upon return, the read_PropertyAliases hash is 576# emptied of all contents, except for those that failed to match. 577# Unmatched names in PropertyAliases are listed as a warning but do 578# NOT cause the script to die. 579# 580# @param a hash ref from read_uchar. 581# @param a hash ref from read_PropertyAliases. 582# @param a hash mapping long names to property family (e.g., 'binary') 583sub merge_PropertyAliases { 584 my ($h, $pa, $fam) = @_; 585 586 for my $k (@TOP) { 587 die "Error: No properties data for $k in uchar.h" 588 unless (exists $h->{$k}); 589 } 590 591 for my $subh (map { $h->{$_} } @TOP) { 592 for my $enum (keys %$subh) { 593 my $long_name = $subh->{$enum}; 594 if (!exists $pa->{$long_name}) { 595 die "Error: Property $long_name not found (or used more than once)"; 596 } 597 598 my $value; 599 if($pa->{$long_name} =~ m|^n/a\d*$|) { 600 # replace an "n/a" short name with an empty name (nothing before "|"); 601 # don't remove it (don't remove the "|"): there must always be a long name, 602 # and if the short name is removed, then the long name becomes the 603 # short name and there is no long name left (unless there is another alias) 604 $value = "|" . $long_name; 605 } else { 606 $value = $pa->{$long_name} . "|" . $long_name; 607 } 608 if (exists $additional_property_aliases{$long_name}) { 609 $value .= "|" . $additional_property_aliases{$long_name}; 610 } 611 $subh->{$enum} = $value; 612 delete $pa->{$long_name}; 613 } 614 } 615 616 my @err; 617 for my $name (keys %$pa) { 618 $MISSING_FROM_UCHAR{$pa->{$name}} = 1; 619 if (exists $UNSUPPORTED{$name}) { 620 push @err, "Info: No enum for " . $fam->{$name} . " property $name in uchar.h"; 621 } elsif (!isIgnoredProperty($name)) { 622 push @err, "Warning: No enum for " . $fam->{$name} . " property $name in uchar.h"; 623 } 624 } 625 print join("\n", sort @err), "\n" if (@err); 626} 627 628#---------------------------------------------------------------------- 629# Return 1 if two names match ignoring whitespace, '-', and '_'. 630# Used to match names in Blocks.txt with those in PropertyValueAliases.txt 631# as of Unicode 4.0. 632sub matchesLoosely { 633 my ($a, $b) = @_; 634 $a =~ s/[\s\-_]//g; 635 $b =~ s/[\s\-_]//g; 636 $a =~ /^$b$/i; 637} 638 639#---------------------------------------------------------------------- 640# Merge PropertyValueAliases.txt data into the uchar.h hash. All 641# properties other than blk, _bp, and _ep are analyzed and mapped to 642# the names listed in PropertyValueAliases. They are then replaced 643# with a string of the form "<short>|<long>". The short or long name 644# may be missing. 645# 646# @param a hash ref from read_uchar. 647# @param a hash ref from read_PropertyValueAliases. 648sub merge_PropertyValueAliases { 649 my ($h, $va) = @_; 650 651 my %gcCount; 652 for my $prop (keys %$h) { 653 # _bp, _ep handled in merge_PropertyAliases 654 next if ($prop =~ /^_/); 655 656 # Special case: gcm 657 my $prop2 = ($prop eq 'gcm') ? 'gc' : $prop; 658 659 # find corresponding PropertyValueAliases data 660 die "Error: Can't find $prop in PropertyValueAliases.txt" 661 unless (exists $va->{$prop2}); 662 my $pva = $va->{$prop2}; 663 664 # match up data 665 my $hh = $h->{$prop}; 666 for my $enum (keys %$hh) { 667 668 my $name = $hh->{$enum}; 669 670 # look up both long and short & ignore case 671 my $n; 672 if (exists $pva->{$name}) { 673 $n = $name; 674 } else { 675 # iterate (slow) 676 for my $a (keys %$pva) { 677 # case-insensitive match 678 # & case-insensitive reverse match 679 if ($a =~ /^$name$/i || 680 $pva->{$a} =~ /^$name$/i) { 681 $n = $a; 682 last; 683 } 684 } 685 } 686 687 # For blocks, do a loose match from Blocks.txt pseudo-name 688 # to PropertyValueAliases long name. 689 if (!$n && $prop eq 'blk') { 690 for my $a (keys %$pva) { 691 # The block is only going to match the long name, 692 # but we check both for completeness. As of Unicode 693 # 4.0, blocks do not have short names. 694 if (matchesLoosely($name, $pva->{$a}) || 695 matchesLoosely($name, $a)) { 696 $n = $a; 697 last; 698 } 699 } 700 } 701 702 die "Error: Property value $prop:$name not found" unless ($n); 703 704 my $l = $n; 705 my $r = $pva->{$n}; 706 # convert |n/a\d*| to blank 707 $l = '' if ($l =~ m|^n/a\d*$|); 708 $r = '' if ($r =~ m|^n/a\d*$|); 709 710 $hh->{$enum} = "$l|$r"; 711 # Don't delete the 'gc' properties because we need to share 712 # them between 'gc' and 'gcm'. Count each use instead. 713 if ($prop2 eq 'gc') { 714 ++$gcCount{$n}; 715 } else { 716 delete $pva->{$n}; 717 } 718 } 719 } 720 721 # Merge the combining class values in manually 722 # Add the same values to the synthetic lccc and tccc properties 723 die "Error: No ccc data" 724 unless exists $va->{'ccc'}; 725 for my $ccc (keys %{$va->{'ccc'}}) { 726 die "Error: Can't overwrite ccc $ccc" 727 if (exists $h->{'ccc'}->{$ccc}); 728 $h->{'lccc'}->{$ccc} = 729 $h->{'tccc'}->{$ccc} = 730 $h->{'ccc'}->{$ccc} = $va->{'ccc'}->{$ccc}; 731 } 732 delete $va->{'ccc'}; 733 734 # Merge synthetic binary property values in manually. 735 # These are the "True" and "False" value aliases. 736 die "Error: No True/False value aliases" 737 unless exists $va->{'binprop'}; 738 for my $bp (keys %{$va->{'binprop'}}) { 739 $h->{'binprop'}->{$bp} = $va->{'binprop'}->{$bp}; 740 } 741 delete $va->{'binprop'}; 742 743 my $err = ''; 744 for my $prop (sort keys %$va) { 745 my $hh = $va->{$prop}; 746 for my $subkey (sort keys %$hh) { 747 # 'gc' props are shared with 'gcm'; make sure they were used 748 # once or twice. 749 if ($prop eq 'gc') { 750 my $n = $gcCount{$subkey}; 751 next if ($n >= 1 && $n <= 2); 752 } 753 $err .= "Warning: Enum for value $prop:$subkey not found in uchar.h\n" 754 unless exists $MISSING_FROM_UCHAR{$prop}; 755 } 756 } 757 print $err if ($err); 758} 759 760#---------------------------------------------------------------------- 761# Read the PropertyAliases.txt file. Return a hash that maps the long 762# name to the short name. The special key '_version' will map to the 763# Unicode version of the file. The special key '_family' holds a 764# subhash that maps long names to a family string, for descriptive 765# purposes. 766# 767# @param a filename for PropertyAliases.txt 768# @param reference to hash to receive data. Keys are long names. 769# Values are short names. 770sub read_PropertyAliases { 771 772 my $hash = shift; # result 773 774 my $filename = shift; 775 776 my $fam = {}; # map long names to family string 777 $fam = $hash->{'_family'} if (exists $hash->{'_family'}); 778 779 my $family; # binary, enumerated, etc. 780 781 my $in = new FileHandle($filename, 'r'); 782 die "Error: Cannot open $filename" if (!defined $in); 783 784 while (<$in>) { 785 786 # Read version (embedded in a comment) 787 if (/PropertyAliases-(\d+\.\d+\.\d+)/i) { 788 die "Error: Multiple versions in $filename" 789 if (exists $hash->{'_version'}); 790 $hash->{'_version'} = $1; 791 } 792 793 # Read family heading 794 if (/^\s*\#\s*(.+?)\s*Properties\s*$/) { 795 $family = $1; 796 } 797 798 # Ignore comments and blank lines 799 s/\#.*//; 800 next unless (/\S/); 801 802 if (/^\s*(.+?)\s*;/) { 803 my $short = $1; 804 my @fields = /;\s*([^\s;]+)/g; 805 if (@fields < 1 || @fields > 2) { 806 my $number = @fields; 807 die "Error: Wrong number of fields ($number) in $filename at $_"; 808 } 809 810 # Make "n/a" strings unique 811 if ($short eq 'n/a') { 812 $short .= sprintf("%03d", $propNA++); 813 } 814 my $long = $fields[0]; 815 if ($long eq 'n/a') { 816 $long .= sprintf("%03d", $propNA++); 817 } 818 819 # Add long name->short name to the hash=pa hash table 820 if (exists $hash->{$long}) { 821 die "Error: Duplicate property $long in $filename" 822 } 823 $hash->{$long} = $short; 824 $fam->{$long} = $family; 825 826 # Add the list of further aliases to the additional_property_aliases hash table, 827 # using the long property name as the key. 828 # For example: 829 # White_Space->space|outer_space 830 if (@fields > 1) { 831 my $value = pop @fields; 832 while (@fields > 1) { 833 $value .= "|" . pop @fields; 834 } 835 $additional_property_aliases{$long} = $value; 836 } 837 } else { 838 die "Error: Can't parse $_ in $filename"; 839 } 840 } 841 842 $in->close(); 843 844 $hash->{'_family'} = $fam; 845} 846 847#---------------------------------------------------------------------- 848# Read the PropertyValueAliases.txt file. Return a two level hash 849# that maps property_short_name:value_short_name:value_long_name. In 850# the case of the 'ccc' property, the short name is the numeric class 851# and the long name is "<short>|<long>". The special key '_version' 852# will map to the Unicode version of the file. 853# 854# @param a filename for PropertyValueAliases.txt 855# 856# @return a hash reference. 857sub read_PropertyValueAliases { 858 859 my $hash = shift; # result 860 861 my $filename = shift; 862 863 my $in = new FileHandle($filename, 'r'); 864 die "Error: Cannot open $filename" if (!defined $in); 865 866 while (<$in>) { 867 868 # Read version (embedded in a comment) 869 if (/PropertyValueAliases-(\d+\.\d+\.\d+)/i) { 870 die "Error: Multiple versions in $filename" 871 if (exists $hash->{'_version'}); 872 $hash->{'_version'} = $1; 873 } 874 875 # Ignore comments and blank lines 876 s/\#.*//; 877 next unless (/\S/); 878 879 if (/^\s*(.+?)\s*;/i) { 880 my $prop = $1; 881 my @fields = /;\s*([^\s;]+)/g; 882 die "Error: Wrong number of fields in $filename" 883 if (@fields < 2 || @fields > 3); 884 # Make "n/a" strings unique 885 $fields[0] .= sprintf("%03d", $valueNA++) if ($fields[0] eq 'n/a'); 886 # Squash extra fields together 887 while (@fields > 2) { 888 my $f = pop @fields; 889 $fields[$#fields] .= '|' . $f; 890 } 891 addDatum($hash, $prop, @fields); 892 } 893 894 else { 895 die "Error: Can't parse $_ in $filename"; 896 } 897 } 898 899 $in->close(); 900 901 # Script Copt=Qaac (Coptic) is a special case. 902 # Before the Copt code was defined, the private-use code Qaac was used. 903 # Starting with Unicode 4.1, PropertyValueAliases.txt contains 904 # Copt as the short name as well as Qaac as an alias. 905 # For use with older Unicode data files, we add here a Qaac->Coptic entry. 906 # This should not do anything for 4.1-and-later Unicode data files. 907 # See also UAX #24: Script Names http://www.unicode.org/unicode/reports/tr24/ 908 $hash->{'sc'}->{'Qaac'} = 'Coptic' 909 unless (exists $hash->{'sc'}->{'Qaac'} || exists $hash->{'sc'}->{'Copt'}); 910 911 # Add T|True and F|False -- these are values we recognize for 912 # binary properties (NOT from PropertyValueAliases.txt). These 913 # are of the same form as the 'ccc' value aliases. 914 $hash->{'binprop'}->{'0'} = 'F|False'; 915 $hash->{'binprop'}->{'1'} = 'T|True'; 916} 917 918#---------------------------------------------------------------------- 919# Read the Blocks.txt file. Return a hash that maps the code point 920# range start to the block name. The special key '_version' will map 921# to the Unicode version of the file. 922# 923# As of Unicode 4.0, the names in the Blocks.txt are no longer the 924# proper names. The proper names are now listed in PropertyValueAliases. 925# They are similar but not identical. Furthermore, 4.0 introduces 926# a new block name, No_Block, which is listed only in PropertyValueAliases 927# and not in Blocks.txt. As a result, we handle blocks as follows: 928# 929# 1. Read Blocks.txt to map code point range start to quasi-block name. 930# 2. Add to Blocks.txt a synthetic No Block code point & name: 931# X -> No Block 932# 3. Map quasi-names from Blocks.txt (including No Block) to actual 933# names from PropertyValueAliases. This occurs in 934# merge_PropertyValueAliases. 935# 936# @param a filename for Blocks.txt 937# 938# @return a ref to a hash. Keys are code points, as text, e.g., 939# "1720". Values are pseudo-block names, e.g., "Hanunoo". 940sub read_Blocks { 941 942 my $filename = shift; 943 944 my $hash = {}; # result 945 946 my $in = new FileHandle($filename, 'r'); 947 die "Error: Cannot open $filename" if (!defined $in); 948 949 while (<$in>) { 950 951 # Read version (embedded in a comment) 952 if (/Blocks-(\d+\.\d+\.\d+)/i) { 953 die "Error: Multiple versions in $filename" 954 if (exists $hash->{'_version'}); 955 $hash->{'_version'} = $1; 956 } 957 958 # Ignore comments and blank lines 959 s/\#.*//; 960 next unless (/\S/); 961 962 if (/^([0-9a-f]+)\.\.[0-9a-f]+\s*;\s*(.+?)\s*$/i) { 963 die "Error: Duplicate range $1 in $filename" 964 if (exists $hash->{$1}); 965 $hash->{$1} = $2; 966 } 967 968 else { 969 die "Error: Can't parse $_ in $filename"; 970 } 971 } 972 973 $in->close(); 974 975 # Add pseudo-name for No Block 976 $hash->{'none'} = 'No Block'; 977 978 $hash; 979} 980 981#---------------------------------------------------------------------- 982# Read the uscript.h file and compile a mapping of Unicode symbols to 983# icu4c enum values. 984# 985# @param a filename for uscript.h 986# 987# @return a ref to a hash. The keys of the hash are enum symbols from 988# uscript.h, and the values are script names. 989sub read_uscript { 990 991 my $filename = shift; 992 993 my $mode = ''; # state machine mode and submode 994 my $submode = ''; 995 996 my $last = ''; # for line folding 997 998 my $hash = {}; # result 999 my $key; # first-level key 1000 1001 my $in = new FileHandle($filename, 'r'); 1002 die "Error: Cannot open $filename" if (!defined $in); 1003 1004 while (<$in>) { 1005 # Fold continued lines together 1006 if (/^(.*)\\$/) { 1007 $last = $1; 1008 next; 1009 } elsif ($last) { 1010 $_ = $last . $_; 1011 $last = ''; 1012 } 1013 1014 # Exit all modes here 1015 if ($mode && $mode ne 'DEPRECATED') { 1016 if (/^\s*\}/) { 1017 $mode = ''; 1018 next; 1019 } 1020 } 1021 1022 # Handle individual modes 1023 1024 if ($mode eq 'UScriptCode') { 1025 if (m|^\s*(USCRIPT_\w+).+?/\*\s*(\w+)|) { 1026 my ($enum, $code) = ($1, $2); 1027 die "Error: Duplicate script $enum" 1028 if (exists $hash->{$enum}); 1029 $hash->{$enum} = $code; 1030 } 1031 } 1032 1033 elsif ($mode eq 'DEPRECATED') { 1034 if (/\s*\#ifdef/) { 1035 die "Error: Nested #ifdef"; 1036 } 1037 elsif (/\s*\#endif/) { 1038 $mode = ''; 1039 } 1040 } 1041 1042 elsif (!$mode) { 1043 if (/^\s*typedef\s+enum\s+(\w+)\s*\{/ || 1044 /^\s*typedef\s+enum\s+(\w+)\s*$/) { 1045 $mode = $1; 1046 #print "Parsing $mode\n"; 1047 } 1048 1049 elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) { 1050 $mode = 'DEPRECATED'; 1051 } 1052 } 1053 } 1054 1055 $in->close(); 1056 1057 $hash; 1058} 1059 1060#---------------------------------------------------------------------- 1061# Read the uchar.h file and compile a mapping of Unicode symbols to 1062# icu4c enum values. 1063# 1064# @param a filename for uchar.h 1065# 1066# @return a ref to a hash. The keys of the hash are '_bp' for binary 1067# properties, '_ep' for enumerated properties, '_dp'/'_sp'/'_mp' for 1068# double/string/mask properties, and 'gc', 'gcm', 'bc', 'blk', 1069# 'ea', 'dt', 'jt', 'jg', 'lb', or 'nt' for corresponding property 1070# value aliases. The values of the hash are subhashes. The subhashes 1071# have a key of the uchar.h enum symbol, and a value of the alias 1072# string (as listed in PropertyValueAliases.txt). NOTE: The alias 1073# string is whatever alias uchar.h lists. This may be either short or 1074# long, depending on the specific enum. NOTE: For blocks ('blk'), the 1075# value is a hex code point for the start of the associated block. 1076# NOTE: The special key _version will map to the Unicode version of 1077# the file. 1078sub read_uchar { 1079 1080 my $filename = shift; 1081 1082 my $mode = ''; # state machine mode and submode 1083 my $submode = ''; 1084 1085 my $last = ''; # for line folding 1086 1087 my $hash = {}; # result 1088 my $key; # first-level key 1089 1090 my $in = new FileHandle($filename, 'r'); 1091 die "Error: Cannot open $filename" if (!defined $in); 1092 1093 while (<$in>) { 1094 # Fold continued lines together 1095 if (/^(.*)\\$/) { 1096 $last .= $1; 1097 next; 1098 } elsif ($last) { 1099 $_ = $last . $_; 1100 $last = ''; 1101 } 1102 1103 # Exit all modes here 1104 if ($mode && $mode ne 'DEPRECATED') { 1105 if (/^\s*\}/) { 1106 $mode = ''; 1107 next; 1108 } 1109 } 1110 1111 # Handle individual modes 1112 1113 if ($mode eq 'UProperty') { 1114 if (/^\s*(UCHAR_\w+)\s*[,=]/ || /^\s+(UCHAR_\w+)\s*$/) { 1115 if ($submode) { 1116 addDatum($hash, $key, $1, $submode); 1117 $submode = ''; 1118 } else { 1119 #print "Warning: Ignoring $1\n"; 1120 } 1121 } 1122 1123 elsif (m|^\s*/\*\*\s*(\w+)\s+property\s+(\w+)|i) { 1124 die "Error: Unmatched tag $submode" if ($submode); 1125 die "Error: Unrecognized UProperty comment: $_" 1126 unless (exists $PROP_TYPE{$1}); 1127 $key = $PROP_TYPE{$1}; 1128 $submode = $2; 1129 } 1130 } 1131 1132 elsif ($mode eq 'UCharCategory') { 1133 if (/^\s*(U_\w+)\s*=/) { 1134 if ($submode) { 1135 addDatum($hash, 'gc', $1, $submode); 1136 $submode = ''; 1137 } else { 1138 #print "Warning: Ignoring $1\n"; 1139 } 1140 } 1141 1142 elsif (m|^\s*/\*\*\s*([A-Z][a-z])\s|) { 1143 die "Error: Unmatched tag $submode" if ($submode); 1144 $submode = $1; 1145 } 1146 } 1147 1148 elsif ($mode eq 'UCharDirection') { 1149 if (/^\s*(U_\w+)\s*[,=]/ || /^\s+(U_\w+)\s*$/) { 1150 if ($submode) { 1151 addDatum($hash, $key, $1, $submode); 1152 $submode = ''; 1153 } else { 1154 #print "Warning: Ignoring $1\n"; 1155 } 1156 } 1157 1158 elsif (m|/\*\*\s*([A-Z]+)\s|) { 1159 die "Error: Unmatched tag $submode" if ($submode); 1160 $key = 'bc'; 1161 $submode = $1; 1162 } 1163 } 1164 1165 elsif ($mode eq 'UBlockCode') { 1166 if (m|^\s*(UBLOCK_\w+).+?/\*\[(.+?)\]\*/|) { 1167 addDatum($hash, 'blk', $1, $2); 1168 } 1169 } 1170 1171 elsif ($mode eq 'UEastAsianWidth') { 1172 if (m|^\s*(U_EA_\w+).+?/\*\[(.+?)\]\*/|) { 1173 addDatum($hash, 'ea', $1, $2); 1174 } 1175 } 1176 1177 elsif ($mode eq 'UDecompositionType') { 1178 if (m|^\s*(U_DT_\w+).+?/\*\[(.+?)\]\*/|) { 1179 addDatum($hash, 'dt', $1, $2); 1180 } 1181 } 1182 1183 elsif ($mode eq 'UJoiningType') { 1184 if (m|^\s*(U_JT_\w+).+?/\*\[(.+?)\]\*/|) { 1185 addDatum($hash, 'jt', $1, $2); 1186 } 1187 } 1188 1189 elsif ($mode eq 'UJoiningGroup') { 1190 if (/^\s*(U_JG_(\w+))/) { 1191 addDatum($hash, 'jg', $1, $2) unless ($2 eq 'COUNT'); 1192 } 1193 } 1194 1195 elsif ($mode eq 'UGraphemeClusterBreak') { 1196 if (m|^\s*(U_GCB_\w+).+?/\*\[(.+?)\]\*/|) { 1197 addDatum($hash, 'GCB', $1, $2); 1198 } 1199 } 1200 1201 elsif ($mode eq 'UWordBreakValues') { 1202 if (m|^\s*(U_WB_\w+).+?/\*\[(.+?)\]\*/|) { 1203 addDatum($hash, 'WB', $1, $2); 1204 } 1205 } 1206 1207 elsif ($mode eq 'USentenceBreak') { 1208 if (m|^\s*(U_SB_\w+).+?/\*\[(.+?)\]\*/|) { 1209 addDatum($hash, 'SB', $1, $2); 1210 } 1211 } 1212 1213 elsif ($mode eq 'ULineBreak') { 1214 if (m|^\s*(U_LB_\w+).+?/\*\[(.+?)\]\*/|) { 1215 addDatum($hash, 'lb', $1, $2); 1216 } 1217 } 1218 1219 elsif ($mode eq 'UNumericType') { 1220 if (m|^\s*(U_NT_\w+).+?/\*\[(.+?)\]\*/|) { 1221 addDatum($hash, 'nt', $1, $2); 1222 } 1223 } 1224 1225 elsif ($mode eq 'UHangulSyllableType') { 1226 if (m|^\s*(U_HST_\w+).+?/\*\[(.+?)\]\*/|) { 1227 addDatum($hash, 'hst', $1, $2); 1228 } 1229 } 1230 1231 elsif ($mode eq 'DEPRECATED') { 1232 if (/\s*\#ifdef/) { 1233 die "Error: Nested #ifdef"; 1234 } 1235 elsif (/\s*\#endif/) { 1236 $mode = ''; 1237 } 1238 } 1239 1240 elsif (!$mode) { 1241 if (/^\s*\#define\s+(\w+)\s+(.+)/) { 1242 # #define $left $right 1243 my ($left, $right) = ($1, $2); 1244 1245 if ($left eq 'U_UNICODE_VERSION') { 1246 my $version = $right; 1247 $version = $1 if ($version =~ /^\"(.*)\"/); 1248 # print "Unicode version: ", $version, "\n"; 1249 die "Error: Multiple versions in $filename" 1250 if (defined $hash->{'_version'}); 1251 $hash->{'_version'} = $version; 1252 } 1253 1254 elsif ($left =~ /U_GC_(\w+?)_MASK/) { 1255 addDatum($hash, 'gcm', $left, $1); 1256 } 1257 } 1258 1259 elsif (/^\s*typedef\s+enum\s+(\w+)\s*\{/ || 1260 /^\s*typedef\s+enum\s+(\w+)\s*$/) { 1261 $mode = $1; 1262 #print "Parsing $mode\n"; 1263 } 1264 1265 elsif (/^\s*enum\s+(\w+)\s*\{/ || 1266 /^\s*enum\s+(\w+)\s*$/) { 1267 $mode = $1; 1268 #print "Parsing $mode\n"; 1269 } 1270 1271 elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) { 1272 $mode = 'DEPRECATED'; 1273 } 1274 } 1275 } 1276 1277 $in->close(); 1278 1279 # hardcode known values for the normalization quick check properties 1280 # see unorm.h for the UNormalizationCheckResult enum 1281 1282 addDatum($hash, 'NFC_QC', 'UNORM_NO', 'N'); 1283 addDatum($hash, 'NFC_QC', 'UNORM_YES', 'Y'); 1284 addDatum($hash, 'NFC_QC', 'UNORM_MAYBE', 'M'); 1285 1286 addDatum($hash, 'NFKC_QC', 'UNORM_NO', 'N'); 1287 addDatum($hash, 'NFKC_QC', 'UNORM_YES', 'Y'); 1288 addDatum($hash, 'NFKC_QC', 'UNORM_MAYBE', 'M'); 1289 1290 # no "maybe" values for NF[K]D 1291 1292 addDatum($hash, 'NFD_QC', 'UNORM_NO', 'N'); 1293 addDatum($hash, 'NFD_QC', 'UNORM_YES', 'Y'); 1294 1295 addDatum($hash, 'NFKD_QC', 'UNORM_NO', 'N'); 1296 addDatum($hash, 'NFKD_QC', 'UNORM_YES', 'Y'); 1297 1298 $hash; 1299} 1300 1301#---------------------------------------------------------------------- 1302# Add a new value to a two-level hash. That is, given a ref to 1303# a hash, two keys, and a value, add $hash->{$key1}->{$key2} = $value. 1304sub addDatum { 1305 my ($h, $k1, $k2, $v) = @_; 1306 if (exists $h->{$k1}->{$k2}) { 1307 die "Error: $k1:$k2 already set to " . 1308 $h->{$k1}->{$k2} . ", cannot set to " . $v; 1309 } 1310 $h->{$k1}->{$k2} = $v; 1311} 1312 1313#eof 1314