1# Copyright (C) 2018 and later: Unicode, Inc. and others. 2# License & terms of use: http://www.unicode.org/copyright.html 3 4# Python 2/3 Compatibility (ICU-20299) 5# TODO(ICU-20301): Remove this. 6from __future__ import print_function 7 8from icutools.databuilder import * 9from icutools.databuilder import utils 10from icutools.databuilder.request_types import * 11 12import os 13import sys 14 15 16def generate(config, io, common_vars): 17 requests = [] 18 19 if len(io.glob("misc/*")) == 0: 20 print("Error: Cannot find data directory; please specify --src_dir", file=sys.stderr) 21 exit(1) 22 23 requests += generate_cnvalias(config, io, common_vars) 24 requests += generate_ulayout(config, io, common_vars) 25 requests += generate_uemoji(config, io, common_vars) 26 requests += generate_confusables(config, io, common_vars) 27 requests += generate_conversion_mappings(config, io, common_vars) 28 requests += generate_brkitr_brk(config, io, common_vars) 29 requests += generate_brkitr_lstm(config, io, common_vars) 30 requests += generate_brkitr_adaboost(config, io, common_vars) 31 requests += generate_stringprep(config, io, common_vars) 32 requests += generate_brkitr_dictionaries(config, io, common_vars) 33 requests += generate_normalization(config, io, common_vars) 34 requests += generate_coll_ucadata(config, io, common_vars) 35 requests += generate_full_unicore_data(config, io, common_vars) 36 requests += generate_unames(config, io, common_vars) 37 requests += generate_misc(config, io, common_vars) 38 requests += generate_curr_supplemental(config, io, common_vars) 39 requests += generate_zone_supplemental(config, io, common_vars) 40 requests += generate_translit(config, io, common_vars) 41 42 # Res Tree Files 43 # (input dirname, output dirname, resfiles.mk path, mk version var, mk source var, use pool file, dep files) 44 requests += generate_tree(config, io, common_vars, 45 "locales", 46 None, 47 config.use_pool_bundle, 48 []) 49 50 requests += generate_tree(config, io, common_vars, 51 "curr", 52 "curr", 53 config.use_pool_bundle, 54 []) 55 56 requests += generate_tree(config, io, common_vars, 57 "lang", 58 "lang", 59 config.use_pool_bundle, 60 []) 61 62 requests += generate_tree(config, io, common_vars, 63 "region", 64 "region", 65 config.use_pool_bundle, 66 []) 67 68 requests += generate_tree(config, io, common_vars, 69 "zone", 70 "zone", 71 config.use_pool_bundle, 72 []) 73 74 requests += generate_tree(config, io, common_vars, 75 "unit", 76 "unit", 77 config.use_pool_bundle, 78 []) 79 80 requests += generate_tree(config, io, common_vars, 81 "coll", 82 "coll", 83 # Never use pool bundle for coll, brkitr, or rbnf 84 False, 85 # Depends on timezoneTypes.res and keyTypeData.res. 86 # TODO: We should not need this dependency to build collation. 87 # TODO: Bake keyTypeData.res into the common library? 88 [DepTarget("coll_ucadata"), DepTarget("misc_res"), InFile("unidata/UCARules.txt")]) 89 90 requests += generate_tree(config, io, common_vars, 91 "brkitr", 92 "brkitr", 93 # Never use pool bundle for coll, brkitr, or rbnf 94 False, 95 [DepTarget("brkitr_brk"), DepTarget("dictionaries")]) 96 97 requests += generate_tree(config, io, common_vars, 98 "rbnf", 99 "rbnf", 100 # Never use pool bundle for coll, brkitr, or rbnf 101 False, 102 []) 103 104 requests += [ 105 ListRequest( 106 name = "icudata_list", 107 variable_name = "icudata_all_output_files", 108 output_file = TmpFile("icudata.lst"), 109 include_tmp = False 110 ) 111 ] 112 113 return requests 114 115 116def generate_cnvalias(config, io, common_vars): 117 # UConv Name Aliases 118 input_file = InFile("mappings/convrtrs.txt") 119 output_file = OutFile("cnvalias.icu") 120 return [ 121 SingleExecutionRequest( 122 name = "cnvalias", 123 category = "cnvalias", 124 dep_targets = [], 125 input_files = [input_file], 126 output_files = [output_file], 127 tool = IcuTool("gencnval"), 128 args = "-s {IN_DIR} -d {OUT_DIR} " 129 "{INPUT_FILES[0]}", 130 format_with = {} 131 ) 132 ] 133 134 135def generate_confusables(config, io, common_vars): 136 # CONFUSABLES 137 txt1 = InFile("unidata/confusables.txt") 138 txt2 = InFile("unidata/confusablesWholeScript.txt") 139 cfu = OutFile("confusables.cfu") 140 return [ 141 SingleExecutionRequest( 142 name = "confusables", 143 category = "confusables", 144 dep_targets = [DepTarget("cnvalias")], 145 input_files = [txt1, txt2], 146 output_files = [cfu], 147 tool = IcuTool("gencfu"), 148 args = "-d {OUT_DIR} -i {OUT_DIR} " 149 "-c -r {IN_DIR}/{INPUT_FILES[0]} -w {IN_DIR}/{INPUT_FILES[1]} " 150 "-o {OUTPUT_FILES[0]}", 151 format_with = {} 152 ) 153 ] 154 155 156def generate_conversion_mappings(config, io, common_vars): 157 # UConv Conversion Table Files 158 input_files = [InFile(filename) for filename in io.glob("mappings/*.ucm")] 159 output_files = [OutFile("%s.cnv" % v.filename[9:-4]) for v in input_files] 160 # TODO: handle BUILD_SPECIAL_CNV_FILES? Means to add --ignore-siso-check flag to makeconv 161 return [ 162 RepeatedOrSingleExecutionRequest( 163 name = "conversion_mappings", 164 category = "conversion_mappings", 165 dep_targets = [], 166 input_files = input_files, 167 output_files = output_files, 168 tool = IcuTool("makeconv"), 169 args = "-s {IN_DIR} -d {OUT_DIR} -c {INPUT_FILE_PLACEHOLDER}", 170 format_with = {}, 171 repeat_with = { 172 "INPUT_FILE_PLACEHOLDER": utils.SpaceSeparatedList(file.filename for file in input_files) 173 } 174 ) 175 ] 176 177 178def generate_brkitr_brk(config, io, common_vars): 179 # BRK Files 180 input_files = [InFile(filename) for filename in io.glob("brkitr/rules/*.txt")] 181 output_files = [OutFile("brkitr/%s.brk" % v.filename[13:-4]) for v in input_files] 182 return [ 183 RepeatedExecutionRequest( 184 name = "brkitr_brk", 185 category = "brkitr_rules", 186 dep_targets = 187 [DepTarget("cnvalias"), 188 DepTarget("ulayout"), DepTarget("uemoji"), DepTarget("lstm_res"), DepTarget("adaboost_res")], 189 input_files = input_files, 190 output_files = output_files, 191 tool = IcuTool("genbrk"), 192 args = "-d {OUT_DIR} -i {OUT_DIR} " 193 "-c -r {IN_DIR}/{INPUT_FILE} " 194 "-o {OUTPUT_FILE}", 195 format_with = {}, 196 repeat_with = {} 197 ) 198 ] 199 200 201def generate_stringprep(config, io, common_vars): 202 # SPP FILES 203 input_files = [InFile(filename) for filename in io.glob("sprep/*.txt")] 204 output_files = [OutFile("%s.spp" % v.filename[6:-4]) for v in input_files] 205 bundle_names = [v.filename[6:-4] for v in input_files] 206 return [ 207 RepeatedExecutionRequest( 208 name = "stringprep", 209 category = "stringprep", 210 dep_targets = [InFile("unidata/NormalizationCorrections.txt")], 211 input_files = input_files, 212 output_files = output_files, 213 tool = IcuTool("gensprep"), 214 args = "-s {IN_DIR}/sprep -d {OUT_DIR} -i {OUT_DIR} " 215 "-b {BUNDLE_NAME} -m {IN_DIR}/unidata -u 3.2.0 {BUNDLE_NAME}.txt", 216 format_with = {}, 217 repeat_with = { 218 "BUNDLE_NAME": bundle_names 219 } 220 ) 221 ] 222 223 224def generate_brkitr_dictionaries(config, io, common_vars): 225 # Dict Files 226 input_files = [InFile(filename) for filename in io.glob("brkitr/dictionaries/*.txt")] 227 output_files = [OutFile("brkitr/%s.dict" % v.filename[20:-4]) for v in input_files] 228 extra_options_map = { 229 "brkitr/dictionaries/burmesedict.txt": "--bytes --transform offset-0x1000", 230 "brkitr/dictionaries/cjdict.txt": "--uchars", 231 "brkitr/dictionaries/khmerdict.txt": "--bytes --transform offset-0x1780", 232 "brkitr/dictionaries/laodict.txt": "--bytes --transform offset-0x0e80", 233 "brkitr/dictionaries/thaidict.txt": "--bytes --transform offset-0x0e00" 234 } 235 extra_optionses = [extra_options_map[v.filename] for v in input_files] 236 return [ 237 RepeatedExecutionRequest( 238 name = "dictionaries", 239 category = "brkitr_dictionaries", 240 dep_targets = [], 241 input_files = input_files, 242 output_files = output_files, 243 tool = IcuTool("gendict"), 244 args = "-i {OUT_DIR} " 245 "-c {EXTRA_OPTIONS} " 246 "{IN_DIR}/{INPUT_FILE} {OUT_DIR}/{OUTPUT_FILE}", 247 format_with = {}, 248 repeat_with = { 249 "EXTRA_OPTIONS": extra_optionses 250 } 251 ) 252 ] 253 254 255def generate_normalization(config, io, common_vars): 256 # NRM Files 257 input_files = [InFile(filename) for filename in io.glob("in/*.nrm")] 258 # nfc.nrm is pre-compiled into C++; see generate_full_unicore_data 259 input_files.remove(InFile("in/nfc.nrm")) 260 output_files = [OutFile(v.filename[3:]) for v in input_files] 261 return [ 262 RepeatedExecutionRequest( 263 name = "normalization", 264 category = "normalization", 265 dep_targets = [], 266 input_files = input_files, 267 output_files = output_files, 268 tool = IcuTool("icupkg"), 269 args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILE} {OUT_DIR}/{OUTPUT_FILE}", 270 format_with = {}, 271 repeat_with = {} 272 ) 273 ] 274 275 276def generate_coll_ucadata(config, io, common_vars): 277 # Collation Dependency File (ucadata.icu) 278 input_file = InFile("in/coll/ucadata-%s.icu" % config.coll_han_type) 279 output_file = OutFile("coll/ucadata.icu") 280 return [ 281 SingleExecutionRequest( 282 name = "coll_ucadata", 283 category = "coll_ucadata", 284 dep_targets = [], 285 input_files = [input_file], 286 output_files = [output_file], 287 tool = IcuTool("icupkg"), 288 args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}", 289 format_with = {} 290 ) 291 ] 292 293 294def generate_full_unicore_data(config, io, common_vars): 295 # The core Unicode properties files (pnames.icu, uprops.icu, ucase.icu, ubidi.icu) 296 # are hardcoded in the common DLL and therefore not included in the data package any more. 297 # They are not built by default but need to be built for ICU4J data, 298 # both in the .jar and in the .dat file (if ICU4J uses the .dat file). 299 # See ICU-4497. 300 if not config.include_uni_core_data: 301 return [] 302 303 basenames = [ 304 "pnames.icu", 305 "uprops.icu", 306 "ucase.icu", 307 "ubidi.icu", 308 "nfc.nrm" 309 ] 310 input_files = [InFile("in/%s" % bn) for bn in basenames] 311 output_files = [OutFile(bn) for bn in basenames] 312 return [ 313 RepeatedExecutionRequest( 314 name = "unicore", 315 category = "unicore", 316 input_files = input_files, 317 output_files = output_files, 318 tool = IcuTool("icupkg"), 319 args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILE} {OUT_DIR}/{OUTPUT_FILE}" 320 ) 321 ] 322 323 324def generate_unames(config, io, common_vars): 325 # Unicode Character Names 326 input_file = InFile("in/unames.icu") 327 output_file = OutFile("unames.icu") 328 return [ 329 SingleExecutionRequest( 330 name = "unames", 331 category = "unames", 332 dep_targets = [], 333 input_files = [input_file], 334 output_files = [output_file], 335 tool = IcuTool("icupkg"), 336 args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}", 337 format_with = {} 338 ) 339 ] 340 341 342def generate_ulayout(config, io, common_vars): 343 # Unicode text layout properties 344 basename = "ulayout" 345 input_file = InFile("in/%s.icu" % basename) 346 output_file = OutFile("%s.icu" % basename) 347 return [ 348 SingleExecutionRequest( 349 name = basename, 350 category = basename, 351 dep_targets = [], 352 input_files = [input_file], 353 output_files = [output_file], 354 tool = IcuTool("icupkg"), 355 args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}", 356 format_with = {} 357 ) 358 ] 359 360 361def generate_uemoji(config, io, common_vars): 362 # Unicode emoji properties 363 basename = "uemoji" 364 input_file = InFile("in/%s.icu" % basename) 365 output_file = OutFile("%s.icu" % basename) 366 return [ 367 SingleExecutionRequest( 368 name = basename, 369 category = basename, 370 dep_targets = [], 371 input_files = [input_file], 372 output_files = [output_file], 373 tool = IcuTool("icupkg"), 374 args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}", 375 format_with = {} 376 ) 377 ] 378 379 380def generate_misc(config, io, common_vars): 381 # Misc Data Res Files 382 input_files = [InFile(filename) for filename in io.glob("misc/*.txt")] 383 input_basenames = [v.filename[5:] for v in input_files] 384 output_files = [OutFile("%s.res" % v[:-4]) for v in input_basenames] 385 return [ 386 RepeatedExecutionRequest( 387 name = "misc_res", 388 category = "misc", 389 dep_targets = [DepTarget("cnvalias")], # ICU-21175 390 input_files = input_files, 391 output_files = output_files, 392 tool = IcuTool("genrb"), 393 args = "-s {IN_DIR}/misc -d {OUT_DIR} -i {OUT_DIR} " 394 "-k -q " 395 "{INPUT_BASENAME}", 396 format_with = {}, 397 repeat_with = { 398 "INPUT_BASENAME": input_basenames 399 } 400 ) 401 ] 402 403 404def generate_curr_supplemental(config, io, common_vars): 405 # Currency Supplemental Res File 406 input_file = InFile("curr/supplementalData.txt") 407 input_basename = "supplementalData.txt" 408 output_file = OutFile("curr/supplementalData.res") 409 return [ 410 SingleExecutionRequest( 411 name = "curr_supplemental_res", 412 category = "curr_supplemental", 413 dep_targets = [], 414 input_files = [input_file], 415 output_files = [output_file], 416 tool = IcuTool("genrb"), 417 args = "-s {IN_DIR}/curr -d {OUT_DIR}/curr -i {OUT_DIR} " 418 "-k " 419 "{INPUT_BASENAME}", 420 format_with = { 421 "INPUT_BASENAME": input_basename 422 } 423 ) 424 ] 425 426 427def generate_zone_supplemental(config, io, common_vars): 428 # tzdbNames Res File 429 input_file = InFile("zone/tzdbNames.txt") 430 input_basename = "tzdbNames.txt" 431 output_file = OutFile("zone/tzdbNames.res") 432 return [ 433 SingleExecutionRequest( 434 name = "zone_supplemental_res", 435 category = "zone_supplemental", 436 dep_targets = [], 437 input_files = [input_file], 438 output_files = [output_file], 439 tool = IcuTool("genrb"), 440 args = "-s {IN_DIR}/zone -d {OUT_DIR}/zone -i {OUT_DIR} " 441 "-k " 442 "{INPUT_BASENAME}", 443 format_with = { 444 "INPUT_BASENAME": input_basename 445 } 446 ) 447 ] 448 449 450def generate_translit(config, io, common_vars): 451 input_files = [ 452 InFile("translit/root.txt"), 453 InFile("translit/en.txt"), 454 InFile("translit/el.txt") 455 ] 456 dep_files = set(InFile(filename) for filename in io.glob("translit/*.txt")) 457 dep_files -= set(input_files) 458 dep_files = list(sorted(dep_files)) 459 input_basenames = [v.filename[9:] for v in input_files] 460 output_files = [ 461 OutFile("translit/%s.res" % v[:-4]) 462 for v in input_basenames 463 ] 464 return [ 465 RepeatedOrSingleExecutionRequest( 466 name = "translit_res", 467 category = "translit", 468 dep_targets = dep_files, 469 input_files = input_files, 470 output_files = output_files, 471 tool = IcuTool("genrb"), 472 args = "-s {IN_DIR}/translit -d {OUT_DIR}/translit -i {OUT_DIR} " 473 "-k " 474 "{INPUT_BASENAME}", 475 format_with = { 476 }, 477 repeat_with = { 478 "INPUT_BASENAME": utils.SpaceSeparatedList(input_basenames) 479 } 480 ) 481 ] 482 483 484def generate_brkitr_lstm(config, io, common_vars): 485 input_files = [InFile(filename) for filename in io.glob("brkitr/lstm/*.txt")] 486 input_basenames = [v.filename[12:] for v in input_files] 487 output_files = [ 488 OutFile("brkitr/%s.res" % v[:-4]) 489 for v in input_basenames 490 ] 491 return [ 492 RepeatedOrSingleExecutionRequest( 493 name = "lstm_res", 494 category = "brkitr_lstm", 495 dep_targets = [], 496 input_files = input_files, 497 output_files = output_files, 498 tool = IcuTool("genrb"), 499 args = "-s {IN_DIR}/brkitr/lstm -d {OUT_DIR}/brkitr -i {OUT_DIR} " 500 "-k " 501 "{INPUT_BASENAME}", 502 format_with = { 503 }, 504 repeat_with = { 505 "INPUT_BASENAME": utils.SpaceSeparatedList(input_basenames) 506 } 507 ) 508 ] 509 510def generate_brkitr_adaboost(config, io, common_vars): 511 input_files = [InFile(filename) for filename in io.glob("brkitr/adaboost/*.txt")] 512 input_basenames = [v.filename[16:] for v in input_files] 513 output_files = [ 514 OutFile("brkitr/%s.res" % v[:-4]) 515 for v in input_basenames 516 ] 517 return [ 518 RepeatedOrSingleExecutionRequest( 519 name = "adaboost_res", 520 category = "brkitr_adaboost", 521 dep_targets = [], 522 input_files = input_files, 523 output_files = output_files, 524 tool = IcuTool("genrb"), 525 args = "-s {IN_DIR}/brkitr/adaboost -d {OUT_DIR}/brkitr -i {OUT_DIR} " 526 "-k " 527 "{INPUT_BASENAME}", 528 format_with = { 529 }, 530 repeat_with = { 531 "INPUT_BASENAME": utils.SpaceSeparatedList(input_basenames) 532 } 533 ) 534 ] 535 536def generate_tree( 537 config, 538 io, 539 common_vars, 540 sub_dir, 541 out_sub_dir, 542 use_pool_bundle, 543 dep_targets): 544 requests = [] 545 category = "%s_tree" % sub_dir 546 out_prefix = "%s/" % out_sub_dir if out_sub_dir else "" 547 input_files = [InFile(filename) for filename in io.glob("%s/*.txt" % sub_dir)] 548 if sub_dir == "curr": 549 input_files.remove(InFile("curr/supplementalData.txt")) 550 if sub_dir == "zone": 551 input_files.remove(InFile("zone/tzdbNames.txt")) 552 input_basenames = [v.filename[len(sub_dir)+1:] for v in input_files] 553 output_files = [ 554 OutFile("%s%s.res" % (out_prefix, v[:-4])) 555 for v in input_basenames 556 ] 557 558 # Generate Pool Bundle 559 if use_pool_bundle: 560 input_pool_files = [OutFile("%spool.res" % out_prefix)] 561 pool_target_name = "%s_pool_write" % sub_dir 562 use_pool_bundle_option = "--usePoolBundle {OUT_DIR}/{OUT_PREFIX}".format( 563 OUT_PREFIX = out_prefix, 564 **common_vars 565 ) 566 requests += [ 567 SingleExecutionRequest( 568 name = pool_target_name, 569 category = category, 570 dep_targets = dep_targets, 571 input_files = input_files, 572 output_files = input_pool_files, 573 tool = IcuTool("genrb"), 574 args = "-s {IN_DIR}/{IN_SUB_DIR} -d {OUT_DIR}/{OUT_PREFIX} -i {OUT_DIR} " 575 "--writePoolBundle -k " 576 "{INPUT_BASENAMES_SPACED}", 577 format_with = { 578 "IN_SUB_DIR": sub_dir, 579 "OUT_PREFIX": out_prefix, 580 "INPUT_BASENAMES_SPACED": utils.SpaceSeparatedList(input_basenames) 581 } 582 ), 583 ] 584 dep_targets = dep_targets + [DepTarget(pool_target_name)] 585 else: 586 use_pool_bundle_option = "" 587 588 # Generate Res File Tree 589 requests += [ 590 RepeatedOrSingleExecutionRequest( 591 name = "%s_res" % sub_dir, 592 category = category, 593 dep_targets = dep_targets, 594 input_files = input_files, 595 output_files = output_files, 596 tool = IcuTool("genrb"), 597 args = "-s {IN_DIR}/{IN_SUB_DIR} -d {OUT_DIR}/{OUT_PREFIX} -i {OUT_DIR} " 598 "{EXTRA_OPTION} -k " 599 "{INPUT_BASENAME}", 600 format_with = { 601 "IN_SUB_DIR": sub_dir, 602 "OUT_PREFIX": out_prefix, 603 "EXTRA_OPTION": use_pool_bundle_option 604 }, 605 repeat_with = { 606 "INPUT_BASENAME": utils.SpaceSeparatedList(input_basenames) 607 } 608 ) 609 ] 610 611 # Generate res_index file 612 # Exclude the deprecated locale variants and root; see ICU-20628. This 613 # could be data-driven, but we do not want to perform I/O in this script 614 # (for example, we do not want to read from an XML file). 615 excluded_locales = set([ 616 "ja_JP_TRADITIONAL", 617 "th_TH_TRADITIONAL", 618 "de_", 619 "de__PHONEBOOK", 620 "es_", 621 "es__TRADITIONAL", 622 "root", 623 ]) 624 # Put alias locales in a separate structure; see ICU-20627 625 dependency_data = io.read_locale_deps(sub_dir) 626 if "aliases" in dependency_data: 627 alias_locales = set(dependency_data["aliases"].keys()) 628 else: 629 alias_locales = set() 630 alias_files = [] 631 installed_files = [] 632 for f in input_files: 633 file_stem = IndexRequest.locale_file_stem(f) 634 if file_stem in excluded_locales: 635 continue 636 destination = alias_files if file_stem in alias_locales else installed_files 637 destination.append(f) 638 cldr_version = dependency_data["cldrVersion"] if sub_dir == "locales" else None 639 index_file_txt = TmpFile("{IN_SUB_DIR}/{INDEX_NAME}.txt".format( 640 IN_SUB_DIR = sub_dir, 641 **common_vars 642 )) 643 index_res_file = OutFile("{OUT_PREFIX}{INDEX_NAME}.res".format( 644 OUT_PREFIX = out_prefix, 645 **common_vars 646 )) 647 index_file_target_name = "%s_index_txt" % sub_dir 648 requests += [ 649 IndexRequest( 650 name = index_file_target_name, 651 category = category, 652 installed_files = installed_files, 653 alias_files = alias_files, 654 txt_file = index_file_txt, 655 output_file = index_res_file, 656 cldr_version = cldr_version, 657 args = "-s {TMP_DIR}/{IN_SUB_DIR} -d {OUT_DIR}/{OUT_PREFIX} -i {OUT_DIR} " 658 "-k " 659 "{INDEX_NAME}.txt", 660 format_with = { 661 "IN_SUB_DIR": sub_dir, 662 "OUT_PREFIX": out_prefix 663 } 664 ) 665 ] 666 667 return requests 668