# Copyright (C) 2018 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html

# Python 2/3 Compatibility (ICU-20299)
# TODO(ICU-20301): Remove this.
from __future__ import print_function

from icutools.databuilder import *
from icutools.databuilder import utils
from icutools.databuilder.request_types import *

import os
import sys


def generate(config, io, common_vars):
    """Build the complete list of data build requests.

    Args:
        config: Build configuration; this function reads
            ``config.use_pool_bundle`` and passes the object on to the
            per-category generators.
        io: Object providing ``glob()`` (and ``read_locale_deps()``, used by
            ``generate_tree``) for inspecting the source data directory.
        common_vars: Mapping of shared format variables, forwarded to the
            per-category generators.

    Returns:
        A list of request objects, in the order the generators are invoked,
        ending with the ``icudata_list`` ListRequest.

    Raises:
        SystemExit: with status 1 if nothing matches ``misc/*``, after
            printing an error message to stderr.
    """
    requests = []

    if not io.glob("misc/*"):
        print("Error: Cannot find data directory; please specify --src_dir", file=sys.stderr)
        # Use sys.exit() rather than the site-provided exit() helper, which is
        # only injected for interactive use and may be absent (e.g. python -S).
        sys.exit(1)

    requests += generate_cnvalias(config, io, common_vars)
    requests += generate_ulayout(config, io, common_vars)
    requests += generate_uemoji(config, io, common_vars)
    requests += generate_confusables(config, io, common_vars)
    requests += generate_conversion_mappings(config, io, common_vars)
    requests += generate_brkitr_brk(config, io, common_vars)
    requests += generate_brkitr_lstm(config, io, common_vars)
    requests += generate_stringprep(config, io, common_vars)
    requests += generate_brkitr_dictionaries(config, io, common_vars)
    requests += generate_normalization(config, io, common_vars)
    requests += generate_coll_ucadata(config, io, common_vars)
    requests += generate_full_unicore_data(config, io, common_vars)
    requests += generate_unames(config, io, common_vars)
    requests += generate_misc(config, io, common_vars)
    requests += generate_curr_supplemental(config, io, common_vars)
    requests += generate_zone_supplemental(config, io, common_vars)
    requests += generate_translit(config, io, common_vars)

    # Res Tree Files
    # (input dirname, output dirname, use pool file, dep files)
    requests += generate_tree(config, io, common_vars,
        "locales",
        None,
        config.use_pool_bundle,
        [])

    # These trees use the same name for the input and output sub-directory,
    # honor the configured pool bundle setting, and have no extra dependencies.
    for tree_name in ("curr", "lang", "region", "zone", "unit"):
        requests += generate_tree(config, io, common_vars,
            tree_name,
            tree_name,
            config.use_pool_bundle,
            [])

    requests += generate_tree(config, io, common_vars,
        "coll",
        "coll",
        # Never use pool bundle for coll, brkitr, or rbnf
        False,
        # Depends on timezoneTypes.res and keyTypeData.res.
        # TODO: We should not need this dependency to build collation.
        # TODO: Bake keyTypeData.res into the common library?
        [DepTarget("coll_ucadata"), DepTarget("misc_res"), InFile("unidata/UCARules.txt")])

    requests += generate_tree(config, io, common_vars,
        "brkitr",
        "brkitr",
        # Never use pool bundle for coll, brkitr, or rbnf
        False,
        [DepTarget("brkitr_brk"), DepTarget("dictionaries")])

    requests += generate_tree(config, io, common_vars,
        "rbnf",
        "rbnf",
        # Never use pool bundle for coll, brkitr, or rbnf
        False,
        [])

    requests += [
        ListRequest(
            name = "icudata_list",
            variable_name = "icudata_all_output_files",
            output_file = TmpFile("icudata.lst"),
            include_tmp = False
        )
    ]

    return requests


def generate_cnvalias(config, io, common_vars):
    """Return the request that runs gencnval on mappings/convrtrs.txt.

    The single output file is cnvalias.icu.
    """
    # UConv Name Aliases
    input_file = InFile("mappings/convrtrs.txt")
    output_file = OutFile("cnvalias.icu")
    return [
        SingleExecutionRequest(
            name = "cnvalias",
            category = "cnvalias",
            dep_targets = [],
            input_files = [input_file],
            output_files = [output_file],
            tool = IcuTool("gencnval"),
            args = "-s {IN_DIR} -d {OUT_DIR} "
                "{INPUT_FILES[0]}",
            format_with = {}
        )
    ]


def generate_confusables(config, io, common_vars):
    """Return the request that runs gencfu to produce confusables.cfu.

    Inputs are unidata/confusables.txt and
    unidata/confusablesWholeScript.txt; the request depends on the
    "cnvalias" target.
    """
    # CONFUSABLES
    txt1 = InFile("unidata/confusables.txt")
    txt2 = InFile("unidata/confusablesWholeScript.txt")
    cfu = OutFile("confusables.cfu")
    return [
        SingleExecutionRequest(
            name = "confusables",
            category = "confusables",
            dep_targets = [DepTarget("cnvalias")],
            input_files = [txt1, txt2],
            output_files = [cfu],
            tool = IcuTool("gencfu"),
            args = "-d {OUT_DIR} -i {OUT_DIR} "
                "-c -r {IN_DIR}/{INPUT_FILES[0]} -w {IN_DIR}/{INPUT_FILES[1]} "
                "-o {OUTPUT_FILES[0]}",
            format_with = {}
        )
    ]


def generate_conversion_mappings(config, io, common_vars):
    """Return the makeconv request covering every mappings/*.ucm file.

    Each input mappings/NAME.ucm is paired with an output NAME.cnv.
    """
    # UConv Conversion Table Files
    input_files = [InFile(filename) for filename in io.glob("mappings/*.ucm")]
    # [9:-4] strips the leading "mappings/" and the trailing ".ucm".
    output_files = [OutFile("%s.cnv" % v.filename[9:-4]) for v in input_files]
    # TODO: handle BUILD_SPECIAL_CNV_FILES? Means to add --ignore-siso-check flag to makeconv
    return [
        RepeatedOrSingleExecutionRequest(
            name = "conversion_mappings",
            category = "conversion_mappings",
            dep_targets = [],
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("makeconv"),
            args = "-s {IN_DIR} -d {OUT_DIR} -c {INPUT_FILE_PLACEHOLDER}",
            format_with = {},
            repeat_with = {
                "INPUT_FILE_PLACEHOLDER": utils.SpaceSeparatedList(v.filename for v in input_files)
            }
        )
    ]


def generate_brkitr_brk(config, io, common_vars):
    """Return the genbrk request covering every brkitr/rules/*.txt file.

    Each input brkitr/rules/NAME.txt is paired with an output
    brkitr/NAME.brk. The request depends on the "cnvalias", "ulayout",
    "uemoji" and "lstm_res" targets.
    """
    # BRK Files
    input_files = [InFile(filename) for filename in io.glob("brkitr/rules/*.txt")]
    # [13:-4] strips the leading "brkitr/rules/" and the trailing ".txt".
    output_files = [OutFile("brkitr/%s.brk" % v.filename[13:-4]) for v in input_files]
    return [
        RepeatedExecutionRequest(
            name = "brkitr_brk",
            category = "brkitr_rules",
            dep_targets =
                [DepTarget("cnvalias"),
                    DepTarget("ulayout"), DepTarget("uemoji"), DepTarget("lstm_res")],
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("genbrk"),
            args = "-d {OUT_DIR} -i {OUT_DIR} "
                "-c -r {IN_DIR}/{INPUT_FILE} "
                "-o {OUTPUT_FILE}",
            format_with = {},
            repeat_with = {}
        )
    ]


def generate_stringprep(config, io, common_vars):
    """Return the gensprep request covering every sprep/*.txt file.

    Each input sprep/NAME.txt is paired with an output NAME.spp, and NAME is
    supplied to the tool as {BUNDLE_NAME}.
    """
    # SPP FILES
    input_files = [InFile(filename) for filename in io.glob("sprep/*.txt")]
    # [6:-4] strips the leading "sprep/" and the trailing ".txt".
    output_files = [OutFile("%s.spp" % v.filename[6:-4]) for v in input_files]
    bundle_names = [v.filename[6:-4] for v in input_files]
    return [
        RepeatedExecutionRequest(
            name = "stringprep",
            category = "stringprep",
            dep_targets = [InFile("unidata/NormalizationCorrections.txt")],
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("gensprep"),
            args = "-s {IN_DIR}/sprep -d {OUT_DIR} -i {OUT_DIR} "
                "-b {BUNDLE_NAME} -m {IN_DIR}/unidata -u 3.2.0 {BUNDLE_NAME}.txt",
            format_with = {},
            repeat_with = {
                "BUNDLE_NAME": bundle_names
            }
        )
    ]


def generate_brkitr_dictionaries(config, io, common_vars):
    """Return the gendict request covering every brkitr/dictionaries/*.txt file.

    Each input brkitr/dictionaries/NAME.txt is paired with an output
    brkitr/NAME.dict.

    Raises:
        KeyError: if a globbed dictionary file has no entry in the
            per-file extra options table below.
    """
    # Dict Files
    input_files = [InFile(filename) for filename in io.glob("brkitr/dictionaries/*.txt")]
    # [20:-4] strips the leading "brkitr/dictionaries/" and the trailing ".txt".
    output_files = [OutFile("brkitr/%s.dict" % v.filename[20:-4]) for v in input_files]
    # Tool options that differ per dictionary; every globbed file must be listed.
    extra_options_map = {
        "brkitr/dictionaries/burmesedict.txt": "--bytes --transform offset-0x1000",
        "brkitr/dictionaries/cjdict.txt": "--uchars",
        "brkitr/dictionaries/khmerdict.txt": "--bytes --transform offset-0x1780",
        "brkitr/dictionaries/laodict.txt": "--bytes --transform offset-0x0e80",
        "brkitr/dictionaries/thaidict.txt": "--bytes --transform offset-0x0e00"
    }
    extra_optionses = [extra_options_map[v.filename] for v in input_files]
    return [
        RepeatedExecutionRequest(
            name = "dictionaries",
            category = "brkitr_dictionaries",
            dep_targets = [],
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("gendict"),
            args = "-i {OUT_DIR} "
                "-c {EXTRA_OPTIONS} "
                "{IN_DIR}/{INPUT_FILE} {OUT_DIR}/{OUTPUT_FILE}",
            format_with = {},
            repeat_with = {
                "EXTRA_OPTIONS": extra_optionses
            }
        )
    ]


def generate_normalization(config, io, common_vars):
    """Return the icupkg request covering every in/*.nrm file except nfc.nrm.

    Each input in/NAME.nrm is paired with an output NAME.nrm.

    Raises:
        ValueError: if in/nfc.nrm is not among the globbed files
            (raised by list.remove).
    """
    # NRM Files
    input_files = [InFile(filename) for filename in io.glob("in/*.nrm")]
    # nfc.nrm is pre-compiled into C++; see generate_full_unicore_data
    input_files.remove(InFile("in/nfc.nrm"))
    # [3:] strips the leading "in/".
    output_files = [OutFile(v.filename[3:]) for v in input_files]
    return [
        RepeatedExecutionRequest(
            name = "normalization",
            category = "normalization",
            dep_targets = [],
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("icupkg"),
            args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILE} {OUT_DIR}/{OUTPUT_FILE}",
            format_with = {},
            repeat_with = {}
        )
    ]


def generate_coll_ucadata(config, io, common_vars):
    """Return the icupkg request that produces coll/ucadata.icu.

    The input file name is selected by ``config.coll_han_type``.
    """
    # Collation Dependency File (ucadata.icu)
    input_file = InFile("in/coll/ucadata-%s.icu" % config.coll_han_type)
    output_file = OutFile("coll/ucadata.icu")
    return [
        SingleExecutionRequest(
            name = "coll_ucadata",
            category = "coll_ucadata",
            dep_targets = [],
            input_files = [input_file],
            output_files = [output_file],
            tool = IcuTool("icupkg"),
            args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}",
            format_with = {}
        )
    ]


def generate_full_unicore_data(config, io, common_vars):
    """Return the icupkg request for the core Unicode property files.

    Returns an empty list unless ``config.include_uni_core_data`` is truthy.
    """
    # The core Unicode properties files (pnames.icu, uprops.icu, ucase.icu, ubidi.icu)
    # are hardcoded in the common DLL and therefore not included in the data package any more.
    # They are not built by default but need to be built for ICU4J data,
    # both in the .jar and in the .dat file (if ICU4J uses the .dat file).
    # See ICU-4497.
    if not config.include_uni_core_data:
        return []

    basenames = [
        "pnames.icu",
        "uprops.icu",
        "ucase.icu",
        "ubidi.icu",
        "nfc.nrm"
    ]
    input_files = [InFile("in/%s" % bn) for bn in basenames]
    output_files = [OutFile(bn) for bn in basenames]
    return [
        RepeatedExecutionRequest(
            name = "unicore",
            category = "unicore",
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("icupkg"),
            args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILE} {OUT_DIR}/{OUTPUT_FILE}"
        )
    ]


def generate_unames(config, io, common_vars):
    """Return the icupkg request that turns in/unames.icu into unames.icu."""
    # Unicode Character Names
    input_file = InFile("in/unames.icu")
    output_file = OutFile("unames.icu")
    return [
        SingleExecutionRequest(
            name = "unames",
            category = "unames",
            dep_targets = [],
            input_files = [input_file],
            output_files = [output_file],
            tool = IcuTool("icupkg"),
            args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}",
            format_with = {}
        )
    ]


def generate_ulayout(config, io, common_vars):
    """Return the icupkg request that turns in/ulayout.icu into ulayout.icu."""
    # Unicode text layout properties
    basename = "ulayout"
    input_file = InFile("in/%s.icu" % basename)
    output_file = OutFile("%s.icu" % basename)
    return [
        SingleExecutionRequest(
            name = basename,
            category = basename,
            dep_targets = [],
            input_files = [input_file],
            output_files = [output_file],
            tool = IcuTool("icupkg"),
            args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}",
            format_with = {}
        )
    ]


def generate_uemoji(config, io, common_vars):
    """Return the icupkg request that turns in/uemoji.icu into uemoji.icu."""
    # Unicode emoji properties
    basename = "uemoji"
    input_file = InFile("in/%s.icu" % basename)
    output_file = OutFile("%s.icu" % basename)
    return [
        SingleExecutionRequest(
            name = basename,
            category = basename,
            dep_targets = [],
            input_files = [input_file],
            output_files = [output_file],
            tool = IcuTool("icupkg"),
            args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}",
            format_with = {}
        )
    ]


def generate_misc(config, io, common_vars):
    """Return the genrb request covering every misc/*.txt file.

    Each input misc/NAME.txt is paired with an output NAME.res. The request
    depends on the "cnvalias" target.
    """
    # Misc Data Res Files
    input_files = [InFile(filename) for filename in io.glob("misc/*.txt")]
    # [5:] strips the leading "misc/"; [:-4] strips the trailing ".txt".
    input_basenames = [v.filename[5:] for v in input_files]
    output_files = [OutFile("%s.res" % v[:-4]) for v in input_basenames]
    return [
        RepeatedExecutionRequest(
            name = "misc_res",
            category = "misc",
            dep_targets = [DepTarget("cnvalias")], # ICU-21175
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("genrb"),
            args = "-s {IN_DIR}/misc -d {OUT_DIR} -i {OUT_DIR} "
                "-k -q "
                "{INPUT_BASENAME}",
            format_with = {},
            repeat_with = {
                "INPUT_BASENAME": input_basenames
            }
        )
    ]


def generate_curr_supplemental(config, io, common_vars):
    """Return the genrb request for curr/supplementalData.txt.

    The single output file is curr/supplementalData.res.
    """
    # Currency Supplemental Res File
    input_file = InFile("curr/supplementalData.txt")
    input_basename = "supplementalData.txt"
    output_file = OutFile("curr/supplementalData.res")
    return [
        SingleExecutionRequest(
            name = "curr_supplemental_res",
            category = "curr_supplemental",
            dep_targets = [],
            input_files = [input_file],
            output_files = [output_file],
            tool = IcuTool("genrb"),
            args = "-s {IN_DIR}/curr -d {OUT_DIR}/curr -i {OUT_DIR} "
                "-k "
                "{INPUT_BASENAME}",
            format_with = {
                "INPUT_BASENAME": input_basename
            }
        )
    ]


def generate_zone_supplemental(config, io, common_vars):
    """Return the genrb request for zone/tzdbNames.txt.

    The single output file is zone/tzdbNames.res.
    """
    # tzdbNames Res File
    input_file = InFile("zone/tzdbNames.txt")
    input_basename = "tzdbNames.txt"
    output_file = OutFile("zone/tzdbNames.res")
    return [
        SingleExecutionRequest(
            name = "zone_supplemental_res",
            category = "zone_supplemental",
            dep_targets = [],
            input_files = [input_file],
            output_files = [output_file],
            tool = IcuTool("genrb"),
            args = "-s {IN_DIR}/zone -d {OUT_DIR}/zone -i {OUT_DIR} "
                "-k "
                "{INPUT_BASENAME}",
            format_with = {
                "INPUT_BASENAME": input_basename
            }
        )
    ]


def generate_translit(config, io, common_vars):
    """Return the genrb request for the transliteration resource files.

    Only root.txt, en.txt and el.txt under translit/ are listed as inputs;
    every other translit/*.txt file is passed as a dependency instead.
    """
    input_files = [
        InFile("translit/root.txt"),
        InFile("translit/en.txt"),
        InFile("translit/el.txt")
    ]
    dep_files = set(InFile(filename) for filename in io.glob("translit/*.txt"))
    dep_files -= set(input_files)
    # sorted() already returns a new list, giving a deterministic order.
    dep_files = sorted(dep_files)
    # [9:] strips the leading "translit/"; [:-4] strips the trailing ".txt".
    input_basenames = [v.filename[9:] for v in input_files]
    output_files = [
        OutFile("translit/%s.res" % v[:-4])
        for v in input_basenames
    ]
    return [
        RepeatedOrSingleExecutionRequest(
            name = "translit_res",
            category = "translit",
            dep_targets = dep_files,
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("genrb"),
            args = "-s {IN_DIR}/translit -d {OUT_DIR}/translit -i {OUT_DIR} "
                "-k "
                "{INPUT_BASENAME}",
            format_with = {
            },
            repeat_with = {
                "INPUT_BASENAME": utils.SpaceSeparatedList(input_basenames)
            }
        )
    ]


def generate_brkitr_lstm(config, io, common_vars):
    """Return the genrb request covering every brkitr/lstm/*.txt file.

    Each input brkitr/lstm/NAME.txt is paired with an output brkitr/NAME.res.
    """
    input_files = [InFile(filename) for filename in io.glob("brkitr/lstm/*.txt")]
    # [12:] strips the leading "brkitr/lstm/"; [:-4] strips the trailing ".txt".
    input_basenames = [v.filename[12:] for v in input_files]
    output_files = [
        OutFile("brkitr/%s.res" % v[:-4])
        for v in input_basenames
    ]
    return [
        RepeatedOrSingleExecutionRequest(
            name = "lstm_res",
            category = "brkitr_lstm",
            dep_targets = [],
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("genrb"),
            args = "-s {IN_DIR}/brkitr/lstm -d {OUT_DIR}/brkitr -i {OUT_DIR} "
                "-k "
                "{INPUT_BASENAME}",
            format_with = {
            },
            repeat_with = {
                "INPUT_BASENAME": utils.SpaceSeparatedList(input_basenames)
            }
        )
    ]


def generate_tree(
        config,
        io,
        common_vars,
        sub_dir,
        out_sub_dir,
        use_pool_bundle,
        dep_targets):
    """Build the requests for one resource bundle tree.

    Args:
        config: Build configuration (not read directly in this function).
        io: Object providing ``glob()`` and ``read_locale_deps()``.
        common_vars: Mapping of shared format variables; it is expanded into
            ``str.format`` calls that reference {OUT_DIR} and {INDEX_NAME}.
        sub_dir: Input sub-directory whose ``*.txt`` files form the tree.
        out_sub_dir: Output sub-directory, or a falsy value to write to the
            top of the output directory.
        use_pool_bundle: If truthy, also emit a request that writes pool.res
            and make the tree request depend on it.
        dep_targets: List of extra dependencies for the tree requests. It is
            not mutated; a new list is built when the pool target is added.

    Returns:
        A list holding the optional pool bundle request, the tree request and
        the index request, in that order.
    """
    requests = []
    category = "%s_tree" % sub_dir
    out_prefix = "%s/" % out_sub_dir if out_sub_dir else ""
    input_files = [InFile(filename) for filename in io.glob("%s/*.txt" % sub_dir)]
    # These two files are built by generate_curr_supplemental and
    # generate_zone_supplemental, so they are kept out of the trees.
    if sub_dir == "curr":
        input_files.remove(InFile("curr/supplementalData.txt"))
    if sub_dir == "zone":
        input_files.remove(InFile("zone/tzdbNames.txt"))
    # Strip the leading "<sub_dir>/"; [:-4] below strips the trailing ".txt".
    input_basenames = [v.filename[len(sub_dir)+1:] for v in input_files]
    output_files = [
        OutFile("%s%s.res" % (out_prefix, v[:-4]))
        for v in input_basenames
    ]

    # Generate Pool Bundle
    if use_pool_bundle:
        input_pool_files = [OutFile("%spool.res" % out_prefix)]
        pool_target_name = "%s_pool_write" % sub_dir
        use_pool_bundle_option = "--usePoolBundle {OUT_DIR}/{OUT_PREFIX}".format(
            OUT_PREFIX = out_prefix,
            **common_vars
        )
        requests += [
            SingleExecutionRequest(
                name = pool_target_name,
                category = category,
                dep_targets = dep_targets,
                input_files = input_files,
                output_files = input_pool_files,
                tool = IcuTool("genrb"),
                args = "-s {IN_DIR}/{IN_SUB_DIR} -d {OUT_DIR}/{OUT_PREFIX} -i {OUT_DIR} "
                    "--writePoolBundle -k "
                    "{INPUT_BASENAMES_SPACED}",
                format_with = {
                    "IN_SUB_DIR": sub_dir,
                    "OUT_PREFIX": out_prefix,
                    "INPUT_BASENAMES_SPACED": utils.SpaceSeparatedList(input_basenames)
                }
            ),
        ]
        # Build a new list so the caller's dep_targets is left untouched.
        dep_targets = dep_targets + [DepTarget(pool_target_name)]
    else:
        use_pool_bundle_option = ""

    # Generate Res File Tree
    requests += [
        RepeatedOrSingleExecutionRequest(
            name = "%s_res" % sub_dir,
            category = category,
            dep_targets = dep_targets,
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("genrb"),
            args = "-s {IN_DIR}/{IN_SUB_DIR} -d {OUT_DIR}/{OUT_PREFIX} -i {OUT_DIR} "
                "{EXTRA_OPTION} -k "
                "{INPUT_BASENAME}",
            format_with = {
                "IN_SUB_DIR": sub_dir,
                "OUT_PREFIX": out_prefix,
                "EXTRA_OPTION": use_pool_bundle_option
            },
            repeat_with = {
                "INPUT_BASENAME": utils.SpaceSeparatedList(input_basenames)
            }
        )
    ]

    # Generate res_index file
    # Exclude the deprecated locale variants and root; see ICU-20628. This
    # could be data-driven, but we do not want to perform I/O in this script
    # (for example, we do not want to read from an XML file).
    excluded_locales = {
        "ja_JP_TRADITIONAL",
        "th_TH_TRADITIONAL",
        "de_",
        "de__PHONEBOOK",
        "es_",
        "es__TRADITIONAL",
        "root",
    }
    # Put alias locales in a separate structure; see ICU-20627
    dependency_data = io.read_locale_deps(sub_dir)
    if "aliases" in dependency_data:
        alias_locales = set(dependency_data["aliases"].keys())
    else:
        alias_locales = set()
    alias_files = []
    installed_files = []
    for f in input_files:
        file_stem = IndexRequest.locale_file_stem(f)
        if file_stem in excluded_locales:
            continue
        destination = alias_files if file_stem in alias_locales else installed_files
        destination.append(f)
    cldr_version = dependency_data["cldrVersion"] if sub_dir == "locales" else None
    index_file_txt = TmpFile("{IN_SUB_DIR}/{INDEX_NAME}.txt".format(
        IN_SUB_DIR = sub_dir,
        **common_vars
    ))
    index_res_file = OutFile("{OUT_PREFIX}{INDEX_NAME}.res".format(
        OUT_PREFIX = out_prefix,
        **common_vars
    ))
    index_file_target_name = "%s_index_txt" % sub_dir
    requests += [
        IndexRequest(
            name = index_file_target_name,
            category = category,
            installed_files = installed_files,
            alias_files = alias_files,
            txt_file = index_file_txt,
            output_file = index_res_file,
            cldr_version = cldr_version,
            args = "-s {TMP_DIR}/{IN_SUB_DIR} -d {OUT_DIR}/{OUT_PREFIX} -i {OUT_DIR} "
                "-k "
                "{INDEX_NAME}.txt",
            format_with = {
                "IN_SUB_DIR": sub_dir,
                "OUT_PREFIX": out_prefix
            }
        )
    ]

    return requests