# Copyright (C) 2018 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html

# Python 2/3 Compatibility (ICU-20299)
# TODO(ICU-20301): Remove this.
from __future__ import print_function

from icutools.databuilder import *
from icutools.databuilder import utils
from icutools.databuilder.request_types import *

import os
import sys


# Each generate_* function below returns a list of request objects (see
# icutools.databuilder.request_types) describing how to build one group of
# ICU data files; generate() collects all of those requests for the data build.
def generate(config, io, common_vars):
    requests = []

    if len(io.glob("misc/*")) == 0:
        print("Error: Cannot find data directory; please specify --src_dir", file=sys.stderr)
        sys.exit(1)

    requests += generate_cnvalias(config, io, common_vars)
    requests += generate_ulayout(config, io, common_vars)
    requests += generate_uemoji(config, io, common_vars)
    requests += generate_confusables(config, io, common_vars)
    requests += generate_conversion_mappings(config, io, common_vars)
    requests += generate_brkitr_brk(config, io, common_vars)
    requests += generate_brkitr_lstm(config, io, common_vars)
    requests += generate_stringprep(config, io, common_vars)
    requests += generate_brkitr_dictionaries(config, io, common_vars)
    requests += generate_normalization(config, io, common_vars)
    requests += generate_coll_ucadata(config, io, common_vars)
    requests += generate_full_unicore_data(config, io, common_vars)
    requests += generate_unames(config, io, common_vars)
    requests += generate_misc(config, io, common_vars)
    requests += generate_curr_supplemental(config, io, common_vars)
    requests += generate_zone_supplemental(config, io, common_vars)
    requests += generate_translit(config, io, common_vars)

    # Res Tree Files
    # (input dirname, output dirname, use pool bundle, dep targets)
    requests += generate_tree(config, io, common_vars,
        "locales",
        None,
        config.use_pool_bundle,
        [])

    requests += generate_tree(config, io, common_vars,
        "curr",
        "curr",
        config.use_pool_bundle,
        [])

    requests += generate_tree(config, io, common_vars,
        "lang",
        "lang",
        config.use_pool_bundle,
        [])

    requests += generate_tree(config, io, common_vars,
        "region",
        "region",
        config.use_pool_bundle,
        [])

    requests += generate_tree(config, io, common_vars,
        "zone",
        "zone",
        config.use_pool_bundle,
        [])

    requests += generate_tree(config, io, common_vars,
        "unit",
        "unit",
        config.use_pool_bundle,
        [])

    requests += generate_tree(config, io, common_vars,
        "coll",
        "coll",
        # Never use pool bundle for coll, brkitr, or rbnf
        False,
        # Depends on timezoneTypes.res and keyTypeData.res.
        # TODO: We should not need this dependency to build collation.
        # TODO: Bake keyTypeData.res into the common library?
        [DepTarget("coll_ucadata"), DepTarget("misc_res"), InFile("unidata/UCARules.txt")])

    requests += generate_tree(config, io, common_vars,
        "brkitr",
        "brkitr",
        # Never use pool bundle for coll, brkitr, or rbnf
        False,
        [DepTarget("brkitr_brk"), DepTarget("dictionaries")])

    requests += generate_tree(config, io, common_vars,
        "rbnf",
        "rbnf",
        # Never use pool bundle for coll, brkitr, or rbnf
        False,
        [])

    requests += [
        ListRequest(
            name = "icudata_list",
            variable_name = "icudata_all_output_files",
            output_file = TmpFile("icudata.lst"),
            include_tmp = False
        )
    ]

    return requests


def generate_cnvalias(config, io, common_vars):
    # UConv Name Aliases
    input_file = InFile("mappings/convrtrs.txt")
    output_file = OutFile("cnvalias.icu")
    return [
        SingleExecutionRequest(
            name = "cnvalias",
            category = "cnvalias",
            dep_targets = [],
            input_files = [input_file],
            output_files = [output_file],
            tool = IcuTool("gencnval"),
            args = "-s {IN_DIR} -d {OUT_DIR} "
                "{INPUT_FILES[0]}",
            format_with = {}
        )
    ]


def generate_confusables(config, io, common_vars):
    # CONFUSABLES
    txt1 = InFile("unidata/confusables.txt")
    txt2 = InFile("unidata/confusablesWholeScript.txt")
    cfu = OutFile("confusables.cfu")
    return [
        SingleExecutionRequest(
            name = "confusables",
            category = "confusables",
            dep_targets = [DepTarget("cnvalias")],
            input_files = [txt1, txt2],
            output_files = [cfu],
            tool = IcuTool("gencfu"),
            args = "-d {OUT_DIR} -i {OUT_DIR} "
                "-c -r {IN_DIR}/{INPUT_FILES[0]} -w {IN_DIR}/{INPUT_FILES[1]} "
                "-o {OUTPUT_FILES[0]}",
            format_with = {}
        )
    ]


def generate_conversion_mappings(config, io, common_vars):
    # UConv Conversion Table Files
    input_files = [InFile(filename) for filename in io.glob("mappings/*.ucm")]
    output_files = [OutFile("%s.cnv" % v.filename[9:-4]) for v in input_files]
    # TODO: handle BUILD_SPECIAL_CNV_FILES? Means to add --ignore-siso-check flag to makeconv
    return [
        RepeatedOrSingleExecutionRequest(
            name = "conversion_mappings",
            category = "conversion_mappings",
            dep_targets = [],
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("makeconv"),
            # BEGIN android-changed
            # args = "-s {IN_DIR} -d {OUT_DIR} -c {INPUT_FILE_PLACEHOLDER}",
            args = "-s {IN_DIR} -d {OUT_DIR} -c --small {INPUT_FILE_PLACEHOLDER}",
            # END android-changed
            format_with = {},
            repeat_with = {
                "INPUT_FILE_PLACEHOLDER": utils.SpaceSeparatedList(file.filename for file in input_files)
            }
        )
    ]


def generate_brkitr_brk(config, io, common_vars):
    # BRK Files
    input_files = [InFile(filename) for filename in io.glob("brkitr/rules/*.txt")]
    output_files = [OutFile("brkitr/%s.brk" % v.filename[13:-4]) for v in input_files]
    return [
        RepeatedExecutionRequest(
            name = "brkitr_brk",
            category = "brkitr_rules",
            dep_targets = [
                DepTarget("cnvalias"),
                DepTarget("ulayout"), DepTarget("uemoji"), DepTarget("lstm_res")],
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("genbrk"),
            args = "-d {OUT_DIR} -i {OUT_DIR} "
                "-c -r {IN_DIR}/{INPUT_FILE} "
                "-o {OUTPUT_FILE}",
            format_with = {},
            repeat_with = {}
        )
    ]


def generate_stringprep(config, io, common_vars):
    # SPP FILES
    input_files = [InFile(filename) for filename in io.glob("sprep/*.txt")]
    output_files = [OutFile("%s.spp" % v.filename[6:-4]) for v in input_files]
    bundle_names = [v.filename[6:-4] for v in input_files]
    return [
        RepeatedExecutionRequest(
            name = "stringprep",
            category = "stringprep",
            dep_targets = [InFile("unidata/NormalizationCorrections.txt")],
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("gensprep"),
            args = "-s {IN_DIR}/sprep -d {OUT_DIR} -i {OUT_DIR} "
                "-b {BUNDLE_NAME} -m {IN_DIR}/unidata -u 3.2.0 {BUNDLE_NAME}.txt",
            format_with = {},
            repeat_with = {
                "BUNDLE_NAME": bundle_names
            }
        )
    ]


def generate_brkitr_dictionaries(config, io, common_vars):
    # Dict Files
    input_files = [InFile(filename) for filename in io.glob("brkitr/dictionaries/*.txt")]
    output_files = [OutFile("brkitr/%s.dict" % v.filename[20:-4]) for v in input_files]
    extra_options_map = {
        "brkitr/dictionaries/burmesedict.txt": "--bytes --transform offset-0x1000",
        "brkitr/dictionaries/cjdict.txt": "--uchars",
        "brkitr/dictionaries/khmerdict.txt": "--bytes --transform offset-0x1780",
        "brkitr/dictionaries/laodict.txt": "--bytes --transform offset-0x0e80",
        "brkitr/dictionaries/thaidict.txt": "--bytes --transform offset-0x0e00"
    }
    extra_optionses = [extra_options_map[v.filename] for v in input_files]
    return [
        RepeatedExecutionRequest(
            name = "dictionaries",
            category = "brkitr_dictionaries",
            dep_targets = [],
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("gendict"),
            args = "-i {OUT_DIR} "
                "-c {EXTRA_OPTIONS} "
                "{IN_DIR}/{INPUT_FILE} {OUT_DIR}/{OUTPUT_FILE}",
            format_with = {},
            repeat_with = {
                "EXTRA_OPTIONS": extra_optionses
            }
        )
    ]


def generate_normalization(config, io, common_vars):
    # NRM Files
    input_files = [InFile(filename) for filename in io.glob("in/*.nrm")]
    # nfc.nrm is pre-compiled into C++; see generate_full_unicore_data
    input_files.remove(InFile("in/nfc.nrm"))
    output_files = [OutFile(v.filename[3:]) for v in input_files]
    return [
        RepeatedExecutionRequest(
            name = "normalization",
            category = "normalization",
            dep_targets = [],
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("icupkg"),
            args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILE} {OUT_DIR}/{OUTPUT_FILE}",
            format_with = {},
            repeat_with = {}
        )
    ]


def generate_coll_ucadata(config, io, common_vars):
    # Collation Dependency File (ucadata.icu)
    input_file = InFile("in/coll/ucadata-%s.icu" % config.coll_han_type)
    output_file = OutFile("coll/ucadata.icu")
    return [
        SingleExecutionRequest(
            name = "coll_ucadata",
            category = "coll_ucadata",
            dep_targets = [],
            input_files = [input_file],
            output_files = [output_file],
            tool = IcuTool("icupkg"),
            args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}",
            format_with = {}
        )
    ]


def generate_full_unicore_data(config, io, common_vars):
    # The core Unicode properties files (pnames.icu, uprops.icu, ucase.icu, ubidi.icu)
    # are hardcoded in the common DLL and therefore not included in the data package any more.
    # They are not built by default but need to be built for ICU4J data,
    # both in the .jar and in the .dat file (if ICU4J uses the .dat file).
    # See ICU-4497.
    if not config.include_uni_core_data:
        return []

    basenames = [
        "pnames.icu",
        "uprops.icu",
        "ucase.icu",
        "ubidi.icu",
        "nfc.nrm"
    ]
    input_files = [InFile("in/%s" % bn) for bn in basenames]
    output_files = [OutFile(bn) for bn in basenames]
    return [
        RepeatedExecutionRequest(
            name = "unicore",
            category = "unicore",
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("icupkg"),
            args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILE} {OUT_DIR}/{OUTPUT_FILE}"
        )
    ]


def generate_unames(config, io, common_vars):
    # Unicode Character Names
    input_file = InFile("in/unames.icu")
    output_file = OutFile("unames.icu")
    return [
        SingleExecutionRequest(
            name = "unames",
            category = "unames",
            dep_targets = [],
            input_files = [input_file],
            output_files = [output_file],
            tool = IcuTool("icupkg"),
            args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}",
            format_with = {}
        )
    ]


def generate_ulayout(config, io, common_vars):
    # Unicode text layout properties
    basename = "ulayout"
    input_file = InFile("in/%s.icu" % basename)
    output_file = OutFile("%s.icu" % basename)
    return [
        SingleExecutionRequest(
            name = basename,
            category = basename,
            dep_targets = [],
            input_files = [input_file],
            output_files = [output_file],
            tool = IcuTool("icupkg"),
            args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}",
            format_with = {}
        )
    ]


def generate_uemoji(config, io, common_vars):
    # Unicode emoji properties
    basename = "uemoji"
    input_file = InFile("in/%s.icu" % basename)
    output_file = OutFile("%s.icu" % basename)
    return [
        SingleExecutionRequest(
            name = basename,
            category = basename,
            dep_targets = [],
            input_files = [input_file],
            output_files = [output_file],
            tool = IcuTool("icupkg"),
            args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}",
            format_with = {}
        )
    ]


def generate_misc(config, io, common_vars):
    # Misc Data Res Files
    input_files = [InFile(filename) for filename in io.glob("misc/*.txt")]
    input_basenames = [v.filename[5:] for v in input_files]
    output_files = [OutFile("%s.res" % v[:-4]) for v in input_basenames]
    return [
        RepeatedExecutionRequest(
            name = "misc_res",
            category = "misc",
            dep_targets = [DepTarget("cnvalias")], # ICU-21175
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("genrb"),
            args = "-s {IN_DIR}/misc -d {OUT_DIR} -i {OUT_DIR} "
                "-k -q "
                "{INPUT_BASENAME}",
            format_with = {},
            repeat_with = {
                "INPUT_BASENAME": input_basenames
            }
        )
    ]


def generate_curr_supplemental(config, io, common_vars):
    # Currency Supplemental Res File
    input_file = InFile("curr/supplementalData.txt")
    input_basename = "supplementalData.txt"
    output_file = OutFile("curr/supplementalData.res")
    return [
        SingleExecutionRequest(
            name = "curr_supplemental_res",
            category = "curr_supplemental",
            dep_targets = [],
            input_files = [input_file],
            output_files = [output_file],
            tool = IcuTool("genrb"),
            args = "-s {IN_DIR}/curr -d {OUT_DIR}/curr -i {OUT_DIR} "
                "-k "
                "{INPUT_BASENAME}",
            format_with = {
                "INPUT_BASENAME": input_basename
            }
        )
    ]


def generate_zone_supplemental(config, io, common_vars):
    # tzdbNames Res File
    input_file = InFile("zone/tzdbNames.txt")
    input_basename = "tzdbNames.txt"
    output_file = OutFile("zone/tzdbNames.res")
    return [
        SingleExecutionRequest(
            name = "zone_supplemental_res",
            category = "zone_supplemental",
            dep_targets = [],
            input_files = [input_file],
            output_files = [output_file],
            tool = IcuTool("genrb"),
            args = "-s {IN_DIR}/zone -d {OUT_DIR}/zone -i {OUT_DIR} "
                "-k "
                "{INPUT_BASENAME}",
            format_with = {
                "INPUT_BASENAME": input_basename
            }
        )
    ]


def generate_translit(config, io, common_vars):
    input_files = [
        InFile("translit/root.txt"),
        InFile("translit/en.txt"),
        InFile("translit/el.txt")
    ]
    dep_files = set(InFile(filename) for filename in io.glob("translit/*.txt"))
    dep_files -= set(input_files)
    dep_files = list(sorted(dep_files))
    input_basenames = [v.filename[9:] for v in input_files]
    output_files = [
        OutFile("translit/%s.res" % v[:-4])
        for v in input_basenames
    ]
    return [
        RepeatedOrSingleExecutionRequest(
            name = "translit_res",
            category = "translit",
            dep_targets = dep_files,
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("genrb"),
            args = "-s {IN_DIR}/translit -d {OUT_DIR}/translit -i {OUT_DIR} "
                "-k "
                "{INPUT_BASENAME}",
            format_with = {
            },
            repeat_with = {
                "INPUT_BASENAME": utils.SpaceSeparatedList(input_basenames)
            }
        )
    ]


def generate_brkitr_lstm(config, io, common_vars):
    input_files = [InFile(filename) for filename in io.glob("brkitr/lstm/*.txt")]
    input_basenames = [v.filename[12:] for v in input_files]
    output_files = [
        OutFile("brkitr/%s.res" % v[:-4])
        for v in input_basenames
    ]
    return [
        RepeatedOrSingleExecutionRequest(
            name = "lstm_res",
            category = "brkitr_lstm",
            dep_targets = [],
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("genrb"),
            args = "-s {IN_DIR}/brkitr/lstm -d {OUT_DIR}/brkitr -i {OUT_DIR} "
                "-k "
"{INPUT_BASENAME}", 504 format_with = { 505 }, 506 repeat_with = { 507 "INPUT_BASENAME": utils.SpaceSeparatedList(input_basenames) 508 } 509 ) 510 ] 511 512def generate_tree( 513 config, 514 io, 515 common_vars, 516 sub_dir, 517 out_sub_dir, 518 use_pool_bundle, 519 dep_targets): 520 requests = [] 521 category = "%s_tree" % sub_dir 522 out_prefix = "%s/" % out_sub_dir if out_sub_dir else "" 523 input_files = [InFile(filename) for filename in io.glob("%s/*.txt" % sub_dir)] 524 if sub_dir == "curr": 525 input_files.remove(InFile("curr/supplementalData.txt")) 526 if sub_dir == "zone": 527 input_files.remove(InFile("zone/tzdbNames.txt")) 528 input_basenames = [v.filename[len(sub_dir)+1:] for v in input_files] 529 output_files = [ 530 OutFile("%s%s.res" % (out_prefix, v[:-4])) 531 for v in input_basenames 532 ] 533 534 # Generate Pool Bundle 535 if use_pool_bundle: 536 input_pool_files = [OutFile("%spool.res" % out_prefix)] 537 pool_target_name = "%s_pool_write" % sub_dir 538 use_pool_bundle_option = "--usePoolBundle {OUT_DIR}/{OUT_PREFIX}".format( 539 OUT_PREFIX = out_prefix, 540 **common_vars 541 ) 542 requests += [ 543 SingleExecutionRequest( 544 name = pool_target_name, 545 category = category, 546 dep_targets = dep_targets, 547 input_files = input_files, 548 output_files = input_pool_files, 549 tool = IcuTool("genrb"), 550 args = "-s {IN_DIR}/{IN_SUB_DIR} -d {OUT_DIR}/{OUT_PREFIX} -i {OUT_DIR} " 551 "--writePoolBundle -k " 552 "{INPUT_BASENAMES_SPACED}", 553 format_with = { 554 "IN_SUB_DIR": sub_dir, 555 "OUT_PREFIX": out_prefix, 556 "INPUT_BASENAMES_SPACED": utils.SpaceSeparatedList(input_basenames) 557 } 558 ), 559 ] 560 dep_targets = dep_targets + [DepTarget(pool_target_name)] 561 else: 562 use_pool_bundle_option = "" 563 564 # Generate Res File Tree 565 requests += [ 566 RepeatedOrSingleExecutionRequest( 567 name = "%s_res" % sub_dir, 568 category = category, 569 dep_targets = dep_targets, 570 input_files = input_files, 571 output_files = output_files, 572 tool = IcuTool("genrb"), 573 # BEGIN android-changed 574 args = "-s {IN_DIR}/{IN_SUB_DIR} -d {OUT_DIR}/{OUT_PREFIX} -i {OUT_DIR} " + 575 ("--omitCollationRules " if sub_dir == "coll" else "") + 576 "{EXTRA_OPTION} -k " 577 "{INPUT_BASENAME}", 578 # END android-changed 579 format_with = { 580 "IN_SUB_DIR": sub_dir, 581 "OUT_PREFIX": out_prefix, 582 "EXTRA_OPTION": use_pool_bundle_option 583 }, 584 repeat_with = { 585 "INPUT_BASENAME": utils.SpaceSeparatedList(input_basenames) 586 } 587 ) 588 ] 589 590 # Generate res_index file 591 # Exclude the deprecated locale variants and root; see ICU-20628. This 592 # could be data-driven, but we do not want to perform I/O in this script 593 # (for example, we do not want to read from an XML file). 
    excluded_locales = set([
        "ja_JP_TRADITIONAL",
        "th_TH_TRADITIONAL",
        "de_",
        "de__PHONEBOOK",
        "es_",
        "es__TRADITIONAL",
        "root",
    ])
    # Put alias locales in a separate structure; see ICU-20627
    dependency_data = io.read_locale_deps(sub_dir)
    if "aliases" in dependency_data:
        alias_locales = set(dependency_data["aliases"].keys())
    else:
        alias_locales = set()
    alias_files = []
    installed_files = []
    for f in input_files:
        file_stem = IndexRequest.locale_file_stem(f)
        if file_stem in excluded_locales:
            continue
        destination = alias_files if file_stem in alias_locales else installed_files
        destination.append(f)
    cldr_version = dependency_data["cldrVersion"] if sub_dir == "locales" else None
    index_file_txt = TmpFile("{IN_SUB_DIR}/{INDEX_NAME}.txt".format(
        IN_SUB_DIR = sub_dir,
        **common_vars
    ))
    index_res_file = OutFile("{OUT_PREFIX}{INDEX_NAME}.res".format(
        OUT_PREFIX = out_prefix,
        **common_vars
    ))
    index_file_target_name = "%s_index_txt" % sub_dir
    requests += [
        IndexRequest(
            name = index_file_target_name,
            category = category,
            installed_files = installed_files,
            alias_files = alias_files,
            txt_file = index_file_txt,
            output_file = index_res_file,
            cldr_version = cldr_version,
            args = "-s {TMP_DIR}/{IN_SUB_DIR} -d {OUT_DIR}/{OUT_PREFIX} -i {OUT_DIR} "
                "-k "
                "{INDEX_NAME}.txt",
            format_with = {
                "IN_SUB_DIR": sub_dir,
                "OUT_PREFIX": out_prefix
            }
        )
    ]

    return requests
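# ---------------------------------------------------------------------------
# Illustrative sketch only (commented out; not part of the build): a new data
# file is typically wired in by adding a generate_* function following the
# same request pattern as the functions above and appending its requests in
# generate(). The names "generate_example", "example", and "in/example.icu"
# below are hypothetical placeholders, not real ICU data files.
#
# def generate_example(config, io, common_vars):
#     input_file = InFile("in/example.icu")
#     output_file = OutFile("example.icu")
#     return [
#         SingleExecutionRequest(
#             name = "example",
#             category = "example",
#             dep_targets = [],
#             input_files = [input_file],
#             output_files = [output_file],
#             tool = IcuTool("icupkg"),
#             args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}",
#             format_with = {}
#         )
#     ]
# ---------------------------------------------------------------------------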