# Copyright (C) 2018 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html

# Python 2/3 Compatibility (ICU-20299)
# TODO(ICU-20301): Remove this.
from __future__ import print_function

from icutools.databuilder import *
from icutools.databuilder import utils
from icutools.databuilder.request_types import *

import os
import sys


def generate(config, io, common_vars):
    """Return the full list of build requests for the data tree.

    Collects the requests produced by every ``generate_*`` helper in this
    file, then the resource-bundle trees built by ``generate_tree``, and
    finally a ``ListRequest`` named ``icudata_list``.

    Args:
        config: Build configuration; this function reads
            ``config.use_pool_bundle`` and passes ``config`` on to helpers.
        io: Object providing ``glob(pattern)``; passed on to helpers.
        common_vars: Mapping of common format variables; passed on to helpers.

    Returns:
        A list of request objects.

    Raises:
        SystemExit: With status 1 if ``io.glob("misc/*")`` finds nothing.
    """
    requests = []

    if not io.glob("misc/*"):
        print("Error: Cannot find data directory; please specify --src_dir", file=sys.stderr)
        # sys.exit rather than the site-provided exit(), which is not
        # guaranteed to exist (e.g. when run with "python -S").
        sys.exit(1)

    requests += generate_cnvalias(config, io, common_vars)
    requests += generate_ulayout(config, io, common_vars)
    requests += generate_confusables(config, io, common_vars)
    requests += generate_conversion_mappings(config, io, common_vars)
    requests += generate_brkitr_brk(config, io, common_vars)
    requests += generate_stringprep(config, io, common_vars)
    requests += generate_brkitr_dictionaries(config, io, common_vars)
    requests += generate_normalization(config, io, common_vars)
    requests += generate_coll_ucadata(config, io, common_vars)
    requests += generate_full_unicore_data(config, io, common_vars)
    requests += generate_unames(config, io, common_vars)
    requests += generate_misc(config, io, common_vars)
    requests += generate_curr_supplemental(config, io, common_vars)
    requests += generate_zone_supplemental(config, io, common_vars)
    requests += generate_translit(config, io, common_vars)

    # Res Tree Files
    # (input dirname, output dirname, use pool file, dep files)
    requests += generate_tree(config, io, common_vars,
        "locales",
        None,
        config.use_pool_bundle,
        [])

    requests += generate_tree(config, io, common_vars,
        "curr",
        "curr",
        config.use_pool_bundle,
        [])

    requests += generate_tree(config, io, common_vars,
        "lang",
        "lang",
        config.use_pool_bundle,
        [])

    requests += generate_tree(config, io, common_vars,
        "region",
        "region",
        config.use_pool_bundle,
        [])

    requests += generate_tree(config, io, common_vars,
        "zone",
        "zone",
        config.use_pool_bundle,
        [])

    requests += generate_tree(config, io, common_vars,
        "unit",
        "unit",
        config.use_pool_bundle,
        [])

    requests += generate_tree(config, io, common_vars,
        "coll",
        "coll",
        # Never use pool bundle for coll, brkitr, or rbnf
        False,
        # Depends on timezoneTypes.res and keyTypeData.res.
        # TODO: We should not need this dependency to build collation.
        # TODO: Bake keyTypeData.res into the common library?
        [DepTarget("coll_ucadata"), DepTarget("misc_res"), InFile("unidata/UCARules.txt")])

    requests += generate_tree(config, io, common_vars,
        "brkitr",
        "brkitr",
        # Never use pool bundle for coll, brkitr, or rbnf
        False,
        [DepTarget("brkitr_brk"), DepTarget("dictionaries")])

    requests += generate_tree(config, io, common_vars,
        "rbnf",
        "rbnf",
        # Never use pool bundle for coll, brkitr, or rbnf
        False,
        [])

    requests += [
        ListRequest(
            name = "icudata_list",
            variable_name = "icudata_all_output_files",
            output_file = TmpFile("icudata.lst"),
            include_tmp = False
        )
    ]

    return requests


def generate_cnvalias(config, io, common_vars):
    """Return the request that runs gencnval on mappings/convrtrs.txt.

    The single output file is cnvalias.icu.
    """
    # UConv Name Aliases
    input_file = InFile("mappings/convrtrs.txt")
    output_file = OutFile("cnvalias.icu")
    return [
        SingleExecutionRequest(
            name = "cnvalias",
            category = "cnvalias",
            dep_targets = [],
            input_files = [input_file],
            output_files = [output_file],
            tool = IcuTool("gencnval"),
            args = "-s {IN_DIR} -d {OUT_DIR} "
                "{INPUT_FILES[0]}",
            format_with = {}
        )
    ]


def generate_confusables(config, io, common_vars):
    """Return the request that runs gencfu to produce confusables.cfu.

    Inputs are unidata/confusables.txt and
    unidata/confusablesWholeScript.txt; the request depends on the
    "cnvalias" target.
    """
    # CONFUSABLES
    txt1 = InFile("unidata/confusables.txt")
    txt2 = InFile("unidata/confusablesWholeScript.txt")
    cfu = OutFile("confusables.cfu")
    return [
        SingleExecutionRequest(
            name = "confusables",
            category = "confusables",
            dep_targets = [DepTarget("cnvalias")],
            input_files = [txt1, txt2],
            output_files = [cfu],
            tool = IcuTool("gencfu"),
            args = "-d {OUT_DIR} -i {OUT_DIR} "
                "-c -r {IN_DIR}/{INPUT_FILES[0]} -w {IN_DIR}/{INPUT_FILES[1]} "
                "-o {OUTPUT_FILES[0]}",
            format_with = {}
        )
    ]


def generate_conversion_mappings(config, io, common_vars):
    """Return the makeconv request for every file matching mappings/*.ucm.

    Each input "mappings/<name>.ucm" maps to the output "<name>.cnv".
    """
    # UConv Conversion Table Files
    input_files = [InFile(filename) for filename in io.glob("mappings/*.ucm")]
    # [9:-4] strips the leading "mappings/" (9 chars) and the trailing ".ucm".
    output_files = [OutFile("%s.cnv" % v.filename[9:-4]) for v in input_files]
    # TODO: handle BUILD_SPECIAL_CNV_FILES? Means to add --ignore-siso-check flag to makeconv
    return [
        RepeatedOrSingleExecutionRequest(
            name = "conversion_mappings",
            category = "conversion_mappings",
            dep_targets = [],
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("makeconv"),
            args = "-s {IN_DIR} -d {OUT_DIR} -c {INPUT_FILE_PLACEHOLDER}",
            format_with = {},
            repeat_with = {
                "INPUT_FILE_PLACEHOLDER": utils.SpaceSeparatedList(v.filename for v in input_files)
            }
        )
    ]


def generate_brkitr_brk(config, io, common_vars):
    """Return the genbrk request for every file matching brkitr/rules/*.txt.

    Each input "brkitr/rules/<name>.txt" maps to the output
    "brkitr/<name>.brk"; the request depends on the "cnvalias" and
    "ulayout" targets.
    """
    # BRK Files
    input_files = [InFile(filename) for filename in io.glob("brkitr/rules/*.txt")]
    # [13:-4] strips the leading "brkitr/rules/" (13 chars) and the trailing ".txt".
    output_files = [OutFile("brkitr/%s.brk" % v.filename[13:-4]) for v in input_files]
    return [
        RepeatedExecutionRequest(
            name = "brkitr_brk",
            category = "brkitr_rules",
            dep_targets = [DepTarget("cnvalias"), DepTarget("ulayout")],
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("genbrk"),
            args = "-d {OUT_DIR} -i {OUT_DIR} "
                "-c -r {IN_DIR}/{INPUT_FILE} "
                "-o {OUTPUT_FILE}",
            format_with = {},
            repeat_with = {}
        )
    ]


def generate_stringprep(config, io, common_vars):
    """Return the gensprep request for every file matching sprep/*.txt.

    Each input "sprep/<name>.txt" maps to the output "<name>.spp", and
    <name> is also passed to the tool as the bundle name.
    """
    # SPP FILES
    input_files = [InFile(filename) for filename in io.glob("sprep/*.txt")]
    # [6:-4] strips the leading "sprep/" (6 chars) and the trailing ".txt".
    bundle_names = [v.filename[6:-4] for v in input_files]
    output_files = [OutFile("%s.spp" % bundle_name) for bundle_name in bundle_names]
    return [
        RepeatedExecutionRequest(
            name = "stringprep",
            category = "stringprep",
            dep_targets = [InFile("unidata/NormalizationCorrections.txt")],
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("gensprep"),
            args = "-s {IN_DIR}/sprep -d {OUT_DIR} -i {OUT_DIR} "
                "-b {BUNDLE_NAME} -m {IN_DIR}/unidata -u 3.2.0 {BUNDLE_NAME}.txt",
            format_with = {},
            repeat_with = {
                "BUNDLE_NAME": bundle_names
            }
        )
    ]


def generate_brkitr_dictionaries(config, io, common_vars):
    """Return the gendict request for brkitr/dictionaries/*.txt.

    Each input "brkitr/dictionaries/<name>.txt" maps to the output
    "brkitr/<name>.dict".

    Raises:
        KeyError: If a globbed dictionary file has no entry in the
            per-file options table below.
    """
    # Dict Files
    input_files = [InFile(filename) for filename in io.glob("brkitr/dictionaries/*.txt")]
    # [20:-4] strips the leading "brkitr/dictionaries/" (20 chars) and the trailing ".txt".
    output_files = [OutFile("brkitr/%s.dict" % v.filename[20:-4]) for v in input_files]
    # Per-file gendict options, keyed by input filename.
    extra_options_map = {
        "brkitr/dictionaries/burmesedict.txt": "--bytes --transform offset-0x1000",
        "brkitr/dictionaries/cjdict.txt": "--uchars",
        "brkitr/dictionaries/khmerdict.txt": "--bytes --transform offset-0x1780",
        "brkitr/dictionaries/laodict.txt": "--bytes --transform offset-0x0e80",
        "brkitr/dictionaries/thaidict.txt": "--bytes --transform offset-0x0e00"
    }
    extra_optionses = [extra_options_map[v.filename] for v in input_files]
    return [
        RepeatedExecutionRequest(
            name = "dictionaries",
            category = "brkitr_dictionaries",
            dep_targets = [],
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("gendict"),
            args = "-i {OUT_DIR} "
                "-c {EXTRA_OPTIONS} "
                "{IN_DIR}/{INPUT_FILE} {OUT_DIR}/{OUTPUT_FILE}",
            format_with = {},
            repeat_with = {
                "EXTRA_OPTIONS": extra_optionses
            }
        )
    ]


def generate_normalization(config, io, common_vars):
    """Return the icupkg request for in/*.nrm, excluding in/nfc.nrm.

    Each input "in/<name>.nrm" maps to the output "<name>.nrm".

    Raises:
        ValueError: If in/nfc.nrm is not among the globbed files
            (raised by ``list.remove``).
    """
    # NRM Files
    input_files = [InFile(filename) for filename in io.glob("in/*.nrm")]
    # nfc.nrm is pre-compiled into C++; see generate_full_unicore_data
    input_files.remove(InFile("in/nfc.nrm"))
    # [3:] strips the leading "in/" (3 chars).
    output_files = [OutFile(v.filename[3:]) for v in input_files]
    return [
        RepeatedExecutionRequest(
            name = "normalization",
            category = "normalization",
            dep_targets = [],
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("icupkg"),
            args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILE} {OUT_DIR}/{OUTPUT_FILE}",
            format_with = {},
            repeat_with = {}
        )
    ]


def generate_coll_ucadata(config, io, common_vars):
    """Return the icupkg request that produces coll/ucadata.icu.

    The input file name is chosen by ``config.coll_han_type``:
    "in/coll/ucadata-<coll_han_type>.icu".
    """
    # Collation Dependency File (ucadata.icu)
    input_file = InFile("in/coll/ucadata-%s.icu" % config.coll_han_type)
    output_file = OutFile("coll/ucadata.icu")
    return [
        SingleExecutionRequest(
            name = "coll_ucadata",
            category = "coll_ucadata",
            dep_targets = [],
            input_files = [input_file],
            output_files = [output_file],
            tool = IcuTool("icupkg"),
            args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}",
            format_with = {}
        )
    ]


def generate_full_unicore_data(config, io, common_vars):
    """Return the icupkg request for the core Unicode property files.

    Returns an empty list unless ``config.include_uni_core_data`` is true.
    """
    # The core Unicode properties files (pnames.icu, uprops.icu, ucase.icu, ubidi.icu)
    # are hardcoded in the common DLL and therefore not included in the data package any more.
    # They are not built by default but need to be built for ICU4J data,
    # both in the .jar and in the .dat file (if ICU4J uses the .dat file).
    # See ICU-4497.
    if not config.include_uni_core_data:
        return []

    basenames = [
        "pnames.icu",
        "uprops.icu",
        "ucase.icu",
        "ubidi.icu",
        "nfc.nrm"
    ]
    input_files = [InFile("in/%s" % bn) for bn in basenames]
    output_files = [OutFile(bn) for bn in basenames]
    return [
        RepeatedExecutionRequest(
            name = "unicore",
            category = "unicore",
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("icupkg"),
            args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILE} {OUT_DIR}/{OUTPUT_FILE}"
        )
    ]


def generate_unames(config, io, common_vars):
    """Return the icupkg request that turns in/unames.icu into unames.icu."""
    # Unicode Character Names
    input_file = InFile("in/unames.icu")
    output_file = OutFile("unames.icu")
    return [
        SingleExecutionRequest(
            name = "unames",
            category = "unames",
            dep_targets = [],
            input_files = [input_file],
            output_files = [output_file],
            tool = IcuTool("icupkg"),
            args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}",
            format_with = {}
        )
    ]


def generate_ulayout(config, io, common_vars):
    """Return the icupkg request that turns in/ulayout.icu into ulayout.icu.

    The request name and category are both "ulayout".
    """
    # Unicode text layout properties
    basename = "ulayout"
    input_file = InFile("in/%s.icu" % basename)
    output_file = OutFile("%s.icu" % basename)
    return [
        SingleExecutionRequest(
            name = basename,
            category = basename,
            dep_targets = [],
            input_files = [input_file],
            output_files = [output_file],
            tool = IcuTool("icupkg"),
            args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}",
            format_with = {}
        )
    ]


def generate_misc(config, io, common_vars):
    """Return the genrb request for every file matching misc/*.txt.

    Each input "misc/<name>.txt" maps to the output "<name>.res"; the
    request depends on the "cnvalias" target.
    """
    # Misc Data Res Files
    input_files = [InFile(filename) for filename in io.glob("misc/*.txt")]
    # [5:] strips the leading "misc/" (5 chars); [:-4] strips ".txt".
    input_basenames = [v.filename[5:] for v in input_files]
    output_files = [OutFile("%s.res" % v[:-4]) for v in input_basenames]
    return [
        RepeatedExecutionRequest(
            name = "misc_res",
            category = "misc",
            dep_targets = [DepTarget("cnvalias")], # ICU-21175
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("genrb"),
            args = "-s {IN_DIR}/misc -d {OUT_DIR} -i {OUT_DIR} "
                "-k -q "
                "{INPUT_BASENAME}",
            format_with = {},
            repeat_with = {
                "INPUT_BASENAME": input_basenames
            }
        )
    ]


def generate_curr_supplemental(config, io, common_vars):
    """Return the genrb request that builds curr/supplementalData.res."""
    # Currency Supplemental Res File
    input_file = InFile("curr/supplementalData.txt")
    input_basename = "supplementalData.txt"
    output_file = OutFile("curr/supplementalData.res")
    return [
        SingleExecutionRequest(
            name = "curr_supplemental_res",
            category = "curr_supplemental",
            dep_targets = [],
            input_files = [input_file],
            output_files = [output_file],
            tool = IcuTool("genrb"),
            args = "-s {IN_DIR}/curr -d {OUT_DIR}/curr -i {OUT_DIR} "
                "-k "
                "{INPUT_BASENAME}",
            format_with = {
                "INPUT_BASENAME": input_basename
            }
        )
    ]


def generate_zone_supplemental(config, io, common_vars):
    """Return the genrb request that builds zone/tzdbNames.res."""
    # tzdbNames Res File
    input_file = InFile("zone/tzdbNames.txt")
    input_basename = "tzdbNames.txt"
    output_file = OutFile("zone/tzdbNames.res")
    return [
        SingleExecutionRequest(
            name = "zone_supplemental_res",
            category = "zone_supplemental",
            dep_targets = [],
            input_files = [input_file],
            output_files = [output_file],
            tool = IcuTool("genrb"),
            args = "-s {IN_DIR}/zone -d {OUT_DIR}/zone -i {OUT_DIR} "
                "-k "
                "{INPUT_BASENAME}",
            format_with = {
                "INPUT_BASENAME": input_basename
            }
        )
    ]


def generate_translit(config, io, common_vars):
    """Return the genrb request for the transliterator resource bundles.

    Only root.txt, en.txt and el.txt under translit/ are inputs; every
    other file matching translit/*.txt is passed as a dependency, in
    sorted order.
    """
    input_files = [
        InFile("translit/root.txt"),
        InFile("translit/en.txt"),
        InFile("translit/el.txt")
    ]
    all_files = set(InFile(filename) for filename in io.glob("translit/*.txt"))
    dep_files = sorted(all_files - set(input_files))
    # [9:] strips the leading "translit/" (9 chars); [:-4] strips ".txt".
    input_basenames = [v.filename[9:] for v in input_files]
    output_files = [
        OutFile("translit/%s.res" % v[:-4])
        for v in input_basenames
    ]
    return [
        RepeatedOrSingleExecutionRequest(
            name = "translit_res",
            category = "translit",
            dep_targets = dep_files,
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("genrb"),
            args = "-s {IN_DIR}/translit -d {OUT_DIR}/translit -i {OUT_DIR} "
                "-k "
                "{INPUT_BASENAME}",
            format_with = {
            },
            repeat_with = {
                "INPUT_BASENAME": utils.SpaceSeparatedList(input_basenames)
            }
        )
    ]


def generate_tree(
        config,
        io,
        common_vars,
        sub_dir,
        out_sub_dir,
        use_pool_bundle,
        dep_targets):
    """Return the requests that build one resource-bundle tree.

    The result holds, in order: an optional pool-bundle request (only when
    ``use_pool_bundle`` is true), the genrb request for the tree's .res
    files, and an ``IndexRequest`` for the tree's index file.

    Args:
        config: Build configuration; not read directly by this function.
        io: Object providing ``glob(pattern)`` and
            ``read_locale_deps(sub_dir)``.
        common_vars: Mapping of format variables. It is expanded into
            ``str.format`` calls that use ``{INDEX_NAME}`` and, when the
            pool bundle is used, ``{OUT_DIR}``.
        sub_dir: Input directory name; "<sub_dir>/*.txt" is globbed.
        out_sub_dir: Output directory name, or a false value (e.g. None)
            to write outputs without a directory prefix.
        use_pool_bundle: Whether to write and then use a pool bundle.
        dep_targets: List of dependencies for the generated requests; the
            caller's list is not modified.

    Returns:
        A list of request objects.
    """
    requests = []
    category = "%s_tree" % sub_dir
    out_prefix = "%s/" % out_sub_dir if out_sub_dir else ""
    input_files = [InFile(filename) for filename in io.glob("%s/*.txt" % sub_dir)]
    # These two files are built by their own dedicated requests
    # (generate_curr_supplemental / generate_zone_supplemental).
    if sub_dir == "curr":
        input_files.remove(InFile("curr/supplementalData.txt"))
    if sub_dir == "zone":
        input_files.remove(InFile("zone/tzdbNames.txt"))
    # Strip the leading "<sub_dir>/" from each filename.
    input_basenames = [v.filename[len(sub_dir)+1:] for v in input_files]
    output_files = [
        OutFile("%s%s.res" % (out_prefix, v[:-4]))
        for v in input_basenames
    ]

    # Generate Pool Bundle
    if use_pool_bundle:
        input_pool_files = [OutFile("%spool.res" % out_prefix)]
        pool_target_name = "%s_pool_write" % sub_dir
        use_pool_bundle_option = "--usePoolBundle {OUT_DIR}/{OUT_PREFIX}".format(
            OUT_PREFIX = out_prefix,
            **common_vars
        )
        requests += [
            SingleExecutionRequest(
                name = pool_target_name,
                category = category,
                dep_targets = dep_targets,
                input_files = input_files,
                output_files = input_pool_files,
                tool = IcuTool("genrb"),
                args = "-s {IN_DIR}/{IN_SUB_DIR} -d {OUT_DIR}/{OUT_PREFIX} -i {OUT_DIR} "
                    "--writePoolBundle -k "
                    "{INPUT_BASENAMES_SPACED}",
                format_with = {
                    "IN_SUB_DIR": sub_dir,
                    "OUT_PREFIX": out_prefix,
                    "INPUT_BASENAMES_SPACED": utils.SpaceSeparatedList(input_basenames)
                }
            ),
        ]
        # Build a new list so the caller's dep_targets is left untouched.
        dep_targets = dep_targets + [DepTarget(pool_target_name)]
    else:
        use_pool_bundle_option = ""

    # Generate Res File Tree
    requests += [
        RepeatedOrSingleExecutionRequest(
            name = "%s_res" % sub_dir,
            category = category,
            dep_targets = dep_targets,
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("genrb"),
            args = "-s {IN_DIR}/{IN_SUB_DIR} -d {OUT_DIR}/{OUT_PREFIX} -i {OUT_DIR} "
                "{EXTRA_OPTION} -k "
                "{INPUT_BASENAME}",
            format_with = {
                "IN_SUB_DIR": sub_dir,
                "OUT_PREFIX": out_prefix,
                "EXTRA_OPTION": use_pool_bundle_option
            },
            repeat_with = {
                "INPUT_BASENAME": utils.SpaceSeparatedList(input_basenames)
            }
        )
    ]

    # Generate res_index file
    # Exclude the deprecated locale variants and root; see ICU-20628. This
    # could be data-driven, but we do not want to perform I/O in this script
    # (for example, we do not want to read from an XML file).
    excluded_locales = {
        "ja_JP_TRADITIONAL",
        "th_TH_TRADITIONAL",
        "de_",
        "de__PHONEBOOK",
        "es_",
        "es__TRADITIONAL",
        "root",
    }
    # Put alias locales in a separate structure; see ICU-20627
    dependency_data = io.read_locale_deps(sub_dir)
    if "aliases" in dependency_data:
        alias_locales = set(dependency_data["aliases"].keys())
    else:
        alias_locales = set()
    alias_files = []
    installed_files = []
    for f in input_files:
        file_stem = IndexRequest.locale_file_stem(f)
        if file_stem in excluded_locales:
            continue
        destination = alias_files if file_stem in alias_locales else installed_files
        destination.append(f)
    # Only the "locales" tree passes a CLDR version to the index request.
    cldr_version = dependency_data["cldrVersion"] if sub_dir == "locales" else None
    index_file_txt = TmpFile("{IN_SUB_DIR}/{INDEX_NAME}.txt".format(
        IN_SUB_DIR = sub_dir,
        **common_vars
    ))
    index_res_file = OutFile("{OUT_PREFIX}{INDEX_NAME}.res".format(
        OUT_PREFIX = out_prefix,
        **common_vars
    ))
    index_file_target_name = "%s_index_txt" % sub_dir
    requests += [
        IndexRequest(
            name = index_file_target_name,
            category = category,
            installed_files = installed_files,
            alias_files = alias_files,
            txt_file = index_file_txt,
            output_file = index_res_file,
            cldr_version = cldr_version,
            args = "-s {TMP_DIR}/{IN_SUB_DIR} -d {OUT_DIR}/{OUT_PREFIX} -i {OUT_DIR} "
                "-k "
                "{INDEX_NAME}.txt",
            format_with = {
                "IN_SUB_DIR": sub_dir,
                "OUT_PREFIX": out_prefix
            }
        )
    ]

    return requests