• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright (C) 2018 and later: Unicode, Inc. and others.
2# License & terms of use: http://www.unicode.org/copyright.html
3
4# Python 2/3 Compatibility (ICU-20299)
5# TODO(ICU-20301): Remove this.
6from __future__ import print_function
7
8from icutools.databuilder import *
9from icutools.databuilder import utils
10from icutools.databuilder.request_types import *
11
12import os
13import sys
14
15
def generate(config, io, common_vars):
    """Assemble every build request for the data package.

    Args:
        config: Build configuration. ``config.use_pool_bundle`` is read here;
            the object is also forwarded unchanged to each ``generate_*``
            helper.
        io: File accessor. ``io.glob`` is used here to verify that the data
            directory is reachable; the object is also forwarded to helpers.
        common_vars: Shared format variables, forwarded to the helpers.

    Returns:
        A list of request objects in build-rule order, ending with the
        ``icudata_list`` ListRequest.

    Exits the process with status 1 (after printing to stderr) when nothing
    matches ``misc/*``.
    """
    if not io.glob("misc/*"):
        print("Error: Cannot find data directory; please specify --src_dir", file=sys.stderr)
        # Use sys.exit instead of the exit() builtin: the latter is injected
        # by the site module and is not guaranteed to exist in every
        # interpreter invocation.
        sys.exit(1)

    requests = []

    # Rules that take no extra arguments, in the order they must be emitted.
    simple_generators = [
        generate_cnvalias,
        generate_ulayout,
        generate_uemoji,
        generate_confusables,
        generate_conversion_mappings,
        generate_brkitr_brk,
        generate_brkitr_lstm,
        generate_brkitr_adaboost,
        generate_stringprep,
        generate_brkitr_dictionaries,
        generate_normalization,
        generate_coll_ucadata,
        generate_full_unicore_data,
        generate_unames,
        generate_misc,
        generate_curr_supplemental,
        generate_zone_supplemental,
        generate_translit,
    ]
    for generator in simple_generators:
        requests += generator(config, io, common_vars)

    # Res Tree Files
    # (input dirname, output dirname, use pool bundle, dep targets)
    use_pool = config.use_pool_bundle
    tree_specs = [
        ("locales", None, use_pool, []),
        ("curr", "curr", use_pool, []),
        ("lang", "lang", use_pool, []),
        ("region", "region", use_pool, []),
        ("zone", "zone", use_pool, []),
        ("unit", "unit", use_pool, []),
        # Never use pool bundle for coll, brkitr, or rbnf
        (
            "coll",
            "coll",
            False,
            # Depends on timezoneTypes.res and keyTypeData.res.
            # TODO: We should not need this dependency to build collation.
            # TODO: Bake keyTypeData.res into the common library?
            [DepTarget("coll_ucadata"), DepTarget("misc_res"), InFile("unidata/UCARules.txt")],
        ),
        ("brkitr", "brkitr", False, [DepTarget("brkitr_brk"), DepTarget("dictionaries")]),
        ("rbnf", "rbnf", False, []),
    ]
    for sub_dir, out_sub_dir, use_pool_bundle, dep_targets in tree_specs:
        requests += generate_tree(
            config, io, common_vars,
            sub_dir,
            out_sub_dir,
            use_pool_bundle,
            dep_targets)

    requests += [
        ListRequest(
            name = "icudata_list",
            variable_name = "icudata_all_output_files",
            output_file = TmpFile("icudata.lst"),
            include_tmp = False
        )
    ]

    return requests
114
115
def generate_cnvalias(config, io, common_vars):
    """Return the request that builds cnvalias.icu (UConv name aliases).

    The request runs the gencnval tool on mappings/convrtrs.txt. None of the
    three parameters are read by this rule.
    """
    aliases_txt = InFile("mappings/convrtrs.txt")
    aliases_icu = OutFile("cnvalias.icu")
    request = SingleExecutionRequest(
        name="cnvalias",
        category="cnvalias",
        dep_targets=[],
        input_files=[aliases_txt],
        output_files=[aliases_icu],
        tool=IcuTool("gencnval"),
        args="-s {IN_DIR} -d {OUT_DIR} {INPUT_FILES[0]}",
        format_with={},
    )
    return [request]
133
134
def generate_confusables(config, io, common_vars):
    """Return the request that builds confusables.cfu.

    The request runs gencfu on the two confusables text files under unidata/
    and depends on the "cnvalias" target. None of the three parameters are
    read by this rule.
    """
    sources = [
        InFile("unidata/confusables.txt"),
        InFile("unidata/confusablesWholeScript.txt"),
    ]
    command_line = (
        "-d {OUT_DIR} -i {OUT_DIR} "
        "-c "
        "-r {IN_DIR}/{INPUT_FILES[0]} "
        "-w {IN_DIR}/{INPUT_FILES[1]} "
        "-o {OUTPUT_FILES[0]}"
    )
    request = SingleExecutionRequest(
        name="confusables",
        category="confusables",
        dep_targets=[DepTarget("cnvalias")],
        input_files=sources,
        output_files=[OutFile("confusables.cfu")],
        tool=IcuTool("gencfu"),
        args=command_line,
        format_with={},
    )
    return [request]
154
155
def generate_conversion_mappings(config, io, common_vars):
    """Return the request that builds the UConv conversion tables.

    Every file matching mappings/*.ucm becomes <stem>.cnv via makeconv.
    Only ``io.glob`` is used from the parameters.
    """
    # Strip the "mappings/" directory and the ".ucm" extension to get the stem.
    stem_start = len("mappings/")
    ext_len = len(".ucm")

    ucm_files = [InFile(path) for path in io.glob("mappings/*.ucm")]
    cnv_files = [
        OutFile("%s.cnv" % ucm.filename[stem_start:-ext_len])
        for ucm in ucm_files
    ]
    # TODO: handle BUILD_SPECIAL_CNV_FILES? Means to add --ignore-siso-check flag to makeconv
    request = RepeatedOrSingleExecutionRequest(
        name="conversion_mappings",
        category="conversion_mappings",
        dep_targets=[],
        input_files=ucm_files,
        output_files=cnv_files,
        tool=IcuTool("makeconv"),
        args="-s {IN_DIR} -d {OUT_DIR} -c {INPUT_FILE_PLACEHOLDER}",
        format_with={},
        repeat_with={
            "INPUT_FILE_PLACEHOLDER": utils.SpaceSeparatedList(
                ucm.filename for ucm in ucm_files
            ),
        },
    )
    return [request]
176
177
def generate_brkitr_brk(config, io, common_vars):
    """Return the request that compiles break-iterator rules into BRK files.

    Every file matching brkitr/rules/*.txt becomes brkitr/<stem>.brk via
    genbrk. Only ``io.glob`` is used from the parameters.
    """
    # Strip the "brkitr/rules/" directory and the ".txt" extension.
    stem_start = len("brkitr/rules/")
    ext_len = len(".txt")

    rule_files = [InFile(path) for path in io.glob("brkitr/rules/*.txt")]
    brk_files = [
        OutFile("brkitr/%s.brk" % rules.filename[stem_start:-ext_len])
        for rules in rule_files
    ]
    prerequisites = [
        DepTarget(target)
        for target in ("cnvalias", "ulayout", "uemoji", "lstm_res", "adaboost_res")
    ]
    request = RepeatedExecutionRequest(
        name="brkitr_brk",
        category="brkitr_rules",
        dep_targets=prerequisites,
        input_files=rule_files,
        output_files=brk_files,
        tool=IcuTool("genbrk"),
        args=(
            "-d {OUT_DIR} -i {OUT_DIR} "
            "-c -r {IN_DIR}/{INPUT_FILE} "
            "-o {OUTPUT_FILE}"
        ),
        format_with={},
        repeat_with={},
    )
    return [request]
199
200
def generate_stringprep(config, io, common_vars):
    """Return the request that builds the StringPrep (SPP) files.

    Every file matching sprep/*.txt becomes <bundle>.spp via gensprep, with
    the bundle name substituted per input. Only ``io.glob`` is used from the
    parameters.
    """
    # Strip the "sprep/" directory and the ".txt" extension.
    stem_start = len("sprep/")
    ext_len = len(".txt")

    profile_files = [InFile(path) for path in io.glob("sprep/*.txt")]
    bundle_names = [
        profile.filename[stem_start:-ext_len] for profile in profile_files
    ]
    spp_files = [OutFile("%s.spp" % bundle) for bundle in bundle_names]
    request = RepeatedExecutionRequest(
        name="stringprep",
        category="stringprep",
        dep_targets=[InFile("unidata/NormalizationCorrections.txt")],
        input_files=profile_files,
        output_files=spp_files,
        tool=IcuTool("gensprep"),
        args=(
            "-s {IN_DIR}/sprep -d {OUT_DIR} -i {OUT_DIR} "
            "-b {BUNDLE_NAME} -m {IN_DIR}/unidata -u 3.2.0 {BUNDLE_NAME}.txt"
        ),
        format_with={},
        repeat_with={"BUNDLE_NAME": bundle_names},
    )
    return [request]
222
223
def generate_brkitr_dictionaries(config, io, common_vars):
    """Return the request that builds the break-iterator dictionary files.

    Every file matching brkitr/dictionaries/*.txt becomes brkitr/<stem>.dict
    via gendict. Each input must have an entry in the per-file option table
    below; an unlisted file raises KeyError. Only ``io.glob`` is used from
    the parameters.
    """
    # Per-dictionary gendict options, keyed by the input file path.
    options_by_file = {
        "brkitr/dictionaries/burmesedict.txt": "--bytes --transform offset-0x1000",
        "brkitr/dictionaries/cjdict.txt": "--uchars",
        "brkitr/dictionaries/khmerdict.txt": "--bytes --transform offset-0x1780",
        "brkitr/dictionaries/laodict.txt": "--bytes --transform offset-0x0e80",
        "brkitr/dictionaries/thaidict.txt": "--bytes --transform offset-0x0e00",
    }

    # Strip the "brkitr/dictionaries/" directory and the ".txt" extension.
    stem_start = len("brkitr/dictionaries/")
    ext_len = len(".txt")

    dict_sources = [InFile(path) for path in io.glob("brkitr/dictionaries/*.txt")]
    dict_outputs = [
        OutFile("brkitr/%s.dict" % src.filename[stem_start:-ext_len])
        for src in dict_sources
    ]
    per_file_options = [options_by_file[src.filename] for src in dict_sources]

    request = RepeatedExecutionRequest(
        name="dictionaries",
        category="brkitr_dictionaries",
        dep_targets=[],
        input_files=dict_sources,
        output_files=dict_outputs,
        tool=IcuTool("gendict"),
        args=(
            "-i {OUT_DIR} "
            "-c {EXTRA_OPTIONS} "
            "{IN_DIR}/{INPUT_FILE} {OUT_DIR}/{OUTPUT_FILE}"
        ),
        format_with={},
        repeat_with={"EXTRA_OPTIONS": per_file_options},
    )
    return [request]
253
254
def generate_normalization(config, io, common_vars):
    """Return the request that packages the normalization (NRM) files.

    Every file matching in/*.nrm except in/nfc.nrm is run through icupkg and
    written under the same base name. ``list.remove`` raises ValueError when
    in/nfc.nrm is not among the globbed files. Only ``io.glob`` is used from
    the parameters.
    """
    nrm_sources = [InFile(path) for path in io.glob("in/*.nrm")]
    # nfc.nrm is pre-compiled into C++; see generate_full_unicore_data
    nrm_sources.remove(InFile("in/nfc.nrm"))

    # Drop the leading "in/" directory to get the output name.
    dir_len = len("in/")
    nrm_outputs = [OutFile(src.filename[dir_len:]) for src in nrm_sources]

    request = RepeatedExecutionRequest(
        name="normalization",
        category="normalization",
        dep_targets=[],
        input_files=nrm_sources,
        output_files=nrm_outputs,
        tool=IcuTool("icupkg"),
        args="-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILE} {OUT_DIR}/{OUTPUT_FILE}",
        format_with={},
        repeat_with={},
    )
    return [request]
274
275
def generate_coll_ucadata(config, io, common_vars):
    """Return the request that builds the collation dependency file.

    The input is in/coll/ucadata-<config.coll_han_type>.icu and the output is
    coll/ucadata.icu, produced with icupkg. ``io`` and ``common_vars`` are
    not read by this rule.
    """
    source_name = "in/coll/ucadata-%s.icu" % config.coll_han_type
    request = SingleExecutionRequest(
        name="coll_ucadata",
        category="coll_ucadata",
        dep_targets=[],
        input_files=[InFile(source_name)],
        output_files=[OutFile("coll/ucadata.icu")],
        tool=IcuTool("icupkg"),
        args="-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}",
        format_with={},
    )
    return [request]
292
293
def generate_full_unicore_data(config, io, common_vars):
    """Return the request for the core Unicode property files, if enabled.

    The core Unicode properties files (pnames.icu, uprops.icu, ucase.icu,
    ubidi.icu) are hardcoded in the common DLL and therefore not included in
    the data package any more. They are not built by default but need to be
    built for ICU4J data, both in the .jar and in the .dat file (if ICU4J
    uses the .dat file). See ICU-4497.

    Returns an empty list unless ``config.include_uni_core_data`` is truthy.
    ``io`` and ``common_vars`` are not read by this rule.
    """
    if not config.include_uni_core_data:
        return []

    core_names = ("pnames.icu", "uprops.icu", "ucase.icu", "ubidi.icu", "nfc.nrm")
    core_sources = [InFile("in/%s" % name) for name in core_names]
    core_outputs = [OutFile(name) for name in core_names]

    # Only these keyword arguments are passed; the rest are left to the
    # request class's own defaults, exactly as before.
    request = RepeatedExecutionRequest(
        name="unicore",
        category="unicore",
        input_files=core_sources,
        output_files=core_outputs,
        tool=IcuTool("icupkg"),
        args="-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILE} {OUT_DIR}/{OUTPUT_FILE}"
    )
    return [request]
322
323
def generate_unames(config, io, common_vars):
    """Return the request that packages unames.icu (Unicode character names).

    The request runs icupkg on in/unames.icu. None of the three parameters
    are read by this rule.
    """
    request = SingleExecutionRequest(
        name="unames",
        category="unames",
        dep_targets=[],
        input_files=[InFile("in/unames.icu")],
        output_files=[OutFile("unames.icu")],
        tool=IcuTool("icupkg"),
        args="-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}",
        format_with={},
    )
    return [request]
340
341
def generate_ulayout(config, io, common_vars):
    """Return the request that packages ulayout.icu (text layout properties).

    The request runs icupkg on in/ulayout.icu; its name and category are both
    "ulayout". None of the three parameters are read by this rule.
    """
    target = "ulayout"
    source = InFile("in/{0}.icu".format(target))
    packaged = OutFile("{0}.icu".format(target))
    request = SingleExecutionRequest(
        name=target,
        category=target,
        dep_targets=[],
        input_files=[source],
        output_files=[packaged],
        tool=IcuTool("icupkg"),
        args="-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}",
        format_with={},
    )
    return [request]
359
360
def generate_uemoji(config, io, common_vars):
    """Return the request that packages uemoji.icu (Unicode emoji properties).

    The request runs icupkg on in/uemoji.icu; its name and category are both
    "uemoji". None of the three parameters are read by this rule.
    """
    target = "uemoji"
    source = InFile("in/{0}.icu".format(target))
    packaged = OutFile("{0}.icu".format(target))
    request = SingleExecutionRequest(
        name=target,
        category=target,
        dep_targets=[],
        input_files=[source],
        output_files=[packaged],
        tool=IcuTool("icupkg"),
        args="-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}",
        format_with={},
    )
    return [request]
378
379
def generate_misc(config, io, common_vars):
    """Return the request that compiles the misc/ resource bundles.

    Every file matching misc/*.txt becomes <stem>.res via genrb, with the
    input base name substituted per file. Only ``io.glob`` is used from the
    parameters.
    """
    dir_len = len("misc/")
    ext_len = len(".txt")

    misc_sources = [InFile(path) for path in io.glob("misc/*.txt")]
    basenames = [src.filename[dir_len:] for src in misc_sources]
    res_outputs = [OutFile("%s.res" % base[:-ext_len]) for base in basenames]

    request = RepeatedExecutionRequest(
        name="misc_res",
        category="misc",
        dep_targets=[DepTarget("cnvalias")],  # ICU-21175
        input_files=misc_sources,
        output_files=res_outputs,
        tool=IcuTool("genrb"),
        args=(
            "-s {IN_DIR}/misc -d {OUT_DIR} -i {OUT_DIR} "
            "-k -q "
            "{INPUT_BASENAME}"
        ),
        format_with={},
        repeat_with={"INPUT_BASENAME": basenames},
    )
    return [request]
402
403
def generate_curr_supplemental(config, io, common_vars):
    """Return the request that builds the currency supplemental res file.

    The request runs genrb on curr/supplementalData.txt to produce
    curr/supplementalData.res. None of the three parameters are read by this
    rule.
    """
    basename = "supplementalData.txt"
    request = SingleExecutionRequest(
        name="curr_supplemental_res",
        category="curr_supplemental",
        dep_targets=[],
        input_files=[InFile("curr/supplementalData.txt")],
        output_files=[OutFile("curr/supplementalData.res")],
        tool=IcuTool("genrb"),
        args=(
            "-s {IN_DIR}/curr -d {OUT_DIR}/curr -i {OUT_DIR} "
            "-k "
            "{INPUT_BASENAME}"
        ),
        format_with={"INPUT_BASENAME": basename},
    )
    return [request]
425
426
def generate_zone_supplemental(config, io, common_vars):
    """Return the request that builds the tzdbNames res file.

    The request runs genrb on zone/tzdbNames.txt to produce
    zone/tzdbNames.res. None of the three parameters are read by this rule.
    """
    basename = "tzdbNames.txt"
    request = SingleExecutionRequest(
        name="zone_supplemental_res",
        category="zone_supplemental",
        dep_targets=[],
        input_files=[InFile("zone/tzdbNames.txt")],
        output_files=[OutFile("zone/tzdbNames.res")],
        tool=IcuTool("genrb"),
        args=(
            "-s {IN_DIR}/zone -d {OUT_DIR}/zone -i {OUT_DIR} "
            "-k "
            "{INPUT_BASENAME}"
        ),
        format_with={"INPUT_BASENAME": basename},
    )
    return [request]
448
449
def generate_translit(config, io, common_vars):
    """Return the request that compiles the transliteration res files.

    Only root.txt, en.txt and el.txt under translit/ are compiled (via
    genrb). Every other file matching translit/*.txt is passed, sorted, as
    the request's ``dep_targets``. Only ``io.glob`` is used from the
    parameters.
    """
    compiled = [
        InFile("translit/%s.txt" % stem)
        for stem in ("root", "en", "el")
    ]

    # Everything else in translit/ is a dependency rather than an input.
    all_txt = set(InFile(path) for path in io.glob("translit/*.txt"))
    dep_files = sorted(all_txt.difference(compiled))

    dir_len = len("translit/")
    ext_len = len(".txt")
    basenames = [src.filename[dir_len:] for src in compiled]
    res_outputs = [
        OutFile("translit/%s.res" % base[:-ext_len]) for base in basenames
    ]

    request = RepeatedOrSingleExecutionRequest(
        name="translit_res",
        category="translit",
        dep_targets=dep_files,
        input_files=compiled,
        output_files=res_outputs,
        tool=IcuTool("genrb"),
        args=(
            "-s {IN_DIR}/translit -d {OUT_DIR}/translit -i {OUT_DIR} "
            "-k "
            "{INPUT_BASENAME}"
        ),
        format_with={},
        repeat_with={
            "INPUT_BASENAME": utils.SpaceSeparatedList(basenames),
        },
    )
    return [request]
482
483
def generate_brkitr_lstm(config, io, common_vars):
    """Return the request that compiles the brkitr/lstm resource bundles.

    Every file matching brkitr/lstm/*.txt becomes brkitr/<stem>.res via
    genrb. Only ``io.glob`` is used from the parameters.
    """
    dir_len = len("brkitr/lstm/")
    ext_len = len(".txt")

    lstm_sources = [InFile(path) for path in io.glob("brkitr/lstm/*.txt")]
    basenames = [src.filename[dir_len:] for src in lstm_sources]
    res_outputs = [
        OutFile("brkitr/%s.res" % base[:-ext_len]) for base in basenames
    ]

    request = RepeatedOrSingleExecutionRequest(
        name="lstm_res",
        category="brkitr_lstm",
        dep_targets=[],
        input_files=lstm_sources,
        output_files=res_outputs,
        tool=IcuTool("genrb"),
        args=(
            "-s {IN_DIR}/brkitr/lstm -d {OUT_DIR}/brkitr -i {OUT_DIR} "
            "-k "
            "{INPUT_BASENAME}"
        ),
        format_with={},
        repeat_with={
            "INPUT_BASENAME": utils.SpaceSeparatedList(basenames),
        },
    )
    return [request]
509
def generate_brkitr_adaboost(config, io, common_vars):
    """Return the request that compiles the brkitr/adaboost resource bundles.

    Every file matching brkitr/adaboost/*.txt becomes brkitr/<stem>.res via
    genrb. Only ``io.glob`` is used from the parameters.
    """
    dir_len = len("brkitr/adaboost/")
    ext_len = len(".txt")

    adaboost_sources = [InFile(path) for path in io.glob("brkitr/adaboost/*.txt")]
    basenames = [src.filename[dir_len:] for src in adaboost_sources]
    res_outputs = [
        OutFile("brkitr/%s.res" % base[:-ext_len]) for base in basenames
    ]

    request = RepeatedOrSingleExecutionRequest(
        name="adaboost_res",
        category="brkitr_adaboost",
        dep_targets=[],
        input_files=adaboost_sources,
        output_files=res_outputs,
        tool=IcuTool("genrb"),
        args=(
            "-s {IN_DIR}/brkitr/adaboost -d {OUT_DIR}/brkitr -i {OUT_DIR} "
            "-k "
            "{INPUT_BASENAME}"
        ),
        format_with={},
        repeat_with={
            "INPUT_BASENAME": utils.SpaceSeparatedList(basenames),
        },
    )
    return [request]
535
def generate_tree(
        config,
        io,
        common_vars,
        sub_dir,
        out_sub_dir,
        use_pool_bundle,
        dep_targets):
    """Return the requests that build one resource-bundle tree.

    Args:
        config: Not read by this function.
        io: File accessor; ``io.glob`` lists the tree's .txt inputs and
            ``io.read_locale_deps`` supplies the alias / CLDR version data.
        common_vars: Mapping expanded into the ``str.format`` calls below.
        sub_dir: Input directory name; also used to derive target names.
        out_sub_dir: Output directory name, or a falsy value to write to the
            top of the output directory.
        use_pool_bundle: When truthy, an extra request writes pool.res and the
            res-tree request depends on it.
        dep_targets: Dependencies of the res-tree request (and of the pool
            request, when present). The list is not mutated.

    Returns:
        A list holding, in order: the pool-bundle request (only when
        ``use_pool_bundle`` is truthy), the res-tree request, and the index
        request.
    """
    category = "%s_tree" % sub_dir
    out_prefix = ("%s/" % out_sub_dir) if out_sub_dir else ""

    # These files are built by dedicated rules (generate_curr_supplemental and
    # generate_zone_supplemental), so they are taken out of the tree inputs.
    separately_built = {
        "curr": "curr/supplementalData.txt",
        "zone": "zone/tzdbNames.txt",
    }
    input_files = [InFile(path) for path in io.glob("%s/*.txt" % sub_dir)]
    if sub_dir in separately_built:
        input_files.remove(InFile(separately_built[sub_dir]))

    basename_start = len(sub_dir) + 1  # skip "<sub_dir>/"
    ext_len = len(".txt")
    input_basenames = [src.filename[basename_start:] for src in input_files]
    output_files = [
        OutFile("%s%s.res" % (out_prefix, base[:-ext_len]))
        for base in input_basenames
    ]

    requests = []

    # Generate Pool Bundle
    pool_option = ""
    tree_deps = dep_targets
    if use_pool_bundle:
        pool_target_name = "%s_pool_write" % sub_dir
        pool_option = "--usePoolBundle {OUT_DIR}/{OUT_PREFIX}".format(
            OUT_PREFIX=out_prefix,
            **common_vars
        )
        requests.append(
            SingleExecutionRequest(
                name=pool_target_name,
                category=category,
                dep_targets=dep_targets,
                input_files=input_files,
                output_files=[OutFile("%spool.res" % out_prefix)],
                tool=IcuTool("genrb"),
                args=(
                    "-s {IN_DIR}/{IN_SUB_DIR} -d {OUT_DIR}/{OUT_PREFIX} -i {OUT_DIR} "
                    "--writePoolBundle -k "
                    "{INPUT_BASENAMES_SPACED}"
                ),
                format_with={
                    "IN_SUB_DIR": sub_dir,
                    "OUT_PREFIX": out_prefix,
                    "INPUT_BASENAMES_SPACED": utils.SpaceSeparatedList(input_basenames),
                },
            )
        )
        # Build a new list so the caller's dep_targets is left untouched.
        tree_deps = dep_targets + [DepTarget(pool_target_name)]

    # Generate Res File Tree
    requests.append(
        RepeatedOrSingleExecutionRequest(
            name="%s_res" % sub_dir,
            category=category,
            dep_targets=tree_deps,
            input_files=input_files,
            output_files=output_files,
            tool=IcuTool("genrb"),
            args=(
                "-s {IN_DIR}/{IN_SUB_DIR} -d {OUT_DIR}/{OUT_PREFIX} -i {OUT_DIR} "
                "{EXTRA_OPTION} -k "
                "{INPUT_BASENAME}"
            ),
            format_with={
                "IN_SUB_DIR": sub_dir,
                "OUT_PREFIX": out_prefix,
                "EXTRA_OPTION": pool_option,
            },
            repeat_with={
                "INPUT_BASENAME": utils.SpaceSeparatedList(input_basenames),
            },
        )
    )

    # Generate res_index file
    # Exclude the deprecated locale variants and root; see ICU-20628. This
    # could be data-driven, but we do not want to perform I/O in this script
    # (for example, we do not want to read from an XML file).
    excluded_locales = {
        "ja_JP_TRADITIONAL",
        "th_TH_TRADITIONAL",
        "de_",
        "de__PHONEBOOK",
        "es_",
        "es__TRADITIONAL",
        "root",
    }
    # Put alias locales in a separate structure; see ICU-20627
    dependency_data = io.read_locale_deps(sub_dir)
    alias_locales = (
        set(dependency_data["aliases"].keys())
        if "aliases" in dependency_data
        else set()
    )
    alias_files = []
    installed_files = []
    for src in input_files:
        stem = IndexRequest.locale_file_stem(src)
        if stem in excluded_locales:
            continue
        if stem in alias_locales:
            alias_files.append(src)
        else:
            installed_files.append(src)

    # Only the main locales tree records the CLDR version.
    if sub_dir == "locales":
        cldr_version = dependency_data["cldrVersion"]
    else:
        cldr_version = None

    index_file_txt = TmpFile("{IN_SUB_DIR}/{INDEX_NAME}.txt".format(
        IN_SUB_DIR=sub_dir,
        **common_vars
    ))
    index_res_file = OutFile("{OUT_PREFIX}{INDEX_NAME}.res".format(
        OUT_PREFIX=out_prefix,
        **common_vars
    ))
    requests.append(
        IndexRequest(
            name="%s_index_txt" % sub_dir,
            category=category,
            installed_files=installed_files,
            alias_files=alias_files,
            txt_file=index_file_txt,
            output_file=index_res_file,
            cldr_version=cldr_version,
            args=(
                "-s {TMP_DIR}/{IN_SUB_DIR} -d {OUT_DIR}/{OUT_PREFIX} -i {OUT_DIR} "
                "-k "
                "{INDEX_NAME}.txt"
            ),
            format_with={
                "IN_SUB_DIR": sub_dir,
                "OUT_PREFIX": out_prefix,
            },
        )
    )

    return requests
668