• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright (C) 2018 and later: Unicode, Inc. and others.
2# License & terms of use: http://www.unicode.org/copyright.html
3
4# Python 2/3 Compatibility (ICU-20299)
5# TODO(ICU-20301): Remove this.
6from __future__ import print_function
7
8from icutools.databuilder import *
9from icutools.databuilder import utils
10from icutools.databuilder.request_types import *
11
12import os
13import sys
14
15
def generate(config, io, common_vars):
    """Assemble every build request for the ICU data files.

    Args:
        config: Build configuration. ``use_pool_bundle`` is read here; the
            object is also forwarded to each per-target generator.
        io: File access helper. ``io.glob`` is used here to check that the
            data source directory can be found.
        common_vars: Shared format variables, forwarded to every generator.

    Returns:
        A list of request objects whose last element is the ``icudata_list``
        ListRequest.

    Raises:
        SystemExit: With status 1, after printing a message to stderr, when
            nothing matches ``misc/*``.
    """
    if not io.glob("misc/*"):
        print("Error: Cannot find data directory; please specify --src_dir", file=sys.stderr)
        # sys.exit() is always available; the bare exit() helper is injected
        # by the site module and is meant for interactive use.
        sys.exit(1)

    requests = []

    # Stand-alone data files. The order below is the order in which the
    # resulting requests are emitted.
    standalone_generators = (
        generate_cnvalias,
        generate_ulayout,
        generate_uemoji,
        generate_confusables,
        generate_conversion_mappings,
        generate_brkitr_brk,
        generate_brkitr_lstm,
        generate_stringprep,
        generate_brkitr_dictionaries,
        generate_normalization,
        generate_coll_ucadata,
        generate_full_unicore_data,
        generate_unames,
        generate_misc,
        generate_curr_supplemental,
        generate_zone_supplemental,
        generate_translit,
    )
    for build_requests in standalone_generators:
        requests += build_requests(config, io, common_vars)

    # Res Tree Files
    # (input dirname, output dirname, use pool bundle, dep targets)
    tree_specs = [
        ("locales", None, config.use_pool_bundle, []),
        ("curr", "curr", config.use_pool_bundle, []),
        ("lang", "lang", config.use_pool_bundle, []),
        ("region", "region", config.use_pool_bundle, []),
        ("zone", "zone", config.use_pool_bundle, []),
        ("unit", "unit", config.use_pool_bundle, []),
        # Never use pool bundle for coll, brkitr, or rbnf.
        #
        # coll depends on timezoneTypes.res and keyTypeData.res.
        # TODO: We should not need this dependency to build collation.
        # TODO: Bake keyTypeData.res into the common library?
        ("coll", "coll", False,
            [DepTarget("coll_ucadata"), DepTarget("misc_res"), InFile("unidata/UCARules.txt")]),
        ("brkitr", "brkitr", False,
            [DepTarget("brkitr_brk"), DepTarget("dictionaries")]),
        ("rbnf", "rbnf", False, []),
    ]
    for sub_dir, out_sub_dir, use_pool_bundle, dep_targets in tree_specs:
        requests += generate_tree(config, io, common_vars,
            sub_dir,
            out_sub_dir,
            use_pool_bundle,
            dep_targets)

    # Listing of all output files (temporary files excluded).
    requests += [
        ListRequest(
            name = "icudata_list",
            variable_name = "icudata_all_output_files",
            output_file = TmpFile("icudata.lst"),
            include_tmp = False
        )
    ]

    return requests
113
114
def generate_cnvalias(config, io, common_vars):
    """Return the request that builds the converter name alias table.

    A single ``gencnval`` run reads ``mappings/convrtrs.txt`` and writes
    ``cnvalias.icu``.
    """
    # UConv Name Aliases
    aliases_txt = InFile("mappings/convrtrs.txt")
    aliases_icu = OutFile("cnvalias.icu")
    request = SingleExecutionRequest(
        name="cnvalias",
        category="cnvalias",
        dep_targets=[],
        input_files=[aliases_txt],
        output_files=[aliases_icu],
        tool=IcuTool("gencnval"),
        args="-s {IN_DIR} -d {OUT_DIR} {INPUT_FILES[0]}",
        format_with={},
    )
    return [request]
132
133
def generate_confusables(config, io, common_vars):
    """Return the request that builds ``confusables.cfu``.

    A single ``gencfu`` run consumes ``unidata/confusables.txt`` and
    ``unidata/confusablesWholeScript.txt``; it is ordered after the
    ``cnvalias`` target.
    """
    # CONFUSABLES
    sources = [
        InFile("unidata/confusables.txt"),
        InFile("unidata/confusablesWholeScript.txt"),
    ]
    target = OutFile("confusables.cfu")
    command_line = " ".join([
        "-d {OUT_DIR} -i {OUT_DIR}",
        "-c -r {IN_DIR}/{INPUT_FILES[0]} -w {IN_DIR}/{INPUT_FILES[1]}",
        "-o {OUTPUT_FILES[0]}",
    ])
    request = SingleExecutionRequest(
        name="confusables",
        category="confusables",
        dep_targets=[DepTarget("cnvalias")],
        input_files=sources,
        output_files=[target],
        tool=IcuTool("gencfu"),
        args=command_line,
        format_with={},
    )
    return [request]
153
154
def generate_conversion_mappings(config, io, common_vars):
    """Return the request that compiles every ``mappings/*.ucm`` file.

    Each input is handed to ``makeconv`` and yields a ``.cnv`` output named
    after the input with the ``mappings/`` directory and ``.ucm`` extension
    stripped.
    """
    # UConv Conversion Table Files
    dir_len = len("mappings/")
    ext_len = len(".ucm")
    ucm_files = [InFile(path) for path in io.glob("mappings/*.ucm")]
    cnv_files = [
        OutFile("%s.cnv" % ucm.filename[dir_len:-ext_len])
        for ucm in ucm_files
    ]
    # TODO: handle BUILD_SPECIAL_CNV_FILES? Means to add --ignore-siso-check flag to makeconv
    request = RepeatedOrSingleExecutionRequest(
        name="conversion_mappings",
        category="conversion_mappings",
        dep_targets=[],
        input_files=ucm_files,
        output_files=cnv_files,
        tool=IcuTool("makeconv"),
        args="-s {IN_DIR} -d {OUT_DIR} -c {INPUT_FILE_PLACEHOLDER}",
        format_with={},
        repeat_with={
            "INPUT_FILE_PLACEHOLDER": utils.SpaceSeparatedList(
                ucm.filename for ucm in ucm_files
            ),
        },
    )
    return [request]
175
176
def generate_brkitr_brk(config, io, common_vars):
    """Return the request that compiles break iterator rules.

    Every ``brkitr/rules/*.txt`` file is passed to ``genbrk`` and produces
    ``brkitr/<name>.brk``. The request is ordered after the ``cnvalias``,
    ``ulayout``, ``uemoji`` and ``lstm_res`` targets.
    """
    # BRK Files
    dir_len = len("brkitr/rules/")
    ext_len = len(".txt")
    rule_files = [InFile(path) for path in io.glob("brkitr/rules/*.txt")]
    brk_files = [
        OutFile("brkitr/%s.brk" % rule.filename[dir_len:-ext_len])
        for rule in rule_files
    ]
    prerequisites = [
        DepTarget(target_name)
        for target_name in ("cnvalias", "ulayout", "uemoji", "lstm_res")
    ]
    request = RepeatedExecutionRequest(
        name="brkitr_brk",
        category="brkitr_rules",
        dep_targets=prerequisites,
        input_files=rule_files,
        output_files=brk_files,
        tool=IcuTool("genbrk"),
        args="-d {OUT_DIR} -i {OUT_DIR} -c -r {IN_DIR}/{INPUT_FILE} -o {OUTPUT_FILE}",
        format_with={},
        repeat_with={},
    )
    return [request]
198
199
def generate_stringprep(config, io, common_vars):
    """Return the request that builds the StringPrep profiles.

    Every ``sprep/*.txt`` file is passed to ``gensprep`` and produces
    ``<bundle>.spp``, where the bundle name is the input file name without
    its directory and extension.
    """
    # SPP FILES
    profile_files = [InFile(path) for path in io.glob("sprep/*.txt")]
    dir_len = len("sprep/")
    ext_len = len(".txt")
    spp_files = [
        OutFile("%s.spp" % profile.filename[dir_len:-ext_len])
        for profile in profile_files
    ]
    bundles = [profile.filename[dir_len:-ext_len] for profile in profile_files]
    request = RepeatedExecutionRequest(
        name="stringprep",
        category="stringprep",
        dep_targets=[InFile("unidata/NormalizationCorrections.txt")],
        input_files=profile_files,
        output_files=spp_files,
        tool=IcuTool("gensprep"),
        args=(
            "-s {IN_DIR}/sprep -d {OUT_DIR} -i {OUT_DIR} -b {BUNDLE_NAME} "
            "-m {IN_DIR}/unidata -u 3.2.0 {BUNDLE_NAME}.txt"
        ),
        format_with={},
        repeat_with={"BUNDLE_NAME": bundles},
    )
    return [request]
221
222
def generate_brkitr_dictionaries(config, io, common_vars):
    """Return the request that compiles break iterator dictionaries.

    Every ``brkitr/dictionaries/*.txt`` file is passed to ``gendict`` with
    the per-file options listed below and produces ``brkitr/<name>.dict``.

    Raises:
        KeyError: If a globbed dictionary file has no entry in the options
            table.
    """
    # Dict Files
    dict_sources = [InFile(path) for path in io.glob("brkitr/dictionaries/*.txt")]
    dir_len = len("brkitr/dictionaries/")
    ext_len = len(".txt")
    dict_targets = [
        OutFile("brkitr/%s.dict" % src.filename[dir_len:-ext_len])
        for src in dict_sources
    ]
    # gendict options, keyed by input path.
    options_by_file = {
        "brkitr/dictionaries/burmesedict.txt": "--bytes --transform offset-0x1000",
        "brkitr/dictionaries/cjdict.txt": "--uchars",
        "brkitr/dictionaries/khmerdict.txt": "--bytes --transform offset-0x1780",
        "brkitr/dictionaries/laodict.txt": "--bytes --transform offset-0x0e80",
        "brkitr/dictionaries/thaidict.txt": "--bytes --transform offset-0x0e00",
    }
    per_file_options = [options_by_file[src.filename] for src in dict_sources]
    request = RepeatedExecutionRequest(
        name="dictionaries",
        category="brkitr_dictionaries",
        dep_targets=[],
        input_files=dict_sources,
        output_files=dict_targets,
        tool=IcuTool("gendict"),
        args="-i {OUT_DIR} -c {EXTRA_OPTIONS} {IN_DIR}/{INPUT_FILE} {OUT_DIR}/{OUTPUT_FILE}",
        format_with={},
        repeat_with={"EXTRA_OPTIONS": per_file_options},
    )
    return [request]
252
253
def generate_normalization(config, io, common_vars):
    """Return the request that repackages the normalization data files.

    Every ``in/*.nrm`` file except ``in/nfc.nrm`` is run through ``icupkg``
    and written under the same base name.

    Raises:
        ValueError: If ``in/nfc.nrm`` is not among the globbed files.
    """
    # NRM Files
    nrm_sources = [InFile(path) for path in io.glob("in/*.nrm")]
    # nfc.nrm is pre-compiled into C++; see generate_full_unicore_data
    nrm_sources.remove(InFile("in/nfc.nrm"))
    dir_len = len("in/")
    nrm_targets = [OutFile(src.filename[dir_len:]) for src in nrm_sources]
    request = RepeatedExecutionRequest(
        name="normalization",
        category="normalization",
        dep_targets=[],
        input_files=nrm_sources,
        output_files=nrm_targets,
        tool=IcuTool("icupkg"),
        args="-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILE} {OUT_DIR}/{OUTPUT_FILE}",
        format_with={},
        repeat_with={},
    )
    return [request]
273
274
def generate_coll_ucadata(config, io, common_vars):
    """Return the request that produces ``coll/ucadata.icu``.

    The input is ``in/coll/ucadata-<type>.icu``, where ``<type>`` comes from
    ``config.coll_han_type``; it is run through ``icupkg``.
    """
    # Collation Dependency File (ucadata.icu)
    source = InFile("in/coll/ucadata-%s.icu" % config.coll_han_type)
    target = OutFile("coll/ucadata.icu")
    request = SingleExecutionRequest(
        name="coll_ucadata",
        category="coll_ucadata",
        dep_targets=[],
        input_files=[source],
        output_files=[target],
        tool=IcuTool("icupkg"),
        args="-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}",
        format_with={},
    )
    return [request]
291
292
def generate_full_unicore_data(config, io, common_vars):
    """Return the optional request that repackages the core Unicode data.

    The core Unicode properties files (pnames.icu, uprops.icu, ucase.icu,
    ubidi.icu) are hardcoded in the common DLL and therefore not included in
    the data package any more. They are not built by default but need to be
    built for ICU4J data, both in the .jar and in the .dat file (if ICU4J
    uses the .dat file). See ICU-4497.

    Returns:
        An empty list unless ``config.include_uni_core_data`` is truthy;
        otherwise a one-element list with the ``unicore`` request.
    """
    if not config.include_uni_core_data:
        return []

    unicore_names = (
        "pnames.icu",
        "uprops.icu",
        "ucase.icu",
        "ubidi.icu",
        "nfc.nrm",
    )
    sources = [InFile("in/%s" % name) for name in unicore_names]
    targets = [OutFile(name) for name in unicore_names]
    request = RepeatedExecutionRequest(
        name="unicore",
        category="unicore",
        input_files=sources,
        output_files=targets,
        tool=IcuTool("icupkg"),
        args="-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILE} {OUT_DIR}/{OUTPUT_FILE}",
    )
    return [request]
321
322
def generate_unames(config, io, common_vars):
    """Return the request that repackages ``in/unames.icu`` with ``icupkg``."""
    # Unicode Character Names
    source = InFile("in/unames.icu")
    target = OutFile("unames.icu")
    request = SingleExecutionRequest(
        name="unames",
        category="unames",
        dep_targets=[],
        input_files=[source],
        output_files=[target],
        tool=IcuTool("icupkg"),
        args="-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}",
        format_with={},
    )
    return [request]
339
340
def generate_ulayout(config, io, common_vars):
    """Return the request that repackages ``in/ulayout.icu`` with ``icupkg``.

    The request name and category are both ``ulayout``.
    """
    # Unicode text layout properties
    stem = "ulayout"
    source = InFile("in/%s.icu" % stem)
    target = OutFile("%s.icu" % stem)
    request = SingleExecutionRequest(
        name=stem,
        category=stem,
        dep_targets=[],
        input_files=[source],
        output_files=[target],
        tool=IcuTool("icupkg"),
        args="-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}",
        format_with={},
    )
    return [request]
358
359
def generate_uemoji(config, io, common_vars):
    """Return the request that repackages ``in/uemoji.icu`` with ``icupkg``.

    The request name and category are both ``uemoji``.
    """
    # Unicode emoji properties
    stem = "uemoji"
    source = InFile("in/%s.icu" % stem)
    target = OutFile("%s.icu" % stem)
    request = SingleExecutionRequest(
        name=stem,
        category=stem,
        dep_targets=[],
        input_files=[source],
        output_files=[target],
        tool=IcuTool("icupkg"),
        args="-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}",
        format_with={},
    )
    return [request]
377
378
def generate_misc(config, io, common_vars):
    """Return the request that compiles the ``misc/*.txt`` resource files.

    Each file is passed to ``genrb`` and produces ``<name>.res`` at the top
    of the output tree. The request is ordered after ``cnvalias``.
    """
    # Misc Data Res Files
    txt_files = [InFile(path) for path in io.glob("misc/*.txt")]
    dir_len = len("misc/")
    ext_len = len(".txt")
    basenames = [txt.filename[dir_len:] for txt in txt_files]
    res_files = [OutFile("%s.res" % base[:-ext_len]) for base in basenames]
    request = RepeatedExecutionRequest(
        name="misc_res",
        category="misc",
        dep_targets=[DepTarget("cnvalias")],  # ICU-21175
        input_files=txt_files,
        output_files=res_files,
        tool=IcuTool("genrb"),
        args="-s {IN_DIR}/misc -d {OUT_DIR} -i {OUT_DIR} -k -q {INPUT_BASENAME}",
        format_with={},
        repeat_with={"INPUT_BASENAME": basenames},
    )
    return [request]
401
402
def generate_curr_supplemental(config, io, common_vars):
    """Return the request that compiles ``curr/supplementalData.txt``.

    A single ``genrb`` run writes ``curr/supplementalData.res``.
    """
    # Currency Supplemental Res File
    source = InFile("curr/supplementalData.txt")
    target = OutFile("curr/supplementalData.res")
    substitutions = {"INPUT_BASENAME": "supplementalData.txt"}
    request = SingleExecutionRequest(
        name="curr_supplemental_res",
        category="curr_supplemental",
        dep_targets=[],
        input_files=[source],
        output_files=[target],
        tool=IcuTool("genrb"),
        args="-s {IN_DIR}/curr -d {OUT_DIR}/curr -i {OUT_DIR} -k {INPUT_BASENAME}",
        format_with=substitutions,
    )
    return [request]
424
425
def generate_zone_supplemental(config, io, common_vars):
    """Return the request that compiles ``zone/tzdbNames.txt``.

    A single ``genrb`` run writes ``zone/tzdbNames.res``.
    """
    # tzdbNames Res File
    source = InFile("zone/tzdbNames.txt")
    target = OutFile("zone/tzdbNames.res")
    substitutions = {"INPUT_BASENAME": "tzdbNames.txt"}
    request = SingleExecutionRequest(
        name="zone_supplemental_res",
        category="zone_supplemental",
        dep_targets=[],
        input_files=[source],
        output_files=[target],
        tool=IcuTool("genrb"),
        args="-s {IN_DIR}/zone -d {OUT_DIR}/zone -i {OUT_DIR} -k {INPUT_BASENAME}",
        format_with=substitutions,
    )
    return [request]
447
448
def generate_translit(config, io, common_vars):
    """Return the request that compiles the transliterator resource bundles.

    Only ``root``, ``en`` and ``el`` are compiled by ``genrb``; every other
    ``translit/*.txt`` file is attached to the request as a dependency, in
    sorted order.
    """
    compiled_sources = [
        InFile("translit/%s.txt" % locale)
        for locale in ("root", "en", "el")
    ]
    # Everything else in translit/ is a dependency only.
    all_sources = {InFile(path) for path in io.glob("translit/*.txt")}
    dependency_sources = sorted(all_sources - set(compiled_sources))

    dir_len = len("translit/")
    ext_len = len(".txt")
    basenames = [src.filename[dir_len:] for src in compiled_sources]
    res_files = [OutFile("translit/%s.res" % base[:-ext_len]) for base in basenames]
    request = RepeatedOrSingleExecutionRequest(
        name="translit_res",
        category="translit",
        dep_targets=dependency_sources,
        input_files=compiled_sources,
        output_files=res_files,
        tool=IcuTool("genrb"),
        args="-s {IN_DIR}/translit -d {OUT_DIR}/translit -i {OUT_DIR} -k {INPUT_BASENAME}",
        format_with={},
        repeat_with={
            "INPUT_BASENAME": utils.SpaceSeparatedList(basenames),
        },
    )
    return [request]
481
482
def generate_brkitr_lstm(config, io, common_vars):
    """Return the request that compiles the ``brkitr/lstm/*.txt`` files.

    Each file is passed to ``genrb`` and produces ``brkitr/<name>.res``.
    """
    lstm_sources = [InFile(path) for path in io.glob("brkitr/lstm/*.txt")]
    dir_len = len("brkitr/lstm/")
    ext_len = len(".txt")
    basenames = [src.filename[dir_len:] for src in lstm_sources]
    res_files = [OutFile("brkitr/%s.res" % base[:-ext_len]) for base in basenames]
    request = RepeatedOrSingleExecutionRequest(
        name="lstm_res",
        category="brkitr_lstm",
        dep_targets=[],
        input_files=lstm_sources,
        output_files=res_files,
        tool=IcuTool("genrb"),
        args="-s {IN_DIR}/brkitr/lstm -d {OUT_DIR}/brkitr -i {OUT_DIR} -k {INPUT_BASENAME}",
        format_with={},
        repeat_with={
            "INPUT_BASENAME": utils.SpaceSeparatedList(basenames),
        },
    )
    return [request]
508
def generate_tree(
        config,
        io,
        common_vars,
        sub_dir,
        out_sub_dir,
        use_pool_bundle,
        dep_targets):
    """Return the requests that build one resource bundle tree.

    Args:
        config: Build configuration (not read directly by this function).
        io: File access helper; ``glob`` and ``read_locale_deps`` are used.
        common_vars: Shared format variables; expanded into the pool bundle
            option and the index file names.
        sub_dir: Input directory of the tree, e.g. ``"locales"``.
        out_sub_dir: Output directory of the tree, or a falsy value to write
            to the top of the output tree.
        use_pool_bundle: When truthy, an extra request writes ``pool.res``
            and the tree request is told to use it.
        dep_targets: Dependencies attached to the pool and tree requests.
            The list passed in is not modified.

    Returns:
        A list holding, in order, the optional pool bundle request, the
        ``<sub_dir>_res`` request and the ``<sub_dir>_index_txt`` request.
    """
    requests = []
    category = "%s_tree" % sub_dir
    if out_sub_dir:
        out_prefix = "%s/" % out_sub_dir
    else:
        out_prefix = ""

    txt_files = [InFile(path) for path in io.glob("%s/*.txt" % sub_dir)]
    # These two files are compiled by generate_curr_supplemental and
    # generate_zone_supplemental, so they are left out of the tree.
    built_separately = {
        "curr": "curr/supplementalData.txt",
        "zone": "zone/tzdbNames.txt",
    }
    if sub_dir in built_separately:
        txt_files.remove(InFile(built_separately[sub_dir]))

    dir_len = len(sub_dir) + 1
    ext_len = len(".txt")
    basenames = [txt.filename[dir_len:] for txt in txt_files]
    res_files = [
        OutFile("%s%s.res" % (out_prefix, base[:-ext_len]))
        for base in basenames
    ]

    # Directory arguments shared by the pool bundle and res tree requests.
    genrb_dirs = "-s {IN_DIR}/{IN_SUB_DIR} -d {OUT_DIR}/{OUT_PREFIX} -i {OUT_DIR} "

    # Generate Pool Bundle
    pool_option = ""
    tree_dep_targets = dep_targets
    if use_pool_bundle:
        pool_res_files = [OutFile("%spool.res" % out_prefix)]
        pool_target_name = "%s_pool_write" % sub_dir
        pool_option = "--usePoolBundle {OUT_DIR}/{OUT_PREFIX}".format(
            OUT_PREFIX=out_prefix,
            **common_vars
        )
        requests.append(
            SingleExecutionRequest(
                name=pool_target_name,
                category=category,
                dep_targets=dep_targets,
                input_files=txt_files,
                output_files=pool_res_files,
                tool=IcuTool("genrb"),
                args=genrb_dirs + "--writePoolBundle -k {INPUT_BASENAMES_SPACED}",
                format_with={
                    "IN_SUB_DIR": sub_dir,
                    "OUT_PREFIX": out_prefix,
                    "INPUT_BASENAMES_SPACED": utils.SpaceSeparatedList(basenames),
                },
            )
        )
        # The tree itself must wait for the pool bundle; build a new list so
        # the caller's dep_targets stays untouched.
        tree_dep_targets = dep_targets + [DepTarget(pool_target_name)]

    # Generate Res File Tree
    requests.append(
        RepeatedOrSingleExecutionRequest(
            name="%s_res" % sub_dir,
            category=category,
            dep_targets=tree_dep_targets,
            input_files=txt_files,
            output_files=res_files,
            tool=IcuTool("genrb"),
            args=genrb_dirs + "{EXTRA_OPTION} -k {INPUT_BASENAME}",
            format_with={
                "IN_SUB_DIR": sub_dir,
                "OUT_PREFIX": out_prefix,
                "EXTRA_OPTION": pool_option,
            },
            repeat_with={
                "INPUT_BASENAME": utils.SpaceSeparatedList(basenames),
            },
        )
    )

    # Generate res_index file
    # Exclude the deprecated locale variants and root; see ICU-20628. This
    # could be data-driven, but we do not want to perform I/O in this script
    # (for example, we do not want to read from an XML file).
    excluded_locales = {
        "ja_JP_TRADITIONAL",
        "th_TH_TRADITIONAL",
        "de_",
        "de__PHONEBOOK",
        "es_",
        "es__TRADITIONAL",
        "root",
    }
    # Put alias locales in a separate structure; see ICU-20627
    dependency_data = io.read_locale_deps(sub_dir)
    alias_locales = set()
    if "aliases" in dependency_data:
        alias_locales = set(dependency_data["aliases"].keys())

    alias_files = []
    installed_files = []
    for txt in txt_files:
        stem = IndexRequest.locale_file_stem(txt)
        if stem in excluded_locales:
            continue
        if stem in alias_locales:
            alias_files.append(txt)
        else:
            installed_files.append(txt)

    # Only the locales tree passes a CLDR version to the index request.
    if sub_dir == "locales":
        cldr_version = dependency_data["cldrVersion"]
    else:
        cldr_version = None

    index_txt_name = "{IN_SUB_DIR}/{INDEX_NAME}.txt".format(
        IN_SUB_DIR=sub_dir,
        **common_vars
    )
    index_res_name = "{OUT_PREFIX}{INDEX_NAME}.res".format(
        OUT_PREFIX=out_prefix,
        **common_vars
    )
    index_file_txt = TmpFile(index_txt_name)
    index_res_file = OutFile(index_res_name)
    requests.append(
        IndexRequest(
            name="%s_index_txt" % sub_dir,
            category=category,
            installed_files=installed_files,
            alias_files=alias_files,
            txt_file=index_file_txt,
            output_file=index_res_file,
            cldr_version=cldr_version,
            args="-s {TMP_DIR}/{IN_SUB_DIR} -d {OUT_DIR}/{OUT_PREFIX} -i {OUT_DIR} "
                "-k {INDEX_NAME}.txt",
            format_with={
                "IN_SUB_DIR": sub_dir,
                "OUT_PREFIX": out_prefix,
            },
        )
    )

    return requests
641