• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1<!-- © 2019 and later: Unicode, Inc. and others.
2     License & terms of use: http://www.unicode.org/copyright.html -->
3
4<!--================================================================================
5    Setup:
6    Follow the installation instructions in README.txt in this directory.
7
8    To build ICU data files:
9    1: Determine the CLDR data directory and set the CLDR_DATA_DIR environment variable.
10    2: Determine the flags required (see the list of properties below).
11    3: Run: ant -f build-icu-data.xml -D<flag-name>=<flag-value>...
12    ================================================================================-->
13<!-- TODO: Add things like copying of a template directory and deleting previous files
14     (perhaps always generate into a temporary directory and copy back to avoid having
15      inconsistent state when the conversion is cancelled). -->
16<project name="Convert" default="all" basedir="." xmlns:if="ant:if" xmlns:unless="ant:unless">
17
18    <target depends="init-args, prepare-jar, clean, convert" name="all"/> <!-- Project default target: "clean" is listed before "convert". -->
19
20    <!-- Define a default for every property which was not already set on the command line. -->
21    <target name="init-args">
22        <property environment="env"/>
23        <!-- Inherit properties from environment variable unless specified. As usual
24             with Ant, this is messier than it should be. All we are saying here is:
25             "Use the property if explicitly set, otherwise use the environment variable."
26             We cannot just set the property to the environment variable, since expansion
27             fails for nonexistent properties, and you are left with a literal value of
28             "${env.CLDR_DATA_DIR}". -->
29        <condition property="cldrDataDir" value="${env.CLDR_DATA_DIR}">
30            <isset property="env.CLDR_DATA_DIR"/>
31        </condition>
32        <fail unless="cldrDataDir"
33              message="Set the CLDR_DATA_DIR environment variable (or cldrDataDir property) to the CLDR data directory (typically ending in '/production')"/>
34
35        <!-- Ant does not inherit this from the user's environment (and it can matter). Only needed
36             because we "exec" a new Ant process below; left unset if JAVA_HOME is not defined. -->
37        <condition property="javaHome" value="${env.JAVA_HOME}">
38            <isset property="env.JAVA_HOME"/>
39        </condition>
40
41        <!-- The output directory into which to write the converted ICU data. By default
42             this will overwrite (without deletion) the ICU data files in this ICU release,
43             so it is recommended that for testing, it be set to another value.  -->
44        <property name="outDir" value="${basedir}/../../../icu4c/source/data/"/>
45
46        <!-- The output directory into which to write generated C/C++ code.  By default
47             this will overwrite (without deletion) the generated C/C++ files in this
48             ICU release, so it is recommended that for testing, it be set to another value. -->
49        <property name="genCCodeDir" value="${basedir}/../../../icu4c/source/"/>
50
51        <!-- The output directory into which to write generated Java code.  By default
52             this will overwrite (without deletion) the generated Java files in this
53             ICU release, so it is recommended that for testing, it be set to another value. -->
54        <property name="genJavaCodeDir" value="${basedir}/../../../icu4j/main/classes/core"/>
55
56        <!-- Set this to true to skip the "generateCode" step of "convert-impl", i.e. to prevent
57             build-icu-data.xml from generating the generated ICU source files. -->
58        <property name="dontGenCode" value="false" />
59
60        <!-- The directory in which the additional ICU XML data is stored. -->
61        <property name="specialsDir" value="${basedir}/../../../icu4c/source/data/xml"/>
62
63        <!-- Default value for ICU version (icuver.txt). Update this for each release. -->
64        <property name="icuVersion" value="72.1.0.0"/>
65
66        <!-- Default value for ICU data version (icuver.txt). Update this for each release. -->
67        <property name="icuDataVersion" value="72.1.0.0"/>
68
69        <!-- An override for the CLDR version string (icuver.txt and others). This will be
70             extracted from the CLDR library used for building the data if not set here. -->
71        <property name="cldrVersion" value=""/>
72
73        <!-- The minimum draft status for CLDR data to be used in the conversion. See
74             CldrDraftStatus for more details. -->
75        <property name="minDraftStatus" value="contributed"/>
76
77        <!-- A regular expression to match the locale IDs to be generated (useful for
78             debugging specific regions). This is applied after locale ID specifications
79             have been expanded into full locale IDs, so the value "en" will NOT match
80             "en_GB" or "en_001" etc. -->
81        <property name="localeIdFilter" value=""/>
82
83        <!-- Whether to synthetically generate "pseudo locale" data ("en_XA" and "ar_XB"). -->
84        <property name="includePseudoLocales" value="false"/>
85
86        <!-- Whether to emit a debug report containing some possibly useful information after
87             the conversion has finished. -->
88        <!-- TODO: Currently this isn't hugely useful, so find out what people want. -->
89        <property name="emitReport" value="false"/>
90
91        <!-- List of output "types" to be generated (e.g. "rbnf,plurals,locales"); an empty
92             list means "build everything".
93
94             Note that the grouping of types is based on the legacy converter behaviour and
95             is not always directly associated with an output directory (e.g. "locales"
96             produces locale data for curr/, lang/, main/, region/, unit/, zone/ but NOT
97             coll/, brkitr/ or rbnf/).
98
99             Pass in the value "HELP" (or any invalid value) to see the full list of types. -->
100        <!-- TODO: Find out what common use cases are and use them. -->
101        <property name="outputTypes" value=""/>
102
103        <!-- Override to force the 'clean' task to delete files it cannot determine to be
104             auto-generated by this tool. This is useful if the file header changes since
105             the heading is what's used to recognize auto-generated files. -->
106        <property name="forceDelete" value="false"/>
107    </target>
108
109    <!-- Runs "mvn compile" to provide the standalone JAR used by the taskdefs in this file (so Ant
110         need not know the Maven class-path). NOTE(review): assumes the POM assembles the JAR during "compile"; confirm. -->
111    <target depends="init-args" name="prepare-jar">
112        <exec failonerror="true" searchpath="true" executable="mvn">
113            <arg value="compile" />
114        </exec>
115    </target>
116
117    <!-- Wrapper target which runs "convert-impl" from build-icu-data.xml in a child Ant process.
118         This is done so we can set the environment of the new process and effectively overwrite
119         the CLDR_DIR value. If ever the CLDR library doesn't need to use CLDR_DIR at runtime to
120         find the production data, this can all be removed. -->
121    <target name="convert" depends="init-args, prepare-jar">
122        <exec executable="ant" searchpath="true" failonerror="true">
123            <!-- The CLDR library wants CLDR_DIR set, to the data directory. -->
124            <env key="CLDR_DIR" value="${cldrDataDir}" />
125            <!-- Force JAVA_HOME when known; skipped when unset so no literal "${javaHome}" is exported. -->
126            <env key="JAVA_HOME" value="${javaHome}" if:set="javaHome" />
127            <!-- Build file and target: fixed tokens only, so whitespace splitting by 'line' is safe. -->
128            <arg line="-f build-icu-data.xml convert-impl"/>
129            <!-- All properties read by "convert-impl"; 'value' keeps paths with spaces as one argument. -->
130            <arg value="-DcldrDir=${cldrDataDir}"/>
131            <arg value="-DoutDir=${outDir}"/>
132            <arg value="-DgenCCodeDir=${genCCodeDir}"/>
133            <arg value="-DgenJavaCodeDir=${genJavaCodeDir}"/>
134            <arg value="-DdontGenCode=${dontGenCode}"/>
135            <arg value="-DspecialsDir=${specialsDir}"/>
136            <arg value="-DoutputTypes=${outputTypes}"/>
137            <arg value="-DicuVersion=${icuVersion}"/>
138            <arg value="-DicuDataVersion=${icuDataVersion}"/>
139            <arg value="-DcldrVersion=${cldrVersion}"/>
140            <arg value="-DminDraftStatus=${minDraftStatus}"/>
141            <arg value="-DlocaleIdFilter=${localeIdFilter}"/>
142            <arg value="-DincludePseudoLocales=${includePseudoLocales}"/>
143            <arg value="-DemitReport=${emitReport}"/>
144        </exec>
145    </target>
146
147    <!-- Do the actual CLDR data conversion (run via the "convert" wrapper target), using the properties
148         passed on the command line and the configuration in the "<convert>" element below. -->
149    <target name="convert-impl">
150        <taskdef name="convert" classname="org.unicode.icu.tool.cldrtoicu.ant.ConvertIcuDataTask">
151            <classpath>
152                <pathelement path="target/cldr-to-icu-1.0-SNAPSHOT-jar-with-dependencies.jar"/>
153            </classpath>
154        </taskdef>
155        <taskdef name="generateCode" classname="org.unicode.icu.tool.cldrtoicu.ant.GenerateCodeTask">
156            <classpath>
157                <pathelement path="target/cldr-to-icu-1.0-SNAPSHOT-jar-with-dependencies.jar"/>
158            </classpath>
159        </taskdef>
160        <convert cldrDir="${cldrDir}" outputDir="${outDir}" specialsDir="${specialsDir}"
161                 outputTypes="${outputTypes}" cldrVersion="${cldrVersion}"
162                 icuVersion="${icuVersion}" icuDataVersion="${icuDataVersion}"
163                 minimalDraftStatus="${minDraftStatus}" localeIdFilter="${localeIdFilter}"
164                 includePseudoLocales="${includePseudoLocales}" emitReport="${emitReport}">
165
166            <!-- The primary set of locale IDs to be generated by default. The IDs in this list are
167                 automatically expanded to include default scripts and all available regions. The
168                 rules are:
169
170                 1) Base languages are expanded to include default scripts (e.g. "en" -> "en_Latn").
171                 2) All region and variant subtags are added for any base language or language+script
172                    (e.g. "en" -> "en_GB" or "shi_Latn" -> "shi_Latn_MA").
173
174                 If a non-default script is desired it should be listed explicitly (e.g. "sr_Latn").
175
176                 Locale IDs with deprecated subtags (which become aliases) must still be listed in
177                 full (e.g. "en_RH" or "sr_Latn_YU").
178            -->
179            <localeIds>
180                // A
181                af, agq, ak, am, ar, ars, as, asa, ast, az, az_AZ, az_Cyrl
182
183                // B
184                bas, be, bem, bez, bg, bgc, bho, bm, bn, bo, br, brx, bs, bs_BA, bs_Cyrl
185
186                // C
187                ca, ccp, ce, ceb, cgg, chr, ckb, cs, cv, cy
188
189                // D
190                da, dav, de, dje, doi, dsb, dua, dyo, dz
191
192                // E
193                ebu, ee, el, en, en_NH, en_RH, eo, es, et, eu, ewo
194
195                // F
196                fa, ff, ff_Adlm, ff_CM, ff_GN, ff_MR, ff_SN, fi, fil, fo, fr, fur, fy
197
198                // G
199                ga, gd, gl, gsw, gu, guz, gv
200
201                // H
202                ha, haw, he, hi, hi_Latn, hr, hsb, hu, hy
203
204                // I
205                ia, id, ig, ii, in, in_ID, is, it, iw, iw_IL
206
207                // J
208                ja, jgo, jmc, jv
209
210                // K
211                ka, kab, kam, kde, kea, kgp, khq, ki, kk, kkj, kl, kln, km, kn, ko, kok, ks
212                ks_Deva, ks_IN, ksb, ksf, ksh, ku, kw, ky
213
214                // L
215                lag, lb, lg, lkt, ln, lo, lrc, lt, lu, luo, luy, lv
216
217                // M
218                mai, mas, mer, mfe, mg, mgh, mgo, mi, mk, ml, mn, mni, mni_IN, mo, mr, ms
219                mt, mua, my, mzn
220
221                // N
222                naq, nb, nd, ne, nl, nmg, nn, nnh, no, no_NO, no_NO_NY, nus, nyn
223
224                // O
225                om, or, os
226
227                // P
228                pa, pa_Arab, pa_IN, pa_PK, pcm, pl, ps, pt
229
230                // Q
231                qu
232
233                // R
234                raj, rm, rn, ro, rof, ru, rw, rwk
235
236                // S
237                sa, sah, saq, sat, sat_IN, sbp, sc, sd, sd_Deva, sd_IN, sd_PK, se, seh, ses, sg, sh, sh_BA, sh_CS, sh_YU
238                shi, shi_Latn, shi_MA, si, sk, sl, smn, sn, so, sq, sr, sr_BA, sr_CS, sr_Cyrl_CS, sr_Cyrl_YU, sr_Latn
239                sr_Latn_CS, sr_Latn_YU, sr_ME, sr_RS, sr_XK, sr_YU, su, su_ID, sv, sw
240
241                // T
242                ta, te, teo, tg, th, ti, tk, tl, tl_PH, to, tr, tt, twq, tzm
243
244                // U
245                ug, uk, ur, uz, uz_AF, uz_Arab, uz_Cyrl, uz_UZ
246
247                // V
248                vai, vai_LR, vai_Latn, vi, vun
249
250                // W
251                wae, wo
252
253                // X
254                xh, xog
255
256                // Y
257                yav, yi, yo, yrl, yue, yue_CN, yue_HK, yue_Hans
258
259                // Z
260                zgh, zh, zh_CN, zh_HK, zh_Hant, zh_MO, zh_SG, zh_TW, zu
261            </localeIds>
262
263            <!-- The following elements configure directories in which a subset of the available
264                 locales IDs should be generated. Unlike the main <localeIds> element, these
265                 filters must specify all locale IDs in full (but since they mostly select base
266                 languages, this isn't a big deal).
267
268                 As well as allowing some data directories to have a subset of available data (via
269                 the <localeIds> element) there are also mechanisms for controlling aliasing and
270                 the locale parent relation which allows the sharing of some ICU data in cases
271                 where it would otherwise need to be copied. The two mechanisms are:
272
273                 1: inheritLanguageSubtag: Used to rewrite the parent of a locale ID from "root" to
274                    its language subtag (e.g. "zh_Hant" has a natural parent of "root", but to allow
275                    some base language data to be shared it can be made to have a parent of "zh").
276
277                 2: forcedAlias: Used to add aliases for specific directories in order to affect the
278                    ICU behaviour in special cases.
279
280                 Between them these mechanisms are known as "tailorings" of the affected locales. -->
281            <!-- TODO: Explain why these special cases are needed/different. -->
282
283            <!-- Collation data is large, but also more sharable than other data, which is why there
284                 are a number of aliases and parent remappings for this directory. -->
285            <directory dir="coll" inheritLanguageSubtag="bs_Cyrl, sr_Latn, zh_Hant">
286                <!-- These aliases are to avoid needing to copy and maintain the same collation data
287                     for "zh" and "yue". The maximized version of "yue_Hans" is "yue_Hans_CN" (vs
288                     "zh_Hans_CN"), and for "yue" it's "yue_Hant_HK" (vs "zh_Hant_HK"), so the
289                     aliases are effectively just rewriting the base language. -->
290                <forcedAlias source="yue" target="zh_Hant"/>
291                <forcedAlias source="yue_Hant" target="zh_Hant"/>
292                <forcedAlias source="yue_CN" target="zh_Hans"/>
293                <forcedAlias source="yue_Hans" target="zh_Hans"/>
294                <forcedAlias source="yue_Hans_CN" target="zh_Hans"/>
295
296                <!-- TODO: Find out and document this properly. -->
297                <forcedAlias source="sr_ME" target="sr_Cyrl_ME"/>
298
299                <localeIds>
300                    root,
301
302                    // A-B
303                    af, am, ars, ar, as, az, be, bg, bn, bo, br, bs_Cyrl, bs,
304
305                    // C-F
306                    ca, ceb, chr, cs, cy, da, de_AT, de, dsb, dz, ee, el, en,
307                    en_US_POSIX, en_US, eo, es, et, fa_AF, fa, ff_Adlm, ff, fil, fi, fo, fr_CA, fr, fy,
308
309                    // G-J
310                    ga, gl, gu, ha, haw, he, hi, hr, hsb, hu, hy,
311                    id_ID, id, ig, in, in_ID, is, it, iw_IL, iw, ja,
312
313                    // K-P
314                    ka, kk, kl, km, kn, kok, ko, ku, ky, lb, lkt, ln, lo, lt, lv,
315                    mk, ml, mn, mo, mr, ms, mt, my, nb, nb_NO, ne, nl, nn, no, no_NO,
316                    om, or, pa_IN, pa, pa_Guru, pl, ps, pt,
317
318                    // R-T
319                    ro, ru, sa, se, sh_BA, sh_CS, sh, sh_YU, si, sk, sl, smn, sq,
320                    sr_BA, sr_Cyrl_ME, sr_Latn, sr_ME, sr_RS, sr, sv, sw,
321                    ta, te, th, tk, to, tr,
322
323                    // U-Z
324                    ug, uk, ur, uz, vi, wae, wo, xh, yi, yo, yue_CN, yue_Hans_CN, yue_Hans
325                    yue_Hant, yue, zh_CN, zh_Hans, zh_Hant, zh_HK, zh_MO, zh_SG, zh_TW, zh, zu
326                </localeIds>
327            </directory>
328
329            <directory dir="rbnf">
330                <!-- It is not at all clear why this is being done. It's certainly not exactly the
331                     same as above, since (a) the alias is reversed (b) "zh_Hant" does exist, with
332                     different data than "yue", so this alias is not just rewriting the base
333                     language. -->
334                <!-- TODO: Find out and document this properly. -->
335                <forcedAlias source="zh_Hant_HK" target="yue"/>
336
337                <localeIds>
338                    root,
339
340                    // A-E
341                    af, ak, am, ars, ar, az, be, bg, bs, ca, ccp, chr, cs, cy,
342                    da, de_CH, de, ee, el, en_001, en_IN, en, eo, es_419, es_DO,
343                    es_GT, es_HN, es_MX, es_NI, es_PA, es_PR, es_SV, es, es_US, et,
344
345                    // F-P
346                    fa_AF, fa, ff, fil, fi, fo, fr_BE, fr_CH, fr, ga, he, hi, hr,
347                    hu, hy, id, in, is, it, iw, ja, ka, kk, kl, km, ko, ky, lb,
348                    lo, lrc, lt, lv, mk, ms, mt, my, nb, ne, nl, nn, no, pl, pt_PT, pt,
349
350                    // Q-Z
351                    qu, ro, ru, se, sh, sk, sl, sq, sr_Latn, sr, su, sv, sw, ta, th, tr,
352                    uk, vi, yue_Hans, yue, zh_Hant_HK, zh_Hant, zh_HK, zh_MO, zh_TW, zh
353                </localeIds>
354            </directory>
355
356            <directory dir="brkitr" inheritLanguageSubtag="zh_Hant">
357                <localeIds>
358                    root,
359                    de, el, en, en_US_POSIX, en_US, es, fi, fr, it, ja, pt, ru, sv, zh_Hant, zh
360                </localeIds>
361            </directory>
362
363            <!-- GLOBAL ALIASES -->
364
365            <!-- Some spoken languages (e.g. "ars") inherit all their data from a written language
366                 (e.g. "ar_SA"). However CLDR doesn't currently support a way to represent that
367                 relationship. Unlike deprecated languages for which an alias can be inferred from
368                 the "languageAlias" CLDR data, there's no way in CLDR to represent the fact that
369                 we want "ars" (a non-deprecated language) to inherit the data of "ar_SA".
370
371                 This alias is the first example of potentially many cases where ICU needs to
372                 generate an alias in order to effect "sideways inheritance" for spoken languages,
373                 and at some stage it should probably be supported properly in the CLDR data. -->
374            <forcedAlias source="ars" target="ar_SA"/>
375
376            <!-- A legacy global alias (note that "no_NO_NY" is not even structurally valid). -->
377            <forcedAlias source="no_NO_NY" target="nn_NO"/>
378
379            <!-- This one is a bit silly, it is just to generate a stub for no_NO, which is
380                 not in CLDR. If we do not do this, then including it in localeIds will generate
381                 empty no_Latn and no_Latn_NO and then no_NO aliasing to no_Latn_NO. -->
382            <forcedAlias source="no_NO" target="no"/>
383
384            <!-- ALTERNATE VALUES -->
385
386            <!-- The following elements configure alternate values for some special case paths.
387                 The target path will only be replaced if both it, and the source path, exist in
388                 the CLDR data (paths will not be modified if only the source path exists).
389
390                 Since the paths must represent the same semantic type of data, they must be in the
391                 same "namespace" (same element names) and must not contain value attributes. Thus
392                 they can only differ by distinguishing attributes (either added or modified).
393
394                 This feature is typically used to select alternate translations (e.g. short forms)
395                 for certain paths. -->
396            <!-- <altPath target="//path/to/value[@attr='foo']"
397                          source="//path/to/value[@attr='bar']"
398                          locales="xx,yy_ZZ"/> -->
399        </convert>
400
401        <generateCode cldrDir="${cldrDir}" cOutDir="${genCCodeDir}" javaOutDir="${genJavaCodeDir}" unless:true="${dontGenCode}" />
402    </target>
403
404    <target name="clean" depends="init-args, prepare-jar">
405        <taskdef name="outputDirectories" classname="org.unicode.icu.tool.cldrtoicu.ant.CleanOutputDirectoryTask">
406            <classpath>
407                <pathelement path="target/cldr-to-icu-1.0-SNAPSHOT-jar-with-dependencies.jar"/>
408            </classpath>
409        </taskdef>
410        <taskdef name="generateCode" classname="org.unicode.icu.tool.cldrtoicu.ant.GenerateCodeTask">
411            <classpath>
412                <pathelement path="target/cldr-to-icu-1.0-SNAPSHOT-jar-with-dependencies.jar"/>
413            </classpath>
414        </taskdef>
415
416        <!-- If a directory is listed here, then every file in it is assumed to be automatically
417             generated by the conversion tool, unless it is explicitly listed in a <retain> element.
418             The tool then checks every file to determine if it has the expected header present,
419             indicating that it was automatically generated, before deleting it.
420
421             If unexpected files are found, the "clean" task will fail without deleting anything
422             (unless 'forceDelete' is set to override this). Note that even if 'forceDelete' is set,
423             the files listed explicitly below will never be deleted by this process.
424
425             This two-step approach minimizes the risk that the conversion process will ever
426             accidentally delete a manually maintained file.
427             -->
428        <outputDirectories root="${outDir}" forceDelete="${forceDelete}">
429            <dir name="brkitr">
430                <retain path="dictionaries"/>
431                <retain path="lstm"/>
432                <retain path="rules"/>
433            </dir>
434            <dir name="coll">
435                <!-- Legacy files whose file names aren't supported for automatic generation.
436                     Simple to maintain manually and unlikely to ever change again. -->
437                <retain path="de__PHONEBOOK.txt"/>
438                <retain path="de_.txt"/>
439                <retain path="es__TRADITIONAL.txt"/>
440                <retain path="es_.txt"/>
441            </dir>
442            <dir name="curr"/>
443            <dir name="lang"/>
444            <dir name="locales"/>
445            <dir name="misc">
446                <!-- Machine generated files produced by different tools.
447                     Possibly worth moving into the new LDML conversion tool one day. -->
448                <retain path="currencyNumericCodes.txt"/>
449                <retain path="zoneinfo64.txt"/>
450                <!-- Project file (not ICU data), unlikely to ever be auto-generated. -->
451                <retain path="icudata.rc"/>
452                <!-- Small high-level metadata file, stable and easy to maintain manually. -->
453                <retain path="icustd.txt"/>
454            </dir>
455            <dir name="rbnf"/>
456            <dir name="region"/>
457            <dir name="translit">
458                <!-- Small, easy to maintain, special case top-level files. -->
459                <retain path="en.txt"/>
460                <retain path="el.txt"/>
461            </dir>
462            <dir name="unit"/>
463            <dir name="zone">
464                <!-- Manually edited to support TZ database name compatibility. -->
465                <retain path="tzdbNames.txt"/>
466            </dir>
467        </outputDirectories>
468
469        <generateCode cOutDir="${genCCodeDir}" javaOutDir="${genJavaCodeDir}" action="clean" />
470    </target>
471</project>
472
473