<?xml version="1.0" encoding="UTF-8"?>
<!-- © 2019 and later: Unicode, Inc. and others.
     License & terms of use: http://www.unicode.org/copyright.html -->

<!--================================================================================
    Setup:
    Follow the installation instructions in README.txt in this directory.

    To build ICU data files:
    1: Determine the CLDR data directory (typically ending in '/production') and set
       the CLDR_DATA_DIR environment variable (or pass -DcldrDataDir=<dir>).
    2: Determine the flags required (see the list of properties below).
    3: Run: ant -f build-icu-data.xml -D<flag-name>=<flag-value>...
    ================================================================================-->
<!-- TODO: Add things like copying of a template directory and deleting previous files
     (perhaps always generate into a temporary directory and copy back to avoid having
     inconsistent state when the conversion is cancelled). -->
<project name="Convert" default="all" basedir="." xmlns:if="ant:if" xmlns:unless="ant:unless">

    <!-- Class-path of the standalone tool JAR (see "prepare-jar"). Declared once at the
         project level so that every <taskdef>, in both the outer Ant process and the
         exec'ed "convert-impl" process, refers to the same location. -->
    <path id="cldrToIcu.classpath">
        <pathelement path="target/cldr-to-icu-1.0-SNAPSHOT-jar-with-dependencies.jar"/>
    </path>

    <target name="all" depends="init-args, prepare-jar, clean, convert"/>

    <!-- Initialize the properties which were not already set on the command line. -->
    <target name="init-args">
        <property environment="env"/>
        <!-- Inherit properties from environment variable unless specified. As usual
             with Ant, this is messier than it should be. All we are saying here is:
             "Use the property if explicitly set, otherwise use the environment variable."
             We cannot just set the property to the environment variable, since expansion
             fails for non existent properties, and you are left with a literal value of
             "${env.CLDR_DATA_DIR}". -->
        <condition property="cldrDataDir" value="${env.CLDR_DATA_DIR}">
            <isset property="env.CLDR_DATA_DIR"/>
        </condition>
        <fail unless="cldrDataDir"
            message="Set the CLDR_DATA_DIR environment variable (or cldrDataDir property) to the CLDR data directory (typically ending in '/production')"/>

        <!-- Ant does not inherit this from the user's environment (and it can matter).
             This is only needed because we have to "exec" a new Ant task below.
             The property is deliberately left unset when JAVA_HOME is not defined, so
             that the "convert" target can skip forwarding it. -->
        <condition property="javaHome" value="${env.JAVA_HOME}">
            <isset property="env.JAVA_HOME"/>
        </condition>

        <!-- The output directory into which to write the converted ICU data. By default
             this will overwrite (without deletion) the ICU data files in this ICU release,
             so it is recommended that for testing, it be set to another value. -->
        <property name="outDir" value="${basedir}/../../../icu4c/source/data/"/>

        <!-- The output directory into which to write generated C/C++ code. By default
             this will overwrite (without deletion) the generated C/C++ files in this
             ICU release, so it is recommended that for testing, it be set to another value. -->
        <property name="genCCodeDir" value="${basedir}/../../../icu4c/source/"/>

        <!-- The output directory into which to write generated Java code. By default
             this will overwrite (without deletion) the generated Java files in this
             ICU release, so it is recommended that for testing, it be set to another value. -->
        <property name="genJavaCodeDir" value="${basedir}/../../../icu4j/main/classes/core"/>

        <!-- Set this to true to prevent build-icu-data.xml from generating the generated
             ICU source files -->
        <property name="dontGenCode" value="false" />

        <!-- The directory in which the additional ICU XML data is stored. -->
        <property name="specialsDir" value="${basedir}/../../../icu4c/source/data/xml"/>

        <!-- Default value for ICU version (icuver.txt). Update this for each release. -->
        <property name="icuVersion" value="72.1.0.0"/>

        <!-- Default value for ICU data version (icuver.txt). Update this for each release. -->
        <property name="icuDataVersion" value="72.1.0.0"/>

        <!-- An override for the CLDR version string (icuver.txt and others). This will be
             extracted from the CLDR library used for building the data if not set here. -->
        <property name="cldrVersion" value=""/>

        <!-- The minimum draft status for CLDR data to be used in the conversion. See
             CldrDraftStatus for more details. -->
        <property name="minDraftStatus" value="contributed"/>

        <!-- A regular expression to match the locale IDs to be generated (useful for
             debugging specific regions). This is applied after locale ID specifications
             have been expanded into full locale IDs, so the value "en" will NOT match
             "en_GB" or "en_001" etc. -->
        <property name="localeIdFilter" value=""/>

        <!-- Whether to synthetically generate "pseudo locale" data ("en_XA" and "ar_XB"). -->
        <property name="includePseudoLocales" value="false"/>

        <!-- Whether to emit a debug report containing some possibly useful information after
             the conversion has finished. -->
        <!-- TODO: Currently this isn't hugely useful, so find out what people want. -->
        <property name="emitReport" value="false"/>

        <!-- List of output "types" to be generated (e.g. "rbnf,plurals,locales"); an empty
             list means "build everything".

             Note that the grouping of types is based on the legacy converter behaviour and
             is not always directly associated with an output directory (e.g. "locales"
             produces locale data for curr/, lang/, main/, region/, unit/, zone/ but NOT
             coll/, brkitr/ or rbnf/).

             Pass in the value "HELP" (or any invalid value) to see the full list of types. -->
        <!-- TODO: Find out what common use cases are and use them. -->
        <property name="outputTypes" value=""/>

        <!-- Override to force the 'clean' task to delete files it cannot determine to be
             auto-generated by this tool. This is useful if the file header changes since
             the heading is what's used to recognize auto-generated files. -->
        <property name="forceDelete" value="false"/>
    </target>

    <!-- Build a standalone JAR which is called by Ant (and which avoids needing to mess
         about making Ant know the Maven class-path). -->
    <target name="prepare-jar" depends="init-args">
        <exec executable="mvn" searchpath="true" failonerror="true">
            <arg value="compile"/>
        </exec>
    </target>

    <!-- Somewhat hacky wrapper target which invokes the real conversion task.
         This is done so we can set the environment variable of the new process and
         effectively overwrite the CLDR_DIR value. If ever the CLDR library doesn't
         need to use CLDR_DIR at runtime to find the production data, this can all be
         removed. -->
    <target name="convert" depends="init-args, prepare-jar">
        <exec executable="ant" searchpath="true" failonerror="true">
            <!-- The CLDR library wants CLDR_DIR set, to the data directory. -->
            <env key="CLDR_DIR" value="${cldrDataDir}" />
            <!-- Force inherit JAVA_HOME (this can be important). Only forwarded when the
                 property is actually set; otherwise the child process would receive the
                 unexpanded literal "${javaHome}" as its JAVA_HOME. -->
            <env key="JAVA_HOME" value="${javaHome}" if:set="javaHome" />
            <!-- Initial Ant command line with all the "interesting" bit in. Each argument
                 is passed as a separate value (rather than one whitespace-split "line") so
                 that a CLDR data directory containing spaces is passed through intact. -->
            <arg value="-f"/>
            <arg value="build-icu-data.xml"/>
            <arg value="convert-impl"/>
            <arg value="-DcldrDir=${cldrDataDir}"/>
            <!-- List all properties in the "convert-impl" task (except cldrDir). -->
            <arg value="-DoutDir=${outDir}"/>
            <arg value="-DgenCCodeDir=${genCCodeDir}"/>
            <arg value="-DgenJavaCodeDir=${genJavaCodeDir}"/>
            <arg value="-DdontGenCode=${dontGenCode}"/>
            <arg value="-DspecialsDir=${specialsDir}"/>
            <arg value="-DoutputTypes=${outputTypes}"/>
            <arg value="-DicuVersion=${icuVersion}"/>
            <arg value="-DicuDataVersion=${icuDataVersion}"/>
            <arg value="-DcldrVersion=${cldrVersion}"/>
            <arg value="-DminDraftStatus=${minDraftStatus}"/>
            <arg value="-DlocaleIdFilter=${localeIdFilter}"/>
            <arg value="-DincludePseudoLocales=${includePseudoLocales}"/>
            <arg value="-DemitReport=${emitReport}"/>
        </exec>
    </target>

    <!-- Do the actual CLDR data conversion, based on the command line arguments, built in
         default properties and the configuration in the "<convert>" element below. -->
    <target name="convert-impl">
        <taskdef name="convert" classname="org.unicode.icu.tool.cldrtoicu.ant.ConvertIcuDataTask"
            classpathref="cldrToIcu.classpath"/>
        <taskdef name="generateCode" classname="org.unicode.icu.tool.cldrtoicu.ant.GenerateCodeTask"
            classpathref="cldrToIcu.classpath"/>
        <convert cldrDir="${cldrDir}" outputDir="${outDir}" specialsDir="${specialsDir}"
            outputTypes="${outputTypes}" cldrVersion="${cldrVersion}"
            icuVersion="${icuVersion}" icuDataVersion="${icuDataVersion}"
            minimalDraftStatus="${minDraftStatus}" localeIdFilter="${localeIdFilter}"
            includePseudoLocales="${includePseudoLocales}" emitReport="${emitReport}">

            <!-- The primary set of locale IDs to be generated by default. The IDs in this list are
                 automatically expanded to include default scripts and all available regions. The
                 rules are:

                 1) Base languages are expanded to include default scripts (e.g. "en" -> "en_Latn").
                 2) All region and variant subtags are added for any base language or language+script
                    (e.g. "en" -> "en_GB" or "shi_Latn" -> "shi_Latn_MA").

                 If a non-default script is desired it should be listed explicitly (e.g. "sr_Latn").

                 Locale IDs with deprecated subtags (which become aliases) must still be listed in
                 full (e.g. "en_RH" or "sr_Latn_YU").
            -->
            <localeIds>
                // A
                af, agq, ak, am, ar, ars, as, asa, ast, az, az_AZ, az_Cyrl

                // B
                bas, be, bem, bez, bg, bgc, bho, bm, bn, bo, br, brx, bs, bs_BA, bs_Cyrl

                // C
                ca, ccp, ce, ceb, cgg, chr, ckb, cs, cv, cy

                // D
                da, dav, de, dje, doi, dsb, dua, dyo, dz

                // E
                ebu, ee, el, en, en_NH, en_RH, eo, es, et, eu, ewo

                // F
                fa, ff, ff_Adlm, ff_CM, ff_GN, ff_MR, ff_SN, fi, fil, fo, fr, fur, fy

                // G
                ga, gd, gl, gsw, gu, guz, gv

                // H
                ha, haw, he, hi, hi_Latn, hr, hsb, hu, hy

                // I
                ia, id, ig, ii, in, in_ID, is, it, iw, iw_IL

                // J
                ja, jgo, jmc, jv

                // K
                ka, kab, kam, kde, kea, kgp, khq, ki, kk, kkj, kl, kln, km, kn, ko, kok, ks
                ks_Deva, ks_IN, ksb, ksf, ksh, ku, kw, ky

                // L
                lag, lb, lg, lkt, ln, lo, lrc, lt, lu, luo, luy, lv

                // M
                mai, mas, mer, mfe, mg, mgh, mgo, mi, mk, ml, mn, mni, mni_IN, mo, mr, ms
                mt, mua, my, mzn

                // N
                naq, nb, nd, ne, nl, nmg, nn, nnh, no, no_NO, no_NO_NY, nus, nyn

                // O
                om, or, os

                // P
                pa, pa_Arab, pa_IN, pa_PK, pcm, pl, ps, pt

                // Q
                qu

                // R
                raj, rm, rn, ro, rof, ru, rw, rwk

                // S
                sa, sah, saq, sat, sat_IN, sbp, sc, sd, sd_Deva, sd_IN, sd_PK, se, seh, ses, sg, sh, sh_BA, sh_CS, sh_YU
                shi, shi_Latn, shi_MA, si, sk, sl, smn, sn, so, sq, sr, sr_BA, sr_CS, sr_Cyrl_CS, sr_Cyrl_YU, sr_Latn
                sr_Latn_CS, sr_Latn_YU, sr_ME, sr_RS, sr_XK, sr_YU, su, su_ID, sv, sw

                // T
                ta, te, teo, tg, th, ti, tk, tl, tl_PH, to, tr, tt, twq, tzm

                // U
                ug, uk, ur, uz, uz_AF, uz_Arab, uz_Cyrl, uz_UZ

                // V
                vai, vai_LR, vai_Latn, vi, vun

                // W
                wae, wo

                // X
                xh, xog

                // Y
                yav, yi, yo, yrl, yue, yue_CN, yue_HK, yue_Hans

                // Z
                zgh, zh, zh_CN, zh_HK, zh_Hant, zh_MO, zh_SG, zh_TW, zu
            </localeIds>

            <!-- The following elements configure directories in which a subset of the available
                 locales IDs should be generated. Unlike the main <localeIds> element, these
                 filters must specify all locale IDs in full (but since they mostly select base
                 languages, this isn't a big deal).

                 As well as allowing some data directories to have a subset of available data (via
                 the <localeIds> element) there are also mechanisms for controlling aliasing and
                 the locale parent relation which allows the sharing of some ICU data in cases
                 where it would otherwise need to be copied. The two mechanisms are:

                 1: inheritLanguageSubtag: Used to rewrite the parent of a locale ID from "root" to
                    its language subtag (e.g. "zh_Hant" has a natural parent of "root", but to allow
                    some base language data to be shared it can be made to have a parent of "zh").

                 2: forcedAlias: Used to add aliases for specific directories in order to affect the
                    ICU behaviour in special cases.

                 Between them these mechanisms are known as "tailorings" of the affected locales. -->
            <!-- TODO: Explain why these special cases are needed/different. -->

            <!-- Collation data is large, but also more sharable than other data, which is why there
                 are a number of aliases and parent remappings for this directory. -->
            <directory dir="coll" inheritLanguageSubtag="bs_Cyrl, sr_Latn, zh_Hant">
                <!-- These aliases are to avoid needing to copy and maintain the same collation data
                     for "zh" and "yue". The maximized version of "yue_Hans" is "yue_Hans_CN" (vs
                     "zh_Hans_CN"), and for "yue" it's "yue_Hant_HK" (vs "zh_Hant_HK"), so the
                     aliases are effectively just rewriting the base language. -->
                <forcedAlias source="yue" target="zh_Hant"/>
                <forcedAlias source="yue_Hant" target="zh_Hant"/>
                <forcedAlias source="yue_CN" target="zh_Hans"/>
                <forcedAlias source="yue_Hans" target="zh_Hans"/>
                <forcedAlias source="yue_Hans_CN" target="zh_Hans"/>

                <!-- TODO: Find out and document this properly. -->
                <forcedAlias source="sr_ME" target="sr_Cyrl_ME"/>

                <localeIds>
                    root,

                    // A-B
                    af, am, ars, ar, as, az, be, bg, bn, bo, br, bs_Cyrl, bs,

                    // C-F
                    ca, ceb, chr, cs, cy, da, de_AT, de, dsb, dz, ee, el, en,
                    en_US_POSIX, en_US, eo, es, et, fa_AF, fa, ff_Adlm, ff, fil, fi, fo, fr_CA, fr, fy,

                    // G-J
                    ga, gl, gu, ha, haw, he, hi, hr, hsb, hu, hy,
                    id_ID, id, ig, in, in_ID, is, it, iw_IL, iw, ja,

                    // K-P
                    ka, kk, kl, km, kn, kok, ko, ku, ky, lb, lkt, ln, lo, lt, lv,
                    mk, ml, mn, mo, mr, ms, mt, my, nb, nb_NO, ne, nl, nn, no, no_NO,
                    om, or, pa_IN, pa, pa_Guru, pl, ps, pt,

                    // R-T
                    ro, ru, sa, se, sh_BA, sh_CS, sh, sh_YU, si, sk, sl, smn, sq,
                    sr_BA, sr_Cyrl_ME, sr_Latn, sr_ME, sr_RS, sr, sv, sw,
                    ta, te, th, tk, to, tr,

                    // U-Z
                    ug, uk, ur, uz, vi, wae, wo, xh, yi, yo, yue_CN, yue_Hans_CN, yue_Hans
                    yue_Hant, yue, zh_CN, zh_Hans, zh_Hant, zh_HK, zh_MO, zh_SG, zh_TW, zh, zu
                </localeIds>
            </directory>

            <directory dir="rbnf">
                <!-- It is not at all clear why this is being done. It's certainly not exactly the
                     same as above, since (a) the alias is reversed (b) "zh_Hant" does exist, with
                     different data than "yue", so this alias is not just rewriting the base
                     language. -->
                <!-- TODO: Find out and document this properly. -->
                <forcedAlias source="zh_Hant_HK" target="yue"/>

                <localeIds>
                    root,

                    // A-E
                    af, ak, am, ars, ar, az, be, bg, bs, ca, ccp, chr, cs, cy,
                    da, de_CH, de, ee, el, en_001, en_IN, en, eo, es_419, es_DO,
                    es_GT, es_HN, es_MX, es_NI, es_PA, es_PR, es_SV, es, es_US, et,

                    // F-P
                    fa_AF, fa, ff, fil, fi, fo, fr_BE, fr_CH, fr, ga, he, hi, hr,
                    hu, hy, id, in, is, it, iw, ja, ka, kk, kl, km, ko, ky, lb,
                    lo, lrc, lt, lv, mk, ms, mt, my, nb, ne, nl, nn, no, pl, pt_PT, pt,

                    // Q-Z
                    qu, ro, ru, se, sh, sk, sl, sq, sr_Latn, sr, su, sv, sw, ta, th, tr,
                    uk, vi, yue_Hans, yue, zh_Hant_HK, zh_Hant, zh_HK, zh_MO, zh_TW, zh
                </localeIds>
            </directory>

            <directory dir="brkitr" inheritLanguageSubtag="zh_Hant">
                <localeIds>
                    root,
                    de, el, en, en_US_POSIX, en_US, es, fi, fr, it, ja, pt, ru, sv, zh_Hant, zh
                </localeIds>
            </directory>

            <!-- GLOBAL ALIASES -->

            <!-- Some spoken languages (e.g. "ars") inherit all their data from a written language
                 (e.g. "ar_SA"). However CLDR doesn't currently support a way to represent that
                 relationship. Unlike deprecated languages for which an alias can be inferred from
                 the "languageAlias" CLDR data, there's no way in CLDR to represent the fact that
                 we want "ars" (a non-deprecated language) to inherit the data of "ar_SA".

                 This alias is the first example of potentially many cases where ICU needs to
                 generate an alias in order to affect "sideways inheritance" for spoken languages,
                 and at some stage it should probably be supported properly in the CLDR data. -->
            <forcedAlias source="ars" target="ar_SA"/>

            <!-- A legacy global alias (note that "no_NO_NY" is not even structurally valid). -->
            <forcedAlias source="no_NO_NY" target="nn_NO"/>

            <!-- This one is a bit silly, it is just to generate a stub for no_NO, which is
                 not in CLDR. If we do not do this, then including it in localeIds will generate
                 empty no_Latn and no_Latn_NO and then no_NO aliasing to no_Latn_NO. -->
            <forcedAlias source="no_NO" target="no"/>

            <!-- ALTERNATE VALUES -->

            <!-- The following elements configure alternate values for some special case paths.
                 The target path will only be replaced if both it, and the source path, exist in
                 the CLDR data (paths will not be modified if only the source path exists).

                 Since the paths must represent the same semantic type of data, they must be in the
                 same "namespace" (same element names) and must not contain value attributes. Thus
                 they can only differ by distinguishing attributes (either added or modified).

                 This feature is typically used to select alternate translations (e.g. short forms)
                 for certain paths. -->
            <!-- <altPath target="//path/to/value[@attr='foo']"
                          source="//path/to/value[@attr='bar']"
                          locales="xx,yy_ZZ"/> -->
        </convert>

        <!-- Generated source code is skipped entirely when dontGenCode is true. -->
        <generateCode cldrDir="${cldrDir}" cOutDir="${genCCodeDir}" javaOutDir="${genJavaCodeDir}" unless:true="${dontGenCode}" />
    </target>

    <target name="clean" depends="init-args, prepare-jar">
        <taskdef name="outputDirectories" classname="org.unicode.icu.tool.cldrtoicu.ant.CleanOutputDirectoryTask"
            classpathref="cldrToIcu.classpath"/>
        <taskdef name="generateCode" classname="org.unicode.icu.tool.cldrtoicu.ant.GenerateCodeTask"
            classpathref="cldrToIcu.classpath"/>

        <!-- If a directory is listed here, then every file in it is assumed to be automatically
             generated by the conversion tool, unless it is explicitly listed in a <retain> element.
             The tool then checks every file to determine if it has the expected header present,
             indicating that it was automatically generated, before deleting it.

             If unexpected files are found, the "clean" task will fail without deleting anything
             (unless 'forceDelete' is set to override this). Note that even if 'forceDelete' is set,
             the files listed explicitly below will never be deleted by this process.

             This two-step approach minimizes the risk that the conversion process will ever
             accidentally delete a manually maintained file.
        -->
        <outputDirectories root="${outDir}" forceDelete="${forceDelete}">
            <dir name="brkitr">
                <retain path="dictionaries"/>
                <retain path="lstm"/>
                <retain path="rules"/>
            </dir>
            <dir name="coll">
                <!-- Legacy files whose file names aren't supported for automatic generation.
                     Simple to maintain manually and unlikely to ever change again. -->
                <retain path="de__PHONEBOOK.txt"/>
                <retain path="de_.txt"/>
                <retain path="es__TRADITIONAL.txt"/>
                <retain path="es_.txt"/>
            </dir>
            <dir name="curr"/>
            <dir name="lang"/>
            <dir name="locales"/>
            <dir name="misc">
                <!-- Machine generated files produced by different tools.
                     Possibly worth moving into the new LDML conversion tool one day. -->
                <retain path="currencyNumericCodes.txt"/>
                <retain path="zoneinfo64.txt"/>
                <!-- Project file (not ICU data), unlikely to ever be auto-generated. -->
                <retain path="icudata.rc"/>
                <!-- Small high-level metadata file, stable and easy to maintain manually. -->
                <retain path="icustd.txt"/>
            </dir>
            <dir name="rbnf"/>
            <dir name="region"/>
            <dir name="translit">
                <!-- Small, easy to maintain, special case top-level files. -->
                <retain path="en.txt"/>
                <retain path="el.txt"/>
            </dir>
            <dir name="unit"/>
            <dir name="zone">
                <!-- Manually edited to support TZ database name compatibility. -->
                <retain path="tzdbNames.txt"/>
            </dir>
        </outputDirectories>

        <generateCode cOutDir="${genCCodeDir}" javaOutDir="${genJavaCodeDir}" action="clean" />
    </target>
</project>