1 package org.unicode.cldr.tool; 2 3 import java.io.BufferedReader; 4 import java.io.File; 5 import java.io.IOException; 6 import java.io.PrintWriter; 7 import java.util.Arrays; 8 import java.util.BitSet; 9 import java.util.Collection; 10 import java.util.Comparator; 11 import java.util.HashMap; 12 import java.util.HashSet; 13 import java.util.LinkedHashSet; 14 import java.util.List; 15 import java.util.Map; 16 import java.util.Map.Entry; 17 import java.util.Set; 18 import java.util.TreeMap; 19 import java.util.TreeSet; 20 21 import org.unicode.cldr.draft.FileUtilities; 22 import org.unicode.cldr.draft.ScriptMetadata; 23 import org.unicode.cldr.draft.ScriptMetadata.Info; 24 import org.unicode.cldr.util.Builder; 25 import org.unicode.cldr.util.CLDRFile; 26 import org.unicode.cldr.util.CLDRLocale; 27 import org.unicode.cldr.util.CLDRPaths; 28 import org.unicode.cldr.util.CldrUtility; 29 import org.unicode.cldr.util.Containment; 30 import org.unicode.cldr.util.Counter; 31 import org.unicode.cldr.util.Factory; 32 import org.unicode.cldr.util.Iso639Data; 33 import org.unicode.cldr.util.Iso639Data.Scope; 34 import org.unicode.cldr.util.LanguageTagParser; 35 import org.unicode.cldr.util.LocaleIDParser; 36 import org.unicode.cldr.util.Log; 37 import org.unicode.cldr.util.PatternCache; 38 import org.unicode.cldr.util.SimpleFactory; 39 import org.unicode.cldr.util.StandardCodes; 40 import org.unicode.cldr.util.SupplementalDataInfo; 41 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData; 42 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData.Type; 43 import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus; 44 import org.unicode.cldr.util.SupplementalDataInfo.PopulationData; 45 46 import com.google.common.collect.ImmutableMap; 47 import com.google.common.collect.ImmutableSet; 48 import com.ibm.icu.dev.util.CollectionUtilities; 49 import com.ibm.icu.impl.Relation; 50 import com.ibm.icu.impl.Row; 51 import com.ibm.icu.impl.Row.R2; 52 
import com.ibm.icu.impl.Row.R3; 53 import com.ibm.icu.impl.Row.R4; 54 import com.ibm.icu.lang.UScript; 55 import com.ibm.icu.text.Collator; 56 import com.ibm.icu.text.NumberFormat; 57 import com.ibm.icu.text.UTF16; 58 import com.ibm.icu.text.UnicodeSet; 59 import com.ibm.icu.text.UnicodeSetIterator; 60 import com.ibm.icu.util.ULocale; 61 62 /** 63 * Problems: 64 * "und_Hani", "zh_Hani" 65 * "und_Sinh", "si_Sinh" 66 * 67 * @author markdavis 68 * 69 */ 70 public class GenerateMaximalLocales { 71 72 private static final String TEMP_UNKNOWN_REGION = "XZ"; 73 74 private static final String DEBUG_ADD_KEY = "und_Latn_ZA"; 75 76 private static final boolean SHOW_ADD = CldrUtility.getProperty("GenerateMaximalLocalesDebug", false); 77 private static final boolean SUPPRESS_CHANGES = CldrUtility.getProperty("GenerateMaximalLocalesSuppress", false); 78 private static final boolean SHOW_CONTAINERS = false; 79 80 enum OutputStyle { 81 PLAINTEXT, C, C_ALT, XML 82 }; 83 84 private static OutputStyle OUTPUT_STYLE = OutputStyle.valueOf(CldrUtility.getProperty("OutputStyle", "XML", "XML") 85 .toUpperCase()); 86 87 // set based on above 88 private static final String SEPARATOR = OUTPUT_STYLE == OutputStyle.C || OUTPUT_STYLE == OutputStyle.C_ALT ? CldrUtility.LINE_SEPARATOR 89 : "\t"; 90 private static final String TAG_SEPARATOR = OUTPUT_STYLE == OutputStyle.C_ALT ? 
"-" : "_"; 91 // private static final boolean FAVOR_REGION = true; // OUTPUT_STYLE == OutputStyle.C_ALT; 92 93 private static final boolean tryDifferent = true; 94 95 private static final File list[] = { 96 new File(CLDRPaths.MAIN_DIRECTORY), 97 new File(CLDRPaths.SEED_DIRECTORY), 98 new File(CLDRPaths.EXEMPLARS_DIRECTORY) }; 99 100 private static Factory factory = SimpleFactory.make(list, ".*"); 101 private static SupplementalDataInfo supplementalData = SupplementalDataInfo 102 .getInstance(CLDRPaths.SUPPLEMENTAL_DIRECTORY); 103 private static StandardCodes standardCodes = StandardCodes.make(); 104 private static CLDRFile english = factory.make("en", false); 105 static Relation<String, String> cldrContainerToLanguages = Relation.of(new HashMap<String, Set<String>>(), HashSet.class); 106 static { 107 for (CLDRLocale locale : ToolConfig.getToolInstance().getCldrFactory().getAvailableCLDRLocales()) { 108 String region = locale.getCountry(); 109 if (region == null || region.isEmpty() || Containment.isLeaf(region)) { 110 continue; 111 } cldrContainerToLanguages.put(region, locale.getLanguage())112 cldrContainerToLanguages.put(region, locale.getLanguage()); 113 } cldrContainerToLanguages.freeze()114 cldrContainerToLanguages.freeze(); 115 System.out.println("Keep containers " + cldrContainerToLanguages); 116 } 117 118 private static final List<String> KEEP_TARGETS = Arrays.asList("und_Arab_PK", "und_Latn_ET"); 119 private static final ImmutableSet<String> deprecatedISONotInLST = ImmutableSet.of("scc", "scr"); 120 121 /** 122 * This is the simplest way to override, by supplying the max value. 123 * It gets a very low weight, so doesn't override any stronger value. 
124 */ 125 private static final String[] MAX_ADDITIONS = new String[] { 126 "bss_Latn_CM", 127 "gez_Ethi_ET", 128 "ken_Latn_CM", 129 "und_Arab_PK", 130 "wa_Latn_BE", 131 132 "fub_Arab_CM", 133 "fuf_Latn_GN", 134 "kby_Arab_NE", 135 "kdh_Arab_TG", 136 "apd_Arab_TG", 137 "zlm_Latn_TG", 138 139 "cr_Cans_CA", 140 "hif_Latn_FJ", 141 "gon_Telu_IN", 142 "lzz_Latn_TR", 143 "lif_Deva_NP", 144 "unx_Beng_IN", 145 "unr_Beng_IN", 146 "ttt_Latn_AZ", 147 "pnt_Grek_GR", 148 "tly_Latn_AZ", 149 "tkr_Latn_AZ", 150 "bsq_Bass_LR", 151 "ccp_Cakm_BD", 152 "blt_Tavt_VN", 153 "rhg_Arab_MM", 154 "rhg_Rohg_MM", 155 }; 156 157 /** 158 * The following overrides MASH the final values, so they may not result in consistent results. Safer is to add to MAX_ADDITIONS. 159 * However, if you add, add both the language and language+script mappings. 160 */ 161 // Many of the overrides below can be removed once the language/pop/country data is updated. 162 private static final Map<String, String> LANGUAGE_OVERRIDES = CldrUtility.asMap(new String[][] { 163 { "eo", "eo_Latn_001" }, 164 { "eo_Latn", "eo_Latn_001" }, 165 { "es", "es_Latn_ES" }, 166 { "es_Latn", "es_Latn_ES" }, 167 { "ff_BF", "ff_Latn_BF" }, 168 { "ff_GM", "ff_Latn_GM" }, 169 { "ff_GH", "ff_Latn_GH" }, 170 { "ff_GW", "ff_Latn_GW" }, 171 { "ff_LR", "ff_Latn_LR" }, 172 { "ff_NE", "ff_Latn_NE" }, 173 { "ff_NG", "ff_Latn_NG" }, 174 { "ff_SL", "ff_Latn_SL" }, 175 { "ff_Adlm", "ff_Adlm_GN" }, 176 { "ia", "ia_Latn_001" }, 177 { "ia_Latn", "ia_Latn_001" }, 178 { "io", "io_Latn_001" }, 179 { "io_Latn", "io_Latn_001" }, 180 { "jbo", "jbo_Latn_001" }, 181 { "jbo_Latn", "jbo_Latn_001" }, 182 { "ku_Arab", "ku_Arab_IQ" }, 183 { "lrc", "lrc_Arab_IR" }, 184 { "lrc_Arab", "lrc_Arab_IR" }, 185 { "man", "man_Latn_GM" }, 186 { "man_Latn", "man_Latn_GM" }, 187 { "mas", "mas_Latn_KE" }, 188 { "mas_Latn", "mas_Latn_KE" }, 189 { "mn", "mn_Cyrl_MN" }, 190 { "mn_Cyrl", "mn_Cyrl_MN" }, 191 { "mro", "mro_Mroo_BD" }, 192 { "mro_BD", "mro_Mroo_BD" }, 193 { "ms_Arab", 
"ms_Arab_MY" }, 194 { "pap", "pap_Latn_AW" }, 195 { "pap_Latn", "pap_Latn_AW" }, 196 { "prg", "prg_Latn_001" }, 197 { "prg_Latn", "prg_Latn_001" }, 198 { "rif", "rif_Tfng_MA" }, 199 { "rif_Latn", "rif_Latn_MA" }, 200 { "rif_Tfng", "rif_Tfng_MA" }, 201 { "rif_MA", "rif_Tfng_MA" }, 202 { "shi", "shi_Tfng_MA" }, 203 { "shi_Tfng", "shi_Tfng_MA" }, 204 { "shi_MA", "shi_Tfng_MA" }, 205 { "sr_Latn", "sr_Latn_RS" }, 206 { "ss", "ss_Latn_ZA" }, 207 { "ss_Latn", "ss_Latn_ZA" }, 208 { "swc", "swc_Latn_CD" }, 209 { "ti", "ti_Ethi_ET" }, 210 { "ti_Ethi", "ti_Ethi_ET" }, 211 { "und", "en_Latn_US" }, 212 { "und_Adlm", "ff_Adlm_GN" }, 213 { "und_Adlm_GN", "ff_Adlm_GN" }, 214 { "und_Arab", "ar_Arab_EG" }, 215 { "und_Arab_PK", "ur_Arab_PK" }, 216 { "und_Bopo", "zh_Bopo_TW" }, 217 { "und_Deva_FJ", "hif_Deva_FJ" }, 218 { "und_EZ", "de_Latn_EZ" }, 219 { "und_Hani", "zh_Hani_CN" }, 220 { "und_Hani_CN", "zh_Hani_CN" }, 221 { "und_Kana", "ja_Kana_JP" }, 222 { "und_Kana_JP", "ja_Kana_JP" }, 223 { "und_Latn", "en_Latn_US" }, 224 { "und_Latn_ET", "en_Latn_ET" }, 225 { "und_Latn_NE", "ha_Latn_NE" }, 226 { "und_Latn_PH", "fil_Latn_PH" }, 227 { "und_ML", "bm_Latn_ML" }, 228 { "und_Latn_ML", "bm_Latn_ML" }, 229 { "und_MU", "mfe_Latn_MU" }, 230 { "und_NE", "ha_Latn_NE" }, 231 { "und_PH", "fil_Latn_PH" }, 232 { "und_PK", "ur_Arab_PK" }, 233 { "und_SO", "so_Latn_SO" }, 234 { "und_SS", "en_Latn_SS" }, 235 { "und_TK", "tkl_Latn_TK" }, 236 { "und_UN", "en_Latn_UN" }, 237 { "vo", "vo_Latn_001" }, 238 { "vo_Latn", "vo_Latn_001" }, 239 { "yi", "yi_Hebr_001" }, 240 { "yi_Hebr", "yi_Hebr_001" }, 241 { "yue", "yue_Hant_HK" }, 242 { "yue_Hant", "yue_Hant_HK" }, 243 { "yue_Hans", "yue_Hans_CN" }, 244 { "yue_CN", "yue_Hans_CN" }, 245 { "zh_Hani", "zh_Hani_CN" }, 246 247 { "zh_Bopo", "zh_Bopo_TW" }, 248 { "ccp", "ccp_Cakm_BD" }, 249 { "ccp_Cakm", "ccp_Cakm_BD" }, 250 { "und_Cakm", "ccp_Cakm_BD" }, 251 { "cu_Glag", "cu_Glag_BG" }, 252 { "sd_Khoj", "sd_Khoj_IN" }, 253 { "lif_Limb", "lif_Limb_IN" }, 254 { 
"grc_Linb", "grc_Linb_GR" }, 255 { "arc_Nbat", "arc_Nbat_JO" }, 256 { "arc_Palm", "arc_Palm_SY" }, 257 { "pal_Phlp", "pal_Phlp_CN" }, 258 { "en_Shaw", "en_Shaw_GB" }, 259 { "sd_Sind", "sd_Sind_IN" }, 260 { "und_Brai", "fr_Brai_FR" }, // hack 261 { "und_Hanb", "zh_Hanb_TW" }, // Special script code 262 { "zh_Hanb", "zh_Hanb_TW" }, // Special script code 263 { "und_Jamo", "ko_Jamo_KR" }, // Special script code 264 265 //{"und_Cyrl_PL", "be_Cyrl_PL"}, 266 267 // {"cr", "cr_Cans_CA"}, 268 // {"hif", "hif_Latn_FJ"}, 269 // {"gon", "gon_Telu_IN"}, 270 // {"lzz", "lzz_Latn_TR"}, 271 // {"lif", "lif_Deva_NP"}, 272 // {"unx", "unx_Beng_IN"}, 273 // {"unr", "unr_Beng_IN"}, 274 // {"ttt", "ttt_Latn_AZ"}, 275 // {"pnt", "pnt_Grek_GR"}, 276 // {"tly", "tly_Latn_AZ"}, 277 // {"tkr", "tkr_Latn_AZ"}, 278 // {"bsq", "bsq_Bass_LR"}, 279 // {"ccp", "ccp_Cakm_BD"}, 280 // {"blt", "blt_Tavt_VN"}, 281 { "mis_Medf", "mis_Medf_NG" }, 282 }); 283 284 /** 285 * The following supplements the suppress-script. It overrides info from exemplars and the locale info. 
286 */ 287 private static String[][] SpecialScripts = { 288 { "zh", "Hans" }, // Hans (not Hani) 289 { "yue", "Hant" }, // Hans (not Hani) 290 { "chk", "Latn" }, // Chuukese (Micronesia) 291 { "fil", "Latn" }, // Filipino (Philippines)" 292 { "ko", "Kore" }, // Korean (North Korea) 293 { "ko_KR", "Kore" }, // Korean (North Korea) 294 { "pap", "Latn" }, // Papiamento (Netherlands Antilles) 295 { "pau", "Latn" }, // Palauan (Palau) 296 { "su", "Latn" }, // Sundanese (Indonesia) 297 { "tet", "Latn" }, // Tetum (East Timor) 298 { "tk", "Latn" }, // Turkmen (Turkmenistan) 299 { "ty", "Latn" }, // Tahitian (French Polynesia) 300 { "ja", "Jpan" }, // Special script for japan 301 { "und", "Latn" }, // Ultimate fallback 302 }; 303 304 private static Map<String, String> localeToScriptCache = new TreeMap<String, String>(); 305 static { 306 for (String language : standardCodes.getAvailableCodes("language")) { 307 Map<String, String> info = standardCodes.getLangData("language", language); 308 String script = info.get("Suppress-Script"); 309 if (script != null) { localeToScriptCache.put(language, script)310 localeToScriptCache.put(language, script); 311 } 312 } 313 for (String[] pair : SpecialScripts) { localeToScriptCache.put(pair[0], pair[1])314 localeToScriptCache.put(pair[0], pair[1]); 315 } 316 } 317 318 private static Map<String, String> FALLBACK_SCRIPTS; 319 static { 320 LanguageTagParser additionLtp = new LanguageTagParser(); 321 Map<String, String> _FALLBACK_SCRIPTS = new TreeMap<>(); 322 for (String addition : MAX_ADDITIONS) { 323 additionLtp.set(addition); 324 String lan = additionLtp.getLanguage(); _FALLBACK_SCRIPTS.put(lan, additionLtp.getScript())325 _FALLBACK_SCRIPTS.put(lan, additionLtp.getScript()); 326 } 327 FALLBACK_SCRIPTS = ImmutableMap.copyOf(_FALLBACK_SCRIPTS); 328 } 329 330 private static int errorCount; 331 main(String[] args)332 public static void main(String[] args) throws IOException { 333 334 printDefaultLanguagesAndScripts(); 335 336 Map<String, 
String> toMaximized = new TreeMap<String, String>(); 337 338 tryDifferentAlgorithm(toMaximized); 339 340 minimize(toMaximized); 341 342 // HACK TEMP_UNKNOWN_REGION 343 // this is to get around the removal of items with ZZ in minimize. 344 // probably cleaner way to do it, but this provides control over just those we want to retain. 345 Set<String> toRemove = new TreeSet<>(); 346 Map<String, String> toFix = new TreeMap<>(); 347 for (Entry<String, String> entry : toMaximized.entrySet()) { 348 String key = entry.getKey(); 349 String value = entry.getValue(); 350 if (key.contains(TEMP_UNKNOWN_REGION)) { 351 toRemove.add(key); 352 } else if (value.contains(TEMP_UNKNOWN_REGION)) { 353 toFix.put(key, value.replace(TEMP_UNKNOWN_REGION, UNKNOWN_REGION)); 354 } 355 } 356 for (String key : toRemove) { 357 toMaximized.remove(key); 358 } 359 toMaximized.putAll(toFix); 360 361 Map<String, String> oldLikely = SupplementalDataInfo.getInstance().getLikelySubtags(); 362 Set<String> changes = compareMapsAndFixNew("*WARNING* Likely Subtags: ", oldLikely, toMaximized, "ms_Arab", 363 "ms_Arab_ID"); 364 System.out.println(CollectionUtilities.join(changes, "\n")); 365 366 if (OUTPUT_STYLE == OutputStyle.C_ALT) { 367 doAlt(toMaximized); 368 } 369 370 if (SHOW_ADD) 371 System.out 372 .println("/*" 373 + CldrUtility.LINE_SEPARATOR 374 + " To Maximize:" 375 + 376 CldrUtility.LINE_SEPARATOR 377 + " If using raw strings, make sure the input language/locale uses the right separator, and has the right casing." 378 + 379 CldrUtility.LINE_SEPARATOR 380 + " Remove the script Zzzz and the region ZZ if they occur; change an empty language subtag to 'und'." 381 + 382 CldrUtility.LINE_SEPARATOR 383 + " Get the language, region, and script from the cleaned-up tag, plus any variants/extensions" 384 + 385 CldrUtility.LINE_SEPARATOR 386 + " Try each of the following in order (where the field exists)" 387 + 388 CldrUtility.LINE_SEPARATOR 389 + " Lookup language-script-region. 
If in the table, return the result + variants" 390 + 391 CldrUtility.LINE_SEPARATOR 392 + " Lookup language-script. If in the table, return the result (substituting the original region if it exists) + variants" 393 + 394 CldrUtility.LINE_SEPARATOR 395 + " Lookup language-region. If in the table, return the result (substituting the original script if it exists) + variants" 396 + 397 CldrUtility.LINE_SEPARATOR 398 + " Lookup language. If in the table, return the result (substituting the original region and script if either or both exist) + variants" 399 + 400 CldrUtility.LINE_SEPARATOR 401 + 402 CldrUtility.LINE_SEPARATOR 403 + " Example: Input is zh-ZZZZ-SG." 404 + 405 CldrUtility.LINE_SEPARATOR 406 + " Normalize to zh-SG. Lookup in table. No match." 407 + 408 CldrUtility.LINE_SEPARATOR 409 + " Remove SG, but remember it. Lookup zh, and get the match (zh-Hans-CN). Substitute SG, and return zh-Hans-SG." 410 + 411 CldrUtility.LINE_SEPARATOR 412 + 413 CldrUtility.LINE_SEPARATOR 414 + " To Minimize:" 415 + 416 CldrUtility.LINE_SEPARATOR 417 + " First get max = maximize(input)." 418 + 419 CldrUtility.LINE_SEPARATOR 420 + " Then for trial in {language, language-region, language-script}" 421 + 422 CldrUtility.LINE_SEPARATOR 423 + " If maximize(trial) == max, then return trial." 424 + 425 CldrUtility.LINE_SEPARATOR 426 + " If you don't get a match, return max." 427 + 428 CldrUtility.LINE_SEPARATOR 429 + 430 CldrUtility.LINE_SEPARATOR 431 + " Example: Input is zh-Hant. Maximize to get zh-Hant-TW." 432 + 433 CldrUtility.LINE_SEPARATOR 434 + " zh => zh-Hans-CN. No match, so continue." 435 + 436 CldrUtility.LINE_SEPARATOR 437 + " zh-TW => zh-Hans-TW. Match, so return zh-TW." 438 + 439 CldrUtility.LINE_SEPARATOR 440 + 441 CldrUtility.LINE_SEPARATOR 442 + " (A variant of this uses {language, language-script, language-region}): that is, tries script before language." 
443 + 444 CldrUtility.LINE_SEPARATOR + " toMaximal size:\t" + toMaximized.size() + 445 CldrUtility.LINE_SEPARATOR + "*/"); 446 447 printLikelySubtags(toMaximized); 448 449 // if (OUTPUT_STYLE != OutputStyle.XML) { 450 // printMap("const MapToMinimalSubtags default_subtags[]", toMinimized, null); 451 // } 452 453 printDefaultContent(toMaximized); 454 455 System.out.println(CldrUtility.LINE_SEPARATOR + "ERRORS:\t" + errorCount + CldrUtility.LINE_SEPARATOR); 456 457 } 458 459 static class RowData implements Comparable<RowData> { 460 OfficialStatus os; 461 String name; 462 Long pop; 463 RowData(OfficialStatus os, String name, Long pop)464 public RowData(OfficialStatus os, String name, Long pop) { 465 this.os = os; 466 this.name = name; 467 this.pop = pop; 468 } 469 getStatus()470 public OfficialStatus getStatus() { 471 // TODO Auto-generated method stub 472 return os; 473 } 474 getName()475 public CharSequence getName() { 476 // TODO Auto-generated method stub 477 return name; 478 } 479 getLiteratePopulation()480 public Long getLiteratePopulation() { 481 // TODO Auto-generated method stub 482 return pop; 483 } 484 compareTo(RowData o)485 public int compareTo(RowData o) { 486 // TODO Auto-generated method stub 487 int result = os.compareTo(o.os); 488 if (result != 0) return -result; 489 long result2 = pop - o.pop; 490 if (result2 != 0) return result2 < 0 ? 
1 : -1; 491 return name.compareTo(o.name); 492 } 493 equals(Object o)494 public boolean equals(Object o) { 495 return 0 == compareTo((RowData) o); 496 } 497 hashCode()498 public int hashCode() { 499 throw new UnsupportedOperationException(); 500 } 501 } 502 printDefaultLanguagesAndScripts()503 private static void printDefaultLanguagesAndScripts() { 504 505 final int minTotalPopulation = 10000000; 506 final int minTerritoryPopulation = 1000000; 507 final double minTerritoryPercent = 1.0 / 3; 508 Map<String, Set<RowData>> languageToReason = new TreeMap<String, Set<RowData>>(); 509 Counter<String> languageToLiteratePopulation = new Counter<String>(); 510 NumberFormat nf = NumberFormat.getIntegerInstance(ULocale.ENGLISH); 511 nf.setGroupingUsed(true); 512 LanguageTagParser ltp = new LanguageTagParser(); 513 LikelySubtags likelySubtags = new LikelySubtags(); 514 /* 515 * A. X is a qualified language**, and at least one of the following is true: 516 * 517 * 1. X is has official status* in any country 518 * 2. X exceeds a threshold population† of literate users worldwide: 1M 519 * 3. X exceeds a threshold population† in some country Z: 100K and 20% of Z's population†. 520 * 521 * B. X is an exception explicitly approved by the committee or X has minimal 522 * language coverage‡ in CLDR itself. 
523 */ 524 OfficialStatus minimalStatus = OfficialStatus.official_regional; // OfficialStatus.de_facto_official; 525 Map<String, String> languages = new TreeMap<String, String>(); 526 for (String language : standardCodes.getAvailableCodes("language")) { 527 String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, language); 528 String result = english.getStringValue(path); 529 if (result != null) { 530 languages.put(language, result); 531 } 532 } 533 for (String language : languages.keySet()) { 534 System.out.println(language + "\t" + languages.get(language)); 535 } 536 537 for (String territory : supplementalData.getTerritoriesWithPopulationData()) { 538 PopulationData territoryPop = supplementalData.getPopulationDataForTerritory(territory); 539 double territoryPopulation = territoryPop.getLiteratePopulation(); 540 for (String languageScript : supplementalData.getLanguagesForTerritoryWithPopulationData(territory)) { 541 PopulationData popData = supplementalData.getLanguageAndTerritoryPopulationData(languageScript, 542 territory); 543 ltp.set(languageScript); 544 String language = ltp.getLanguage(); 545 // if (ltp.getScript().isEmpty()) { 546 // String max = likelySubtags.maximize(languageScript); 547 // if (max != null) { 548 // ltp.set(max).setRegion(""); 549 // languageScript = ltp.toString(); 550 // } 551 // } 552 boolean add = false; 553 // #1 554 OfficialStatus status = popData.getOfficialStatus(); 555 if (status.compareTo(minimalStatus) >= 0) { 556 add = true; 557 } 558 long literatePopulation = getWritingPopulation(popData); 559 // #2 560 languageToLiteratePopulation.add(language, literatePopulation); 561 // #3 562 if (literatePopulation > minTerritoryPopulation 563 && literatePopulation > minTerritoryPercent * territoryPopulation) { 564 add = true; 565 } 566 if (add) { 567 add(languageToReason, language, territory, status, literatePopulation); 568 // Add the containing regions 569 for (String container : Containment.leafToContainer(territory)) { 570 
add(languageToReason, language, container, OfficialStatus.unknown, literatePopulation); 571 } 572 } 573 } 574 } 575 // #2, now that we have the data 576 for (String language : languageToLiteratePopulation.keySet()) { 577 long totalPop = languageToLiteratePopulation.getCount(language); 578 if (totalPop > minTotalPopulation) { 579 add(languageToReason, language, "001", OfficialStatus.unknown, totalPop); 580 } 581 } 582 583 // Specials 584 add(languageToReason, "und", "001", OfficialStatus.unknown, 0); 585 586 // for (String language : Iso639Data.getAvailable()) { 587 // Scope scope = Iso639Data.getScope(language); 588 // Type type = Iso639Data.getType(language); 589 // if (scope == Scope.Special) { 590 // add(languageToReason, language, "001", OfficialStatus.unknown, -1); 591 // } 592 // } 593 // print them 594 595 System.out.println("Detailed - Including:\t" + languageToReason.size()); 596 597 for (String language : languageToReason.keySet()) { 598 Set<RowData> reasons = languageToReason.get(language); 599 600 RowData lastReason = reasons.iterator().next(); 601 602 System.out.append(language) 603 .append("\t") 604 .append(english.getName(language)) 605 .append("\t") 606 .append(lastReason.getStatus().toShortString()) 607 .append("\t") 608 .append(nf.format(languageToLiteratePopulation.getCount(language))); 609 for (RowData reason : reasons) { 610 String status = reason.getStatus().toShortString(); 611 System.out.append("\t") 612 .append(status) 613 .append("-") 614 .append(reason.getName()) 615 .append("-") 616 .append(nf.format(reason.getLiteratePopulation())); 617 } 618 System.out.append("\n"); 619 } 620 621 // now list them 622 623 Set<String> others = new TreeSet<String>(); 624 others.addAll(standardCodes.getGoodAvailableCodes("language")); 625 others.removeAll(languageToReason.keySet()); 626 System.out.println("\nIncluded Languages:\t" + languageToReason.keySet().size()); 627 showLanguages(languageToReason.keySet(), languageToReason); 628 
System.out.println("\nExcluded Languages:\t" + others.size()); 629 showLanguages(others, languageToReason); 630 } 631 getWritingPopulation(PopulationData popData)632 private static long getWritingPopulation(PopulationData popData) { 633 final double writingPopulation = popData.getWritingPopulation(); 634 if (!Double.isNaN(writingPopulation)) { 635 return (long) writingPopulation; 636 } 637 return (long) popData.getLiteratePopulation(); 638 } 639 showLanguages(Set<String> others, Map<String, Set<RowData>> languageToReason)640 private static void showLanguages(Set<String> others, Map<String, Set<RowData>> languageToReason) { 641 Set<String> sorted = new TreeSet<String>(Collator.getInstance(ULocale.ENGLISH)); 642 for (String language : others) { 643 sorted.add(getLanguageName(language, languageToReason)); 644 } 645 char last = 0; 646 for (String language : sorted) { 647 final char curr = language.charAt(0); 648 if (last != curr) { 649 System.out.println(); 650 } else if (last != '\u0000') { 651 System.out.print(", "); 652 } 653 System.out.print(language); 654 last = curr; 655 } 656 System.out.println(); 657 } 658 getLanguageName(String language, Map<String, Set<RowData>> languageToReason)659 private static String getLanguageName(String language, 660 Map<String, Set<RowData>> languageToReason) { 661 OfficialStatus best = OfficialStatus.unknown; 662 Set<RowData> reasons = languageToReason.get(language); 663 if (reasons != null) { 664 for (RowData reason : reasons) { 665 final OfficialStatus currentStatus = reason.getStatus(); 666 if (best.compareTo(currentStatus) < 0) { 667 best = currentStatus; 668 } 669 } 670 } 671 String status = best.toShortString(); 672 Scope scope = Iso639Data.getScope(language); 673 if (scope == Scope.Special) { 674 status = "S"; 675 } 676 String languageFormatted = english.getName(language) + " [" + language + "]-" + status; 677 return languageFormatted; 678 } 679 add(Map<String, Set<RowData>> languageToReason, String language, String 
territoryRaw, OfficialStatus status, long population)680 private static void add(Map<String, Set<RowData>> languageToReason, String language, 681 String territoryRaw, OfficialStatus status, long population) { 682 String territory = english.getName("territory", territoryRaw) + " [" + territoryRaw + "]"; 683 Set<RowData> set = languageToReason.get(language); 684 if (set == null) { 685 languageToReason.put(language, set = new TreeSet<RowData>()); 686 } 687 set.add(new RowData(status, territory, population)); 688 } 689 printDefaultContent(Map<String, String> toMaximized)690 private static void printDefaultContent(Map<String, String> toMaximized) throws IOException { 691 692 Set<String> defaultLocaleContent = new TreeSet<String>(); 693 694 // go through all the cldr locales, and add default contents 695 // now computed from toMaximized 696 Set<String> available = factory.getAvailable(); 697 Relation<String, String> toChildren = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class); 698 LanguageTagParser ltp = new LanguageTagParser(); 699 700 // System.out.println(maximize("az_Latn_AZ", toMaximized)); 701 Set<String> hasScript = new TreeSet<String>(); 702 703 // first get a mapping to children 704 for (String locale : available) { 705 if (locale.equals("root")) { 706 continue; 707 } 708 if (ltp.set(locale).getVariants().size() != 0) { 709 continue; 710 } 711 String parent = LocaleIDParser.getSimpleParent(locale); 712 if (ltp.getScript().length() != 0) { 713 hasScript.add(parent); 714 } 715 if (parent.equals("root")) { 716 continue; 717 } 718 toChildren.put(parent, locale); 719 } 720 721 // Suppress script for locales for which we only have one locale in common/main. See ticket #7834. 
722 Set<String> suppressScriptLocales = new HashSet<String>(Arrays.asList( 723 "bm_ML", "en_US", "ha_NG", "iu_CA", "ms_MY", "mn_MN", 724 "byn_ER", "ff_SN", "dyo_SN", "kk_KZ", "ku_TR", "ky_KG", "ml_IN", "so_SO", "sw_TZ", "wo_SN", "yo_NG", "dje_NE", 725 "blt_VN")); 726 727 // if any have a script, then throw out any that don't have a script (unless they're specifically included.) 728 Set<String> toRemove = new TreeSet<String>(); 729 for (String locale : hasScript) { 730 toRemove.clear(); 731 Set<String> children = toChildren.getAll(locale); 732 for (String child : children) { 733 if (ltp.set(child).getScript().length() == 0 && !suppressScriptLocales.contains(child)) { 734 toRemove.add(child); 735 } 736 } 737 if (toRemove.size() != 0) { 738 System.out.println("Removing:\t" + locale + "\t" + toRemove + "\tfrom\t" + children); 739 toChildren.removeAll(locale, toRemove); 740 } 741 } 742 743 // we add a child as a default locale if it has the same maximization 744 main: for (String locale : toChildren.keySet()) { 745 String maximized = maximize(locale, toMaximized); 746 if (maximized == null) { 747 if (SHOW_ADD) System.out.println("Missing maximized:\t" + locale); 748 continue; 749 } 750 Set<String> children = toChildren.getAll(locale); 751 Map<String, String> debugStuff = new TreeMap<String, String>(); 752 for (String child : children) { 753 String maximizedChild = maximize(child, toMaximized); 754 if (maximized.equals(maximizedChild)) { 755 defaultLocaleContent.add(child); 756 continue main; 757 } 758 debugStuff.put(child, maximizedChild); 759 } 760 if (SHOW_ADD) System.out.println("Can't find maximized: " + locale + "=" + maximized 761 + "\tin\t" + debugStuff); 762 } 763 764 defaultLocaleContent.remove("und_ZZ"); // und_ZZ isn't ever a real locale. 
765 766 showDefaultContentDifferencesAndFix(defaultLocaleContent); 767 768 Log.setLogNoBOM(CLDRPaths.GEN_DIRECTORY + "/supplemental", "supplementalMetadata.xml"); 769 BufferedReader oldFile = FileUtilities.openUTF8Reader(CLDRPaths.SUPPLEMENTAL_DIRECTORY, "supplementalMetadata.xml"); 770 CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*<defaultContent locales=\"\\s*"), Log.getLog(), false); 771 772 String sep = CldrUtility.LINE_SEPARATOR + "\t\t\t"; 773 String broken = CldrUtility.breakLines(CldrUtility.join(defaultLocaleContent, " "), sep, 774 PatternCache.get("(\\S)\\S*").matcher(""), 80); 775 776 Log.println("\t\t<defaultContent locales=\"" + broken + "\""); 777 Log.println("\t\t/>"); 778 779 // Log.println("</supplementalData>"); 780 CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*/>\\s*(<!--.*)?"), null, true); // skip to matching > 781 CldrUtility.copyUpTo(oldFile, null, Log.getLog(), true); // copy the rest 782 783 Log.close(); 784 oldFile.close(); 785 } 786 787 // private static void oldAlgorithm(Map<String,String> toMaximized) { 788 // Set<String> defaultContentLocales = supplementalData.getDefaultContentLocales(); 789 // LanguageTagParser parser = new LanguageTagParser(); 790 // for (String locale : defaultContentLocales) { 791 // String parent = parser.getParent(locale); 792 // toMaximized.put(parent, locale); 793 // if (SHOW_ADD) System.out.println("Adding:\t" + parent + "\t=>\t" + locale + "\t\tDefaultContent"); 794 // } 795 // 796 // for (String[] specialCase : SpecialCases) { 797 // toMaximized.put(specialCase[0], specialCase[1]); 798 // if (SHOW_ADD) System.out.println("Adding:\t" + specialCase[0] + "\t=>\t" + specialCase[1] + "\t\tSpecial"); 799 // } 800 // 801 // // recurse and close 802 // closeMapping(toMaximized); 803 // 804 // addScript(toMaximized, parser); 805 // 806 // closeMapping(toMaximized); 807 // 808 // addLanguageScript(toMaximized, parser); 809 // 810 // closeMapping(toMaximized); 811 // 812 // addLanguageCountry(toMaximized, 
parser); 813 // 814 // closeMapping(toMaximized); 815 // 816 // addCountries(toMaximized); 817 // addScript(toMaximized, parser); 818 // closeMapping(toMaximized); 819 // closeUnd(toMaximized); 820 // 821 // addDeprecated(toMaximized); 822 // 823 // closeMapping(toMaximized); 824 // 825 // checkConsistency(toMaximized); 826 // } 827 828 private static class MaxData { 829 Relation<String, Row.R3<Double, String, String>> languages = Relation.of(new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class); 830 Map<String, Counter<String>> languagesToScripts = new TreeMap<String, Counter<String>>(); 831 Map<String, Counter<String>> languagesToRegions = new TreeMap<String, Counter<String>>(); 832 833 Relation<String, Row.R3<Double, String, String>> scripts = Relation.of(new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class); 834 Map<String, Counter<String>> scriptsToLanguages = new TreeMap<String, Counter<String>>(); 835 Map<String, Counter<String>> scriptsToRegions = new TreeMap<String, Counter<String>>(); 836 837 Relation<String, Row.R3<Double, String, String>> regions = Relation.of(new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class); 838 Map<String, Counter<String>> regionsToLanguages = new TreeMap<String, Counter<String>>(); 839 Map<String, Counter<String>> regionsToScripts = new TreeMap<String, Counter<String>>(); 840 841 Map<String, Counter<Row.R2<String, String>>> containersToLanguage = new TreeMap<String, Counter<Row.R2<String, String>>>(); 842 Relation<String, Row.R4<Double, String, String, String>> containersToLangRegion = Relation.of( 843 new TreeMap<String, Set<Row.R4<Double, String, String, String>>>(), TreeSet.class); 844 845 Relation<Row.R2<String, String>, Row.R2<Double, String>> languageScripts = Relation.of( 846 new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(), 847 TreeSet.class); 848 Relation<Row.R2<String, String>, Row.R2<Double, String>> scriptRegions = Relation.of( 849 new 
TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(), 850 TreeSet.class); 851 Relation<Row.R2<String, String>, Row.R2<Double, String>> languageRegions = Relation.of( 852 new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(), 853 TreeSet.class); 854 855 /** 856 * Add population information. "order" is the negative of the population (makes the first be the highest). 857 * @param language 858 * @param script 859 * @param region 860 * @param order 861 */ add(String language, String script, String region, Double order)862 void add(String language, String script, String region, Double order) { 863 if (language.equals("cpp")) { 864 System.out.println(language + "\t" + script + "\t" + region + "\t" + -order); 865 } 866 languages.put(language, Row.of(order, script, region)); 867 // addCounter(languagesToScripts, language, script, order); 868 // addCounter(languagesToRegions, language, region, order); 869 870 scripts.put(script, Row.of(order, language, region)); 871 // addCounter(scriptsToLanguages, script, language, order); 872 // addCounter(scriptsToRegions, script, region, order); 873 874 regions.put(region, Row.of(order, language, script)); 875 // addCounter(regionsToLanguages, region, language, order); 876 // addCounter(regionsToScripts, region, script, order); 877 878 languageScripts.put(Row.of(language, script), Row.of(order, region)); 879 scriptRegions.put(Row.of(script, region), Row.of(order, language)); 880 languageRegions.put(Row.of(language, region), Row.of(order, script)); 881 882 Set<String> containerSet = Containment.leafToContainer(region); 883 if (containerSet != null) { 884 for (String container : containerSet) { 885 886 containersToLangRegion.put(container, Row.of(order, language, script, region)); 887 Counter<R2<String, String>> data = containersToLanguage.get(container); 888 if (data == null) { 889 containersToLanguage.put(container, data = new Counter<R2<String, String>>()); 890 } 891 data.add(Row.of(language, script), (long) 
(double) order); 892 893 } 894 } 895 896 if (SHOW_ADD) System.out.println("Data:\t" + language + "\t" + script + "\t" + region + "\t" + order); 897 } 898 // private void addCounter(Map<String, Counter<String>> map, String key, String key2, Double count) { 899 // Counter<String> counter = map.get(key); 900 // if (counter == null) { 901 // map.put(key, counter = new Counter<String>()); 902 // } 903 // counter.add(key2, count.longValue()); 904 // } 905 } 906 907 private static final double MIN_UNOFFICIAL_LANGUAGE_SIZE = 10000000; 908 private static final double MIN_UNOFFICIAL_LANGUAGE_PROPORTION = 0.20; 909 private static final double MIN_UNOFFICIAL_CLDR_LANGUAGE_SIZE = 100000; 910 private static final double UNOFFICIAL_SCALE_DOWN = 0.2; 911 912 private static NumberFormat percent = NumberFormat.getPercentInstance(); 913 private static NumberFormat number = NumberFormat.getIntegerInstance(); 914 tryDifferentAlgorithm(Map<String, String> toMaximized)915 private static void tryDifferentAlgorithm(Map<String, String> toMaximized) { 916 // we are going to try a different approach. 917 // first gather counts for maximized values 918 // Set<Row.R3<String,String,String>,Double> rowsToCounts = new TreeMap(); 919 MaxData maxData = new MaxData(); 920 Set<String> cldrLocales = factory.getAvailable(); 921 Set<String> otherTerritories = new TreeSet<String>(standardCodes.getGoodAvailableCodes("territory")); 922 923 // process all the information to get the top values for each triple. 924 // each of the combinations of 1 or 2 components gets to be a key. 925 for (String region : supplementalData.getTerritoriesWithPopulationData()) { 926 otherTerritories.remove(region); 927 PopulationData regionData = supplementalData.getPopulationDataForTerritory(region); 928 final double literateTerritoryPopulation = regionData.getLiteratePopulation(); 929 // we need any unofficial language to meet a certain absolute size requirement and proportion size 930 // requirement. 
931 // so the bar is x percent of the population, reset up to y absolute size. 932 double minimalLiteratePopulation = literateTerritoryPopulation * MIN_UNOFFICIAL_LANGUAGE_PROPORTION; 933 if (minimalLiteratePopulation < MIN_UNOFFICIAL_LANGUAGE_SIZE) { 934 minimalLiteratePopulation = MIN_UNOFFICIAL_LANGUAGE_SIZE; 935 } 936 937 for (String writtenLanguage : supplementalData.getLanguagesForTerritoryWithPopulationData(region)) { 938 PopulationData data = supplementalData.getLanguageAndTerritoryPopulationData(writtenLanguage, region); 939 final double literatePopulation = getWritingPopulation(data); //data.getLiteratePopulation(); 940 double order = -literatePopulation; // negative so we get the inverse order 941 942 if (data.getOfficialStatus() == OfficialStatus.unknown) { 943 final String locale = writtenLanguage + "_" + region; 944 if (literatePopulation >= minimalLiteratePopulation) { 945 // ok, skip 946 } else if (literatePopulation >= MIN_UNOFFICIAL_CLDR_LANGUAGE_SIZE && cldrLocales.contains(locale)) { 947 // ok, skip 948 } else { 949 // if (SHOW_ADD) 950 // System.out.println("Skipping:\t" + writtenLanguage + "\t" + region + "\t" 951 // + english.getName(locale) 952 // + "\t-- too small:\t" + number.format(literatePopulation)); 953 // continue; 954 } 955 order *= UNOFFICIAL_SCALE_DOWN; 956 if (SHOW_ADD) 957 System.out.println("Retaining\t" + writtenLanguage + "\t" + region + "\t" 958 + english.getName(locale) 959 + "\t" + number.format(literatePopulation) 960 + "\t" + percent.format(literatePopulation / literateTerritoryPopulation) 961 + (cldrLocales.contains(locale) ? 
"\tin-CLDR" : "")); 962 } 963 String script; 964 String language = writtenLanguage; 965 final int pos = writtenLanguage.indexOf('_'); 966 if (pos > 0) { 967 language = writtenLanguage.substring(0, pos); 968 script = writtenLanguage.substring(pos + 1); 969 } else { 970 script = getScriptForLocale2(language); 971 } 972 maxData.add(language, script, region, order); 973 } 974 } 975 976 LanguageTagParser additionLtp = new LanguageTagParser(); 977 978 for (String addition : MAX_ADDITIONS) { 979 additionLtp.set(addition); 980 String lan = additionLtp.getLanguage(); 981 Set<R3<Double, String, String>> key = maxData.languages.get(lan); 982 if (key == null) { 983 maxData.add(lan, additionLtp.getScript(), additionLtp.getRegion(), 1.0); 984 } else { 985 int debug = 0; 986 } 987 } 988 989 for (Entry<String, Collection<String>> entry : DeriveScripts.getLanguageToScript().asMap().entrySet()) { 990 String language = entry.getKey(); 991 final Collection<String> values = entry.getValue(); 992 if (values.size() != 1) { 993 continue; // skip, no either way 994 } 995 Set<R3<Double, String, String>> old = maxData.languages.get(language); 996 if (!maxData.languages.containsKey(language)) { 997 maxData.add(language, values.iterator().next(), TEMP_UNKNOWN_REGION, 1.0); 998 } 999 } 1000 1001 // add others, with English default 1002 for (String region : otherTerritories) { 1003 if (region.length() == 3) continue; // FIX ONCE WE ADD REGIONS 1004 maxData.add("en", "Latn", region, 1.0); 1005 } 1006 1007 // get a reverse mapping, so that we can add the aliases 1008 1009 Map<String, R2<List<String>, String>> languageAliases = SupplementalDataInfo.getInstance().getLocaleAliasInfo() 1010 .get("language"); 1011 for (Entry<String, R2<List<String>, String>> str : languageAliases.entrySet()) { 1012 String reason = str.getValue().get1(); 1013 if ("overlong".equals(reason) || "bibliographic".equals(reason) || "macrolanguage".equals(reason)) { 1014 continue; 1015 } 1016 List<String> replacements = 
str.getValue().get0(); 1017 if (replacements == null) { 1018 continue; 1019 } 1020 String goodLanguage = replacements.get(0); 1021 1022 String badLanguage = str.getKey(); 1023 if (badLanguage.contains("_")) { 1024 continue; 1025 } 1026 if (deprecatedISONotInLST.contains(badLanguage)) { 1027 continue; 1028 } 1029 Set<R3<Double, String, String>> goodLanguageData = maxData.languages.getAll(goodLanguage); 1030 if (goodLanguageData == null) { 1031 continue; 1032 } 1033 R3<Double, String, String> value = goodLanguageData.iterator().next(); 1034 final String script = value.get1(); 1035 final String region = value.get2(); 1036 maxData.add(badLanguage, script, region, 1.0); 1037 System.out.println("Adding aliases: " + badLanguage + ", " + script + ", " + region + ", " + reason); 1038 } 1039 1040 // now, get the best for each one 1041 for (String language : maxData.languages.keySet()) { 1042 R3<Double, String, String> value = maxData.languages.getAll(language).iterator().next(); 1043 final Comparable<String> script = value.get1(); 1044 final Comparable<String> region = value.get2(); 1045 add(language, language + "_" + script + "_" + region, toMaximized, "L->SR", Override.REPLACE_EXISTING, 1046 SHOW_ADD); 1047 } 1048 for (String language : maxData.languagesToScripts.keySet()) { 1049 String script = maxData.languagesToScripts.get(language).getKeysetSortedByCount(true).iterator().next(); 1050 add(language, language + "_" + script, toMaximized, "L->S", Override.REPLACE_EXISTING, SHOW_ADD); 1051 } 1052 for (String language : maxData.languagesToRegions.keySet()) { 1053 String region = maxData.languagesToRegions.get(language).getKeysetSortedByCount(true).iterator().next(); 1054 add(language, language + "_" + region, toMaximized, "L->R", Override.REPLACE_EXISTING, SHOW_ADD); 1055 } 1056 1057 for (String script : maxData.scripts.keySet()) { 1058 R3<Double, String, String> value = maxData.scripts.getAll(script).iterator().next(); 1059 final Comparable<String> language = value.get1(); 
1060 final Comparable<String> region = value.get2(); 1061 add("und_" + script, language + "_" + script + "_" + region, toMaximized, "S->LR", 1062 Override.REPLACE_EXISTING, SHOW_ADD); 1063 } 1064 for (String script : maxData.scriptsToLanguages.keySet()) { 1065 String language = maxData.scriptsToLanguages.get(script).getKeysetSortedByCount(true).iterator().next(); 1066 add("und_" + script, language + "_" + script, toMaximized, "S->L", Override.REPLACE_EXISTING, SHOW_ADD); 1067 } 1068 for (String script : maxData.scriptsToRegions.keySet()) { 1069 String region = maxData.scriptsToRegions.get(script).getKeysetSortedByCount(true).iterator().next(); 1070 add("und_" + script, "und_" + script + "_" + region, toMaximized, "S->R", Override.REPLACE_EXISTING, 1071 SHOW_ADD); 1072 } 1073 1074 for (String region : maxData.regions.keySet()) { 1075 R3<Double, String, String> value = maxData.regions.getAll(region).iterator().next(); 1076 final Comparable<String> language = value.get1(); 1077 final Comparable<String> script = value.get2(); 1078 add("und_" + region, language + "_" + script + "_" + region, toMaximized, "R->LS", 1079 Override.REPLACE_EXISTING, SHOW_ADD); 1080 } 1081 for (String region : maxData.regionsToLanguages.keySet()) { 1082 String language = maxData.regionsToLanguages.get(region).getKeysetSortedByCount(true).iterator().next(); 1083 add("und_" + region, language + "_" + region, toMaximized, "R->L", Override.REPLACE_EXISTING, SHOW_ADD); 1084 } 1085 for (String region : maxData.regionsToScripts.keySet()) { 1086 String script = maxData.regionsToScripts.get(region).getKeysetSortedByCount(true).iterator().next(); 1087 add("und_" + region, "und_" + script + "_" + region, toMaximized, "R->S", Override.REPLACE_EXISTING, 1088 SHOW_ADD); 1089 } 1090 1091 for (Entry<String, Counter<R2<String, String>>> containerAndInfo : maxData.containersToLanguage.entrySet()) { 1092 String region = containerAndInfo.getKey(); 1093 if (region.equals("001")) { 1094 continue; 1095 } 1096 
Counter<R2<String, String>> data = containerAndInfo.getValue(); 1097 Set<R2<String, String>> keysetSortedByCount = data.getKeysetSortedByCount(true); 1098 if (SHOW_CONTAINERS) { // debug 1099 System.out.println("Container2L:\t" + region + "\t" + shorten(data.getEntrySetSortedByCount(true, null))); 1100 System.out.println("Container2LR:\t" + region + "\t" + maxData.containersToLangRegion.get(region)); 1101 } 1102 R2<String, String> value = keysetSortedByCount.iterator().next(); // will get most negative 1103 final Comparable<String> language = value.get0(); 1104 final Comparable<String> script = value.get1(); 1105 1106 // fix special cases like es-419, where a locale exists. 1107 // for those cases, what we add as output is the container. Otherwise the region. 1108 Set<String> skipLanguages = cldrContainerToLanguages.get(region); 1109 if (skipLanguages != null 1110 && skipLanguages.contains(language)) { 1111 add("und_" + region, language + "_" + script + "_" + region, toMaximized, "R*->LS", 1112 Override.REPLACE_EXISTING, SHOW_ADD); 1113 continue; 1114 } 1115 1116 // we now have the best language and script. 
Find the best region for that 1117 for (R4<Double, String, String, String> e : maxData.containersToLangRegion.get(region)) { 1118 final Comparable<String> language2 = e.get1(); 1119 final Comparable<String> script2 = e.get2(); 1120 if (language2.equals(language) && script2.equals(script)) { 1121 add("und_" + region, language + "_" + script + "_" + e.get3(), toMaximized, "R*->LS", 1122 Override.REPLACE_EXISTING, SHOW_ADD); 1123 break; 1124 } 1125 } 1126 } 1127 1128 for (R2<String, String> languageScript : maxData.languageScripts.keySet()) { 1129 R2<Double, String> value = maxData.languageScripts.getAll(languageScript).iterator().next(); 1130 final Comparable<String> language = languageScript.get0(); 1131 final Comparable<String> script = languageScript.get1(); 1132 final Comparable<String> region = value.get1(); 1133 add(language + "_" + script, language + "_" + script + "_" + region, toMaximized, "LS->R", 1134 Override.REPLACE_EXISTING, SHOW_ADD); 1135 } 1136 1137 for (R2<String, String> scriptRegion : maxData.scriptRegions.keySet()) { 1138 R2<Double, String> value = maxData.scriptRegions.getAll(scriptRegion).iterator().next(); 1139 final Comparable<String> script = scriptRegion.get0(); 1140 final Comparable<String> region = scriptRegion.get1(); 1141 final Comparable<String> language = value.get1(); 1142 add("und_" + script + "_" + region, language + "_" + script + "_" + region, toMaximized, "SR->L", 1143 Override.REPLACE_EXISTING, SHOW_ADD); 1144 } 1145 1146 for (R2<String, String> languageRegion : maxData.languageRegions.keySet()) { 1147 R2<Double, String> value = maxData.languageRegions.getAll(languageRegion).iterator().next(); 1148 final Comparable<String> language = languageRegion.get0(); 1149 final Comparable<String> region = languageRegion.get1(); 1150 final Comparable<String> script = value.get1(); 1151 add(language + "_" + region, language + "_" + script + "_" + region, toMaximized, "LR->S", 1152 Override.REPLACE_EXISTING, SHOW_ADD); 1153 } 1154 1155 // 
get the script info from metadata as fallback 1156 1157 TreeSet<String> sorted = new TreeSet<String>(ScriptMetadata.getScripts()); 1158 for (String script : sorted) { 1159 Info i = ScriptMetadata.getInfo(script); 1160 String likelyLanguage = i.likelyLanguage; 1161 String originCountry = i.originCountry; 1162 final String result = likelyLanguage + "_" + script + "_" + originCountry; 1163 add("und_" + script, result, toMaximized, "S->LR•", 1164 Override.KEEP_EXISTING, SHOW_ADD); 1165 add(likelyLanguage, result, toMaximized, "L->SR•", 1166 Override.KEEP_EXISTING, SHOW_ADD); 1167 } 1168 1169 // add overrides 1170 for (String key : LANGUAGE_OVERRIDES.keySet()) { 1171 add(key, LANGUAGE_OVERRIDES.get(key), toMaximized, "OVERRIDE", Override.REPLACE_EXISTING, true); 1172 } 1173 } 1174 shorten(Object data)1175 public static String shorten(Object data) { 1176 String info = data.toString(); 1177 if (info.length() > 255) { 1178 info = info.substring(0, 127) + "…"; 1179 } 1180 return info; 1181 } 1182 doAlt(Map<String, String> toMaximized)1183 private static void doAlt(Map<String, String> toMaximized) { 1184 // TODO Auto-generated method stub 1185 Map<String, String> temp = new TreeMap<String, String>(); 1186 for (String locale : toMaximized.keySet()) { 1187 String target = toMaximized.get(locale); 1188 temp.put(toAlt(locale, true), toAlt(target, true)); 1189 } 1190 toMaximized.clear(); 1191 toMaximized.putAll(temp); 1192 } 1193 maximize(String languageTag, Map<String, String> toMaximized)1194 public static String maximize(String languageTag, Map<String, String> toMaximized) { 1195 LanguageTagParser ltp = new LanguageTagParser(); 1196 1197 // clean up the input by removing Zzzz, ZZ, and changing "" into und. 
1198 ltp.set(languageTag); 1199 String language = ltp.getLanguage(); 1200 String region = ltp.getRegion(); 1201 String script = ltp.getScript(); 1202 boolean changed = false; 1203 if (language.equals("")) { 1204 ltp.setLanguage(language = "und"); 1205 changed = true; 1206 } 1207 if (region.equals(UNKNOWN_SCRIPT)) { 1208 ltp.setScript(script = ""); 1209 changed = true; 1210 } 1211 if (ltp.getRegion().equals(UNKNOWN_REGION)) { 1212 ltp.setRegion(region = ""); 1213 changed = true; 1214 } 1215 if (changed) { 1216 languageTag = ltp.toString(); 1217 } 1218 // check whole 1219 String result = toMaximized.get(languageTag); 1220 if (result != null) { 1221 return result; 1222 } 1223 // try empty region 1224 if (region.length() != 0) { 1225 result = toMaximized.get(ltp.setRegion("").toString()); 1226 if (result != null) { 1227 return ltp.set(result).setRegion(region).toString(); 1228 } 1229 ltp.setRegion(region); // restore 1230 } 1231 // try empty script 1232 if (script.length() != 0) { 1233 result = toMaximized.get(ltp.setScript("").toString()); 1234 if (result != null) { 1235 return ltp.set(result).setScript(script).toString(); 1236 } 1237 // try empty script and region 1238 if (region.length() != 0) { 1239 result = toMaximized.get(ltp.setRegion("").toString()); 1240 if (result != null) { 1241 return ltp.set(result).setScript(script).setRegion(region).toString(); 1242 } 1243 } 1244 } 1245 if (!language.equals("und") && script.length() != 0 && region.length() != 0) { 1246 return languageTag; // it was ok, and we couldn't do anything with it 1247 } 1248 return null; // couldn't maximize 1249 } 1250 minimize(String input, Map<String, String> toMaximized, boolean favorRegion)1251 public static String minimize(String input, Map<String, String> toMaximized, boolean favorRegion) { 1252 if (input.equals("nb_Latn_SJ")) { 1253 System.out.print(""); // debug 1254 } 1255 String maximized = maximize(input, toMaximized); 1256 if (maximized == null) { 1257 return null; // failed 1258 } 
1259 LanguageTagParser ltp = new LanguageTagParser().set(maximized); 1260 String language = ltp.getLanguage(); 1261 String region = ltp.getRegion(); 1262 String script = ltp.getScript(); 1263 // try building up from shorter to longer, and find the first that matches 1264 // could be more optimized, but for this code we want simplest 1265 String[] trials = { language, 1266 language + TAG_SEPARATOR + (favorRegion ? region : script), 1267 language + TAG_SEPARATOR + (!favorRegion ? region : script) }; 1268 for (String trial : trials) { 1269 String newMaximized = maximize(trial, toMaximized); 1270 if (maximized.equals(newMaximized)) { 1271 return trial; 1272 } 1273 } 1274 return maximized; 1275 } 1276 1277 // /** 1278 // * Verify that we can map from each language, script, and country to something. 1279 // * @param toMaximized 1280 // */ 1281 // private static void checkConsistency(Map<String, String> toMaximized) { 1282 // Map<String,String> needMappings = new TreeMap(); 1283 // LanguageTagParser parser = new LanguageTagParser(); 1284 // for (String maximized : new TreeSet<String>(toMaximized.values())) { 1285 // parser.set(maximized); 1286 // final String language = parser.getLanguage(); 1287 // final String script = parser.getScript(); 1288 // final String region = parser.getRegion(); 1289 // if (language.length() == 0 || script.length() == 0 || region.length() == 0) { 1290 // failure(" { \"" + maximized + "\", \"" + maximized + "\" }, // " + english.getName(maximized) + 1291 // "\t\tFailed-Consistency"); 1292 // continue; 1293 // } 1294 // addIfNotIn(language, maximized, needMappings, toMaximized, "Consistency"); 1295 // addIfNotIn(language + "_" + script, maximized, needMappings, toMaximized, "Consistency"); 1296 // addIfNotIn(language + "_" + region, maximized, needMappings, toMaximized, "Consistency"); 1297 // addIfNotIn("und_" + script, maximized, needMappings, toMaximized, "Consistency"); 1298 // addIfNotIn("und_" + script + "_" + region, maximized, 
needMappings, toMaximized, "Consistency"); 1299 // addIfNotIn("und_" + region, maximized, needMappings, toMaximized, "Consistency"); 1300 // } 1301 // toMaximized.putAll(needMappings); 1302 // } 1303 1304 // private static void failure(String string) { 1305 // System.out.println(string); 1306 // errorCount++; 1307 // } 1308 1309 // private static void addIfNotIn(String key, String value, Map<String, String> toAdd, Map<String, String> 1310 // otherToCheck, String kind) { 1311 // addIfNotIn(key, value, toAdd, otherToCheck == null ? null : otherToCheck.keySet(), null, kind); 1312 // } 1313 1314 // private static void addIfNotIn(String key, String value, Map<String, String> toAdd, Set<String> skipKey, 1315 // Set<String> skipValue, String kind) { 1316 // if (!key.equals(value) 1317 // && !toAdd.containsKey(key) 1318 // && (skipKey == null || !skipKey.contains(key)) 1319 // && (skipValue == null || !skipValue.contains(value))) { 1320 // add(key, value, toAdd, kind); 1321 // } 1322 // } 1323 1324 enum Override { 1325 KEEP_EXISTING, REPLACE_EXISTING 1326 } 1327 add(String key, String value, Map<String, String> toAdd, String kind, Override override, boolean showAction)1328 private static void add(String key, String value, Map<String, String> toAdd, String kind, Override override, 1329 boolean showAction) { 1330 if (key.equals(DEBUG_ADD_KEY)) { 1331 System.out.println("*debug*"); 1332 } 1333 String oldValue = toAdd.get(key); 1334 if (oldValue == null) { 1335 if (showAction) { 1336 System.out.println("Adding:\t\t" + getName(key) + "\t=>\t" + getName(value) + "\t\t\t\t" + kind); 1337 } 1338 } else if (override == Override.KEEP_EXISTING || value.equals(oldValue)) { 1339 // if (showAction) { 1340 // System.out.println("Skipping:\t" + key + "\t=>\t" + value + "\t\t\t\t" + kind); 1341 // } 1342 return; 1343 } else { 1344 if (showAction) { 1345 System.out.println("Replacing:\t" + getName(key) + "\t=>\t" + getName(value) + "\t, was\t" + getName(oldValue) + "\t\t" + kind); 1346 } 
1347 } 1348 toAdd.put(key, value); 1349 } 1350 getName(String value)1351 private static String getName(String value) { 1352 return ConvertLanguageData.getLanguageCodeAndName(value); 1353 } 1354 1355 // private static void addCountries(Map<String, String> toMaximized) { 1356 // Map <String, Map<String, Double>> scriptToLanguageToSize = new TreeMap(); 1357 // 1358 // for (String territory : supplementalData.getTerritoriesWithPopulationData()) { 1359 // Set<String> languages = supplementalData.getLanguagesForTerritoryWithPopulationData(territory); 1360 // String biggestOfficial = null; 1361 // double biggest = -1; 1362 // for (String language : languages) { 1363 // PopulationData info = supplementalData.getLanguageAndTerritoryPopulationData(language, territory); 1364 // // add to info about script 1365 // 1366 // String script = getScriptForLocale(language); 1367 // if (script != null) { 1368 // Map<String, Double> languageInfo = scriptToLanguageToSize.get(script); 1369 // if (languageInfo == null) scriptToLanguageToSize.put(script, languageInfo = new TreeMap()); 1370 // String baseLanguage = language; 1371 // int pos = baseLanguage.indexOf('_'); 1372 // if (pos >= 0) { 1373 // baseLanguage = baseLanguage.substring(0,pos); 1374 // } 1375 // Double size = languageInfo.get(baseLanguage); 1376 // languageInfo.put(baseLanguage, (size == null ? 
0 : size) + info.getLiteratePopulation()); 1377 // } 1378 // 1379 // 1380 // final OfficialStatus officialStatus = info.getOfficialStatus(); 1381 // if (officialStatus == OfficialStatus.de_facto_official || officialStatus == OfficialStatus.official) { 1382 // double size2 = info.getLiteratePopulation(); 1383 // if (biggest < size2) { 1384 // biggest = size2; 1385 // biggestOfficial = language; 1386 // } 1387 // } 1388 // } 1389 // if (biggestOfficial != null) { 1390 // final String replacementTag = "und_" + territory; 1391 // String maximized = biggestOfficial + "_" + territory; 1392 // toMaximized.put(replacementTag, maximized); 1393 // if (SHOW_ADD) System.out.println("Adding:\t" + replacementTag + "\t=>\t" + maximized + "\t\tLanguage-Territory"); 1394 // } 1395 // } 1396 // 1397 // for (String script : scriptToLanguageToSize.keySet()) { 1398 // String biggestOfficial = null; 1399 // double biggest = -1; 1400 // 1401 // final Map<String, Double> languageToSize = scriptToLanguageToSize.get(script); 1402 // for (String language : languageToSize.keySet()) { 1403 // double size = languageToSize.get(language); 1404 // if (biggest < size) { 1405 // biggest = size; 1406 // biggestOfficial = language; 1407 // } 1408 // } 1409 // if (biggestOfficial != null) { 1410 // final String replacementTag = "und_" + script; 1411 // String maximized = biggestOfficial + "_" + script; 1412 // toMaximized.put(replacementTag, maximized); 1413 // if (SHOW_ADD) System.out.println("Adding:\t" + replacementTag + "\t=>\t" + maximized + "\t\tUnd-Script"); 1414 // } 1415 // } 1416 // } 1417 1418 // private static void closeUnd(Map<String, String> toMaximized) { 1419 // Map<String,String> toAdd = new TreeMap<String,String>(); 1420 // for (String oldSource : toMaximized.keySet()) { 1421 // String maximized = toMaximized.get(oldSource); 1422 // if (!maximized.startsWith("und")) { 1423 // int pos = maximized.indexOf("_"); 1424 // if (pos >= 0) { 1425 // addIfNotIn( "und" + 
maximized.substring(pos), maximized, toAdd, toMaximized, "CloseUnd"); 1426 // } 1427 // } 1428 // } 1429 // toMaximized.putAll(toAdd); 1430 // } 1431 1432 /** 1433 * Generate tags where the deprecated values map to the expanded values 1434 * 1435 * @param toMaximized 1436 */ 1437 // private static void addDeprecated(Map<String, String> toMaximized) { 1438 // Map<String, Map<String, List<String>>> typeToTagToReplacement = supplementalData.getLocaleAliasInfo(); 1439 // LanguageTagParser temp = new LanguageTagParser(); 1440 // LanguageTagParser tagParsed = new LanguageTagParser(); 1441 // LanguageTagParser replacementParsed = new LanguageTagParser(); 1442 // Map<String,String> toAdd = new TreeMap<String,String>(); 1443 // while (true) { 1444 // toAdd.clear(); 1445 // for (String type : typeToTagToReplacement.keySet()) { 1446 // if (type.equals("variant") || type.equals("zone")) continue; 1447 // boolean addUnd = !type.equals("language"); 1448 // 1449 // Map<String, List<String>> tagToReplacement = typeToTagToReplacement.get(type); 1450 // System.out.println("*" + type + " = " + tagToReplacement); 1451 // 1452 // for (String tag: tagToReplacement.keySet()) { 1453 // 1454 // final List<String> list = tagToReplacement.get(tag); 1455 // if (list == null) continue; // we don't have any information 1456 // String replacement = list.get(0); 1457 // 1458 // // only do multiples 1459 // if (tag.contains("_") || !replacement.contains("_")) { 1460 // continue; 1461 // } 1462 // 1463 // // we now have a tag and a replacement value 1464 // // make parsers that we can use 1465 // try { 1466 // tagParsed.set(addUnd ? "und-" + tag : tag); 1467 // replacementParsed.set(addUnd ? 
"und-" + replacement : replacement); 1468 // } catch (RuntimeException e) { 1469 // continue; 1470 // } 1471 // addIfNotIn(tag, replacement, toAdd, toMaximized,"Deprecated"); 1472 // 1473 // for (String locale : toMaximized.keySet()) { 1474 // String maximized = toMaximized.get(locale); 1475 // addIfMatches(temp.set(locale), maximized, replacementParsed, tagParsed, toAdd, toMaximized); 1476 // addIfMatches(temp.set(maximized), maximized, replacementParsed, tagParsed, toAdd, toMaximized); 1477 // } 1478 // } 1479 // } 1480 // if (toAdd.size() == 0) { 1481 // break; 1482 // } 1483 // toMaximized.putAll(toAdd); 1484 // } 1485 // } 1486 1487 // private static void addIfMatches(LanguageTagParser locale, String maximized, LanguageTagParser tagParsed, 1488 // LanguageTagParser replacementParsed, Map<String, String> toAdd, Map<String, String> toMaximized) { 1489 // if (!tagParsed.getLanguage().equals(locale.getLanguage()) && !tagParsed.getLanguage().equals("und")) { 1490 // return; 1491 // } 1492 // if (!tagParsed.getScript().equals(locale.getScript()) && !tagParsed.getScript().equals("")) { 1493 // return; 1494 // } 1495 // if (!tagParsed.getRegion().equals(locale.getRegion()) && !tagParsed.getRegion().equals("")) { 1496 // return; 1497 // } 1498 // if (!replacementParsed.getLanguage().equals("und")) { 1499 // locale.setLanguage(replacementParsed.getLanguage()); 1500 // } 1501 // if (!replacementParsed.getScript().equals("")) { 1502 // locale.setScript(replacementParsed.getScript()); 1503 // } 1504 // if (!replacementParsed.getRegion().equals("")) { 1505 // locale.setRegion(replacementParsed.getRegion()); 1506 // } 1507 // addIfNotIn(locale.toString(), maximized, toAdd, toMaximized,"Deprecated"); 1508 // } 1509 1510 // private static int getSubtagPosition(String locale, String subtags) { 1511 // int pos = -1; 1512 // while (true) { 1513 // pos = locale.indexOf(subtags, pos + 1); 1514 // if (pos < 0) return -1; 1515 // // make sure boundaries are ok 1516 // if (pos != 0) { 
1517 // char charBefore = locale.charAt(pos-1); 1518 // if (charBefore != '_' && charBefore != '_') return -1; 1519 // } 1520 // int limit = pos + subtags.length(); 1521 // if (limit != locale.length()) { 1522 // char charAfter = locale.charAt(limit); 1523 // if (charAfter != '_' && charAfter != '_') return -1; 1524 // } 1525 // return pos; 1526 // } 1527 // } 1528 1529 /* 1530 * Format 1531 * const DefaultSubtags default_subtags[] = { 1532 * { 1533 * // Afar => Afar (Latin, Ethiopia) 1534 * "aa", 1535 * "aa_Latn_ET" 1536 * },{ 1537 * // Afrikaans => Afrikaans (Latin, South Africa) 1538 * "af", 1539 * "af_Latn_ZA" 1540 * },{ 1541 */ 1542 printLikelySubtags(Map<String, String> fluffup)1543 private static void printLikelySubtags(Map<String, String> fluffup) throws IOException { 1544 1545 PrintWriter out = FileUtilities.openUTF8Writer(CLDRPaths.GEN_DIRECTORY, 1546 "/supplemental/likelySubtags" + (OUTPUT_STYLE == OutputStyle.XML ? ".xml" : ".txt")); 1547 String spacing = OUTPUT_STYLE == OutputStyle.PLAINTEXT ? "\t" : " "; 1548 String header = OUTPUT_STYLE != OutputStyle.XML ? "const MapToMaximalSubtags default_subtags[] = {" 1549 : "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + CldrUtility.LINE_SEPARATOR 1550 + "<!DOCTYPE supplementalData SYSTEM \"../../common/dtd/ldmlSupplemental.dtd\">" 1551 + CldrUtility.LINE_SEPARATOR 1552 + "<!--" 1553 + CldrUtility.LINE_SEPARATOR 1554 + CldrUtility.getCopyrightString() 1555 + CldrUtility.LINE_SEPARATOR 1556 + "-->" 1557 + CldrUtility.LINE_SEPARATOR 1558 + "<!--" 1559 + CldrUtility.LINE_SEPARATOR 1560 + "Likely subtags data is generated programatically from CLDR's language/territory/population" + CldrUtility.LINE_SEPARATOR 1561 + "data using the GenerateMaximalLocales tool. Under normal circumstances, this file should" + CldrUtility.LINE_SEPARATOR 1562 + "not be patched by hand, as any changes made in that fashion may be lost." 
1563 + CldrUtility.LINE_SEPARATOR 1564 + "-->" 1565 + CldrUtility.LINE_SEPARATOR 1566 + "<supplementalData>" + CldrUtility.LINE_SEPARATOR 1567 + " <version number=\"$" + 1568 "Revision$\"/>" + CldrUtility.LINE_SEPARATOR 1569 + " <likelySubtags>"; 1570 String footer = OUTPUT_STYLE != OutputStyle.XML ? SEPARATOR + "};" 1571 : " </likelySubtags>" + CldrUtility.LINE_SEPARATOR 1572 + "</supplementalData>"; 1573 out.println(header); 1574 boolean first = true; 1575 Set<String> keys = new TreeSet<String>(new LocaleStringComparator()); 1576 keys.addAll(fluffup.keySet()); 1577 for (String printingLocale : keys) { 1578 String printingTarget = fluffup.get(printingLocale); 1579 String comment = printingName(printingLocale, spacing) + spacing + "=>" + spacing 1580 + printingName(printingTarget, spacing); 1581 1582 if (OUTPUT_STYLE == OutputStyle.XML) { 1583 out.println("\t\t<likelySubtag from=\"" + printingLocale + 1584 "\" to=\"" + printingTarget + "\"" + 1585 "/>" + CldrUtility.LINE_SEPARATOR + "\t\t" + "<!--" + comment + "-->"); 1586 } else { 1587 if (first) { 1588 first = false; 1589 } else { 1590 out.print(","); 1591 } 1592 if (comment.length() > 70 && SEPARATOR.equals(CldrUtility.LINE_SEPARATOR)) { 1593 comment = printingName(printingLocale, spacing) + SEPARATOR + " // " + spacing + "=>" + spacing 1594 + printingName(printingTarget, spacing); 1595 } 1596 out.print( 1597 " {" 1598 + SEPARATOR + " // " + comment 1599 + SEPARATOR + " \"" + printingLocale + "\"," 1600 + SEPARATOR + " \"" + printingTarget + "\"" 1601 + CldrUtility.LINE_SEPARATOR + " }"); 1602 } 1603 } 1604 out.println(footer); 1605 out.close(); 1606 } 1607 printingName(String locale, String spacing)1608 public static String printingName(String locale, String spacing) { 1609 if (locale == null) { 1610 return null; 1611 } 1612 LanguageTagParser parser = new LanguageTagParser().set(locale); 1613 String lang = parser.getLanguage(); 1614 String script = parser.getScript(); 1615 String region = parser.getRegion(); 
1616 return "{" + spacing + 1617 (lang.equals("und") ? "?" : english.getName(CLDRFile.LANGUAGE_NAME, lang)) + ";" + spacing + 1618 (script == null || script.equals("") ? "?" : english.getName(CLDRFile.SCRIPT_NAME, script)) + ";" + spacing 1619 + 1620 (region == null || region.equals("") ? "?" : english.getName(CLDRFile.TERRITORY_NAME, region)) + spacing 1621 + "}"; 1622 } 1623 1624 private static final String[][] ALT_REVERSAL = { 1625 { "nb", "no" }, 1626 { "no", "nb" }, 1627 { "he", "iw" }, 1628 { "iw", "he" }, 1629 }; 1630 toAlt(String locale, boolean change)1631 public static String toAlt(String locale, boolean change) { 1632 if (!change || locale == null) { 1633 return locale; 1634 } 1635 String firstTag = getFirstTag(locale); 1636 for (String[] pair : ALT_REVERSAL) { 1637 if (firstTag.equals(pair[0])) { 1638 locale = pair[1] + locale.substring(pair[1].length()); 1639 break; 1640 } 1641 } 1642 locale = locale.replace("_", "-"); 1643 return locale; 1644 } 1645 getFirstTag(String locale)1646 private static String getFirstTag(String locale) { 1647 int pos = locale.indexOf('_'); 1648 return pos < 0 ? 
locale : locale.substring(0, pos); 1649 } 1650 1651 // private static Map<String, String> getBackMapping(Map<String, String> fluffup) { 1652 // Relation<String,String> backMap = new Relation(new TreeMap(), TreeSet.class, BEST_LANGUAGE_COMPARATOR); 1653 // for (String source : fluffup.keySet()) { 1654 // if (source.startsWith("und")) { 1655 // continue; 1656 // } 1657 // String maximized = fluffup.get(source); 1658 // backMap.put(maximized, source); // put in right order 1659 // } 1660 // Map<String,String> returnBackMap = new TreeMap(); 1661 // for (String maximized : backMap.keySet()) { 1662 // final Set<String> all = backMap.getAll(maximized); 1663 // final String minimized = all.iterator().next(); 1664 // returnBackMap.put(maximized, minimized); 1665 // } 1666 // return returnBackMap; 1667 // } 1668 1669 /** 1670 * Language tags are presumed to share the first language, except possibly "und". Best is least 1671 */ 1672 // private static Comparator BEST_LANGUAGE_COMPARATOR = new Comparator<String>() { 1673 // LanguageTagParser p1 = new LanguageTagParser(); 1674 // LanguageTagParser p2 = new LanguageTagParser(); 1675 // public int compare(String o1, String o2) { 1676 // if (o1.equals(o2)) return 0; 1677 // p1.set(o1); 1678 // p2.set(o2); 1679 // String lang1 = p1.getLanguage(); 1680 // String lang2 = p2.getLanguage(); 1681 // 1682 // // compare languages first 1683 // // put und at the end 1684 // int result = lang1.compareTo(lang2); 1685 // if (result != 0) { 1686 // if (lang1.equals("und")) return 1; 1687 // if (lang2.equals("und")) return -1; 1688 // return result; 1689 // } 1690 // 1691 // // now scripts and regions. 1692 // // if they have different numbers of fields, the shorter wins. 1693 // // If there are two fields, region is lowest. 
1694 // // The simplest way is to just compare scripts first 1695 // // so zh-TW < zh-Hant, because we first compare "" to Hant 1696 // String script1 = p1.getScript(); 1697 // String script2 = p2.getScript(); 1698 // int scriptOrder = script1.compareTo(script2); 1699 // if (scriptOrder != 0) return scriptOrder; 1700 // 1701 // String region1 = p1.getRegion(); 1702 // String region2 = p2.getRegion(); 1703 // int regionOrder = region1.compareTo(region2); 1704 // if (regionOrder != 0) return regionOrder; 1705 // 1706 // return o1.compareTo(o2); 1707 // } 1708 // 1709 // }; 1710 minimize(Map<String, String> fluffup)1711 public static void minimize(Map<String, String> fluffup) { 1712 LanguageTagParser parser = new LanguageTagParser(); 1713 LanguageTagParser targetParser = new LanguageTagParser(); 1714 Set<String> removals = new TreeSet<String>(); 1715 while (true) { 1716 removals.clear(); 1717 for (String locale : fluffup.keySet()) { 1718 String target = fluffup.get(locale); 1719 if (targetParser.set(target).getRegion().equals(UNKNOWN_REGION)) { 1720 removals.add(locale); 1721 if (SHOW_ADD) 1722 System.out.println("Removing:\t" + getName(locale) + "\t=>\t" + getName(target) 1723 + "\t\t - Unknown Region in target"); 1724 continue; 1725 } 1726 if (targetParser.getScript().equals(UNKNOWN_SCRIPT)) { 1727 removals.add(locale); 1728 if (SHOW_ADD) 1729 System.out.println("Removing:\t" + getName(locale) + "\t=>\t" + getName(target) 1730 + "\t\t - Unknown Script in target"); 1731 continue; 1732 } 1733 1734 String region = parser.set(locale).getRegion(); 1735 if (region.length() != 0) { 1736 if (region.equals(UNKNOWN_REGION)) { 1737 removals.add(locale); 1738 if (SHOW_ADD) 1739 System.out.println("Removing:\t" + getName(locale) + "\t=>\t" + getName(target) 1740 + "\t\t - Unknown Region in source"); 1741 continue; 1742 } 1743 parser.setRegion(""); 1744 String newLocale = parser.toString(); 1745 String newTarget = fluffup.get(newLocale); 1746 if (newTarget != null) { 1747 
newTarget = targetParser.set(newTarget).setRegion(region).toString(); 1748 if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) { 1749 removals.add(locale); 1750 if (SHOW_ADD) 1751 System.out.println("Removing:\t" + locale + "\t=>\t" + target + "\t\tRedundant with " 1752 + newLocale); 1753 continue; 1754 } 1755 } 1756 } 1757 String script = parser.set(locale).getScript(); 1758 if (locale.equals(DEBUG_ADD_KEY)) { 1759 System.out.println("*debug*"); 1760 } 1761 if (script.length() != 0) { 1762 if (script.equals(UNKNOWN_SCRIPT)) { 1763 removals.add(locale); 1764 if (SHOW_ADD) 1765 System.out.println("Removing:\t" + locale + "\t=>\t" + target + "\t\t - Unknown Script"); 1766 continue; 1767 } 1768 parser.setScript(""); 1769 String newLocale = parser.toString(); 1770 String newTarget = fluffup.get(newLocale); 1771 if (newTarget != null) { 1772 newTarget = targetParser.set(newTarget).setScript(script).toString(); 1773 if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) { 1774 removals.add(locale); 1775 if (SHOW_ADD) 1776 System.out.println("Removing:\t" + locale + "\t=>\t" + target + "\t\tRedundant with " 1777 + newLocale); 1778 continue; 1779 } 1780 } 1781 } 1782 } 1783 if (removals.size() == 0) { 1784 break; 1785 } 1786 for (String locale : removals) { 1787 fluffup.remove(locale); 1788 } 1789 } 1790 } 1791 1792 // private static void addLanguageScript(Map<String, String> fluffup, LanguageTagParser parser) { 1793 // // add script 1794 // Map<String, String> temp = new TreeMap<String, String>(); 1795 // while (true) { 1796 // temp.clear(); 1797 // for (String target : new TreeSet<String>(fluffup.values())) { 1798 // parser.set(target); 1799 // final String territory = parser.getRegion(); 1800 // if (territory.length() == 0) { 1801 // continue; 1802 // } 1803 // parser.setRegion(""); 1804 // String possibleSource = parser.toString(); 1805 // if (fluffup.containsKey(possibleSource)) { 1806 // continue; 1807 // } 1808 // String other = 
temp.get(possibleSource); 1809 // if (other != null) { 1810 // if (!target.equals(other)) { 1811 // System.out.println("**Failure with multiple sources in addLanguageScript: " 1812 // + possibleSource + "\t=>\t" + target + ", " + other); 1813 // } 1814 // continue; 1815 // } 1816 // temp.put(possibleSource, target); 1817 // if (SHOW_ADD) System.out.println("Adding:\t" + possibleSource + "\t=>\t" + target + "\t\tLanguage-Script"); 1818 // } 1819 // if (temp.size() == 0) { 1820 // break; 1821 // } 1822 // fluffup.putAll(temp); 1823 // } 1824 // 1825 // } 1826 1827 // private static void addLanguageCountry(Map<String, String> fluffup, LanguageTagParser parser) { 1828 // // add script 1829 // Map<String, String> temp = new TreeMap<String, String>(); 1830 // while (true) { 1831 // temp.clear(); 1832 // for (String target : new TreeSet<String>(fluffup.values())) { 1833 // parser.set(target); 1834 // String script = parser.getScript(); 1835 // if (script.length() == 0) { 1836 // continue; 1837 // } 1838 // parser.setScript(""); 1839 // String possibleSource = parser.toString(); 1840 // if (fluffup.containsKey(possibleSource)) { 1841 // continue; 1842 // } 1843 // String other = temp.get(possibleSource); 1844 // 1845 // if (other != null) { 1846 // if (!target.equals(other)) { 1847 // script = getScriptForLocale(possibleSource); 1848 // if (script == null) { 1849 // System.out.println("**Failure with multiple sources in addLanguageCountry: " 1850 // + possibleSource + "\t=>\t" + target + ", " + other); 1851 // continue; // error message in routine 1852 // } 1853 // parser.setScript(script); 1854 // target = parser.toString(); 1855 // } 1856 // } 1857 // 1858 // temp.put(possibleSource, target); 1859 // if (SHOW_ADD) System.out.println("Adding:\t" + possibleSource + "\t=>\t" + target + "\t\tLanguageCountry"); 1860 // } 1861 // if (temp.size() == 0) { 1862 // break; 1863 // } 1864 // fluffup.putAll(temp); 1865 // } 1866 // 1867 // } 1868 1869 // private static void 
addScript(Map<String, String> fluffup, LanguageTagParser parser) { 1870 // // add script 1871 // Map<String, String> temp = new TreeMap<String, String>(); 1872 // while (true) { 1873 // temp.clear(); 1874 // Set skipTarget = fluffup.keySet(); 1875 // for (String locale : fluffup.keySet()) { 1876 // String target = fluffup.get(locale); 1877 // parser.set(target); 1878 // if (parser.getScript().length() != 0) { 1879 // continue; 1880 // } 1881 // String script = getScriptForLocale(target); 1882 // 1883 // if (script == null) { 1884 // continue; // error message in routine 1885 // } 1886 // parser.setScript(script); 1887 // String furtherTarget = parser.toString(); 1888 // addIfNotIn(target, furtherTarget, temp, fluffup, "Script"); 1889 // } 1890 // if (temp.size() == 0) { 1891 // break; 1892 // } 1893 // fluffup.putAll(temp); 1894 // } 1895 // } 1896 1897 // private static String getScriptForLocale(String locale) { 1898 // String result = getScriptForLocale2(locale); 1899 // if (result != null) return result; 1900 // int pos = locale.indexOf('_'); 1901 // if (pos >= 0) { 1902 // result = getScriptForLocale2(locale.substring(0,pos)); 1903 // } 1904 // return result; 1905 // } 1906 1907 private static String UNKNOWN_SCRIPT = "Zzzz"; 1908 private static String UNKNOWN_REGION = "ZZ"; 1909 getScriptForLocale2(String locale)1910 private static String getScriptForLocale2(String locale) { 1911 String result = localeToScriptCache.get(locale); 1912 if (result != null) { 1913 return result; 1914 } 1915 if (locale.equals("ky")) { 1916 int debug = 0; 1917 } 1918 try { 1919 Map<Type, BasicLanguageData> data = supplementalData.getBasicLanguageDataMap(locale); 1920 if (data != null) { 1921 for (BasicLanguageData datum : data.values()) { 1922 final Set<String> scripts = datum.getScripts(); 1923 boolean isPrimary = datum.getType() == BasicLanguageData.Type.primary; 1924 if (scripts.size() != 1) { 1925 if (scripts.size() > 1 && isPrimary) { 1926 break; 1927 } 1928 continue; 1929 } 1930 
String script = scripts.iterator().next(); 1931 if (isPrimary) { 1932 return result = script; 1933 } else if (result == null) { 1934 result = script; 1935 } 1936 } 1937 if (result != null) { 1938 return result; 1939 } 1940 } 1941 CLDRFile cldrFile; 1942 try { 1943 cldrFile = factory.make(locale, true); 1944 } catch (RuntimeException e) { 1945 result = FALLBACK_SCRIPTS.get(locale); 1946 if (result == null) { 1947 System.out.println("***Failed to find script for: " + locale + "\t" + english.getName(locale)); 1948 return result = UNKNOWN_SCRIPT; 1949 } else { 1950 return result; 1951 } 1952 } 1953 UnicodeSet exemplars = getExemplarSet(cldrFile, ""); 1954 Set<String> CLDRScripts = getScriptsFromUnicodeSet(exemplars); 1955 CLDRScripts.remove(UNKNOWN_SCRIPT); 1956 if (CLDRScripts.size() == 1) { 1957 return result = CLDRScripts.iterator().next(); 1958 } else if (CLDRScripts.size() == 0) { 1959 System.out.println("**Failed to get script for:\t" + locale); 1960 return result = UNKNOWN_SCRIPT; 1961 } else { 1962 System.out.println("**Failed, too many scripts for:\t" + locale + ", " + CLDRScripts); 1963 return result = UNKNOWN_SCRIPT; 1964 } 1965 } finally { 1966 if (result.equals(UNKNOWN_SCRIPT)) { 1967 String temp = LANGUAGE_OVERRIDES.get(locale); 1968 if (temp != null) { 1969 result = new LanguageTagParser().set(temp).getScript(); 1970 System.out.println("Getting script from LANGUAGE_OVERRIDES for " + locale + " => " + result); 1971 } 1972 } 1973 localeToScriptCache.put(locale, result); 1974 if (SHOW_ADD) 1975 System.out.println("Script:\t" + locale + "\t" + english.getName(locale) + "\t=>\t" + result + "\t" 1976 + english.getName(CLDRFile.SCRIPT_NAME, result)); 1977 } 1978 } 1979 1980 // private static Map<String, String> closeMapping(Map<String, String> fluffup) { 1981 // if (SHOW_ADD) System.out.flush(); 1982 // Map<String,String> temp = new TreeMap<String,String>(); 1983 // while (true) { 1984 // temp.clear(); 1985 // for (String locale : fluffup.keySet()) { 1986 // 
//             String target = fluffup.get(locale);
//             if (target.equals("si_Sinh") || target.equals("zh-Hani")) {
//                 System.out.println("????");
//             }
//             String furtherTarget = fluffup.get(target);
//             if (furtherTarget == null) {
//                 continue;
//             }
//             addIfNotIn(locale, furtherTarget, temp, null, "Close");
//         }
//         if (temp.size() == 0) {
//             break;
//         }
//         fluffup.putAll(temp);
//     }
//     if (SHOW_ADD) System.out.flush();
//     return temp;
// }

    /**
     * Collects the short names of the scripts of every code point in {@code exemplars},
     * including each code point of any multi-character strings in the set. The Common and
     * Inherited scripts are left out.
     *
     * @param exemplars set of characters (and strings) to inspect
     * @return a new sorted set of script short names
     */
    public static Set<String> getScriptsFromUnicodeSet(UnicodeSet exemplars) {
        // gather numeric script codes in a bit set first, since that's faster
        BitSet seenScripts = new BitSet();
        final boolean trace = false; // flip by hand to dump each code point while debugging
        UnicodeSetIterator iter = new UnicodeSetIterator(exemplars);
        while (iter.next()) {
            if (trace) {
                System.out.println(Integer.toHexString(iter.codepoint));
            }
            if (iter.codepoint == UnicodeSetIterator.IS_STRING) {
                // string element: visit it one code point at a time
                int index = 0;
                while (index < iter.string.length()) {
                    int cp = UTF16.charAt(iter.string, index);
                    seenScripts.set(UScript.getScript(cp));
                    index += UTF16.getCharCount(cp);
                }
            } else {
                seenScripts.set(UScript.getScript(iter.codepoint));
            }
        }
        seenScripts.clear(UScript.COMMON);
        seenScripts.clear(UScript.INHERITED);

        Set<String> names = new TreeSet<String>();
        for (int code = seenScripts.nextSetBit(0); code >= 0; code = seenScripts.nextSetBit(code + 1)) {
            names.add(UScript.getShortName(code));
        }
        return names;
    }

    /**
     * Reads the exemplar character set at {@code //ldml/characters/exemplarCharacters} from
     * {@code cldrfile}.
     *
     * @param cldrfile file to read from
     * @param type value for the {@code type} attribute of the path; an empty string means
     *            the path is used without that attribute
     * @return the parsed set, or an empty set when the file has no value at that path
     */
    public static UnicodeSet getExemplarSet(CLDRFile cldrfile, String type) {
        String typeAttribute = type;
        if (typeAttribute.length() != 0) {
            typeAttribute = "[@type=\"" + typeAttribute + "\"]";
        }
        String pattern = cldrfile.getStringValue("//ldml/characters/exemplarCharacters"
            + typeAttribute);
        return pattern == null ? new UnicodeSet() : new UnicodeSet(pattern);
    }

// private static String[][] SpecialCases = {
//     { "zh_Hani", "zh_Hans_CN"},
//     { "si_Sinh", "si_Sinh_LK"},
//     { "ii", "ii_CN"}, // Sichuan Yi (Yi)
//     { "iu", "iu_CA"}, // Inuktitut (Unified Canadian Aboriginal Syllabics)
//     { "und", "en"}, // English default
// };

    /**
     * Compares the proposed default-content locales with those currently in the supplemental
     * data, prints the differences, and rewrites {@code defaultLocaleContent} in place with
     * the values of the reconciled mapping.
     * <p>
     * Both sets are turned into maps with
     * {@code SupplementalDataInfo.makeLocaleToDefaultContents}; any errors it reports are
     * printed. The maps are reconciled by {@link #compareMapsAndFixNew} with the override
     * {@code "ar" => "ar_001"}, and the rewritten set is converted once more purely to
     * report new errors.
     *
     * @param defaultLocaleContent proposed default-content locales; cleared and refilled
     */
    static void showDefaultContentDifferencesAndFix(Set<String> defaultLocaleContent) {
        Set<String> problems = new LinkedHashSet<String>();

        // mapping built from what the supplemental data currently holds
        Map<String, String> existing = SupplementalDataInfo.makeLocaleToDefaultContents(
            ConvertLanguageData.supplementalData.getDefaultContentLocales(), new TreeMap<String, String>(), problems);
        if (!problems.isEmpty()) {
            System.out.println(CollectionUtilities.join(problems, "\n"));
            problems.clear();
        }

        // mapping built from the proposed set
        Map<String, String> proposed = SupplementalDataInfo.makeLocaleToDefaultContents(defaultLocaleContent,
            new TreeMap<String, String>(), problems);
        if (!problems.isEmpty()) {
            System.out.println("Default Content errors: " + CollectionUtilities.join(problems, "\n"));
            problems.clear();
        }

        Set<String> report = compareMapsAndFixNew("*WARNING* Default Content: ", existing, proposed,
            "ar", "ar_001");
        System.out.println(CollectionUtilities.join(report, "\n"));

        defaultLocaleContent.clear();
        defaultLocaleContent.addAll(proposed.values());

        // Re-run the conversion on the rewritten set; only the collected errors are used.
        SupplementalDataInfo.makeLocaleToDefaultContents(defaultLocaleContent,
            new TreeMap<String, String>(), problems);
        if (!problems.isEmpty()) {
            System.out.println("***New Errors: " + CollectionUtilities.join(problems, "\n"));
        }
    }

    /**
     * Walks the union of the keys of both maps in sorted order, adjusts {@code newContent}
     * in place, and returns one message per key whose old and new values differ.
     * <p>
     * For each key, a matching override (if any) is first stored into {@code newContent}.
     * A key missing from {@code oldContent} is reported as an addition. A key that was
     * dropped or whose value changed is either reported and accepted, or - when
     * {@code SUPPRESS_CHANGES} is set - restored to its old value.
     *
     * @param title prefix for every returned message
     * @param oldContent previous mapping; only read
     * @param newContent new mapping; modified in place
     * @param allowedOverrideValues alternating key, value pairs forced into
     *            {@code newContent}; the loop reads them two at a time, so an odd count
     *            fails with an index error
     * @return sorted set of change messages
     */
    private static Set<String> compareMapsAndFixNew(String title,
        Map<String, String> oldContent,
        Map<String, String> newContent, String... allowedOverrideValues) {
        Map<String, String> forcedValues = new HashMap<String, String>();
        for (int i = 0; i < allowedOverrideValues.length; i += 2) {
            forcedValues.put(allowedOverrideValues[i], allowedOverrideValues[i + 1]);
        }

        Set<String> changes = new TreeSet<String>();
        // Iterate over a separate sorted copy of the keys, so newContent may be edited below.
        for (String parent : Builder.with(new TreeSet<String>()).addAll(newContent.keySet())
            .addAll(oldContent.keySet()).get()) {
            String oldValue = oldContent.get(parent);
            String newValue = newContent.get(parent);

            String forced = forcedValues.get(parent);
            if (forced != null) {
                newContent.put(parent, forced);
                newValue = forced;
            }
            if (CldrUtility.equals(oldValue, newValue)) {
                continue;
            }

            final String parentLabel = ConvertLanguageData.getLanguageCodeAndName(parent);
            String message;
            if (oldValue == null) {
                message = "Adding " + parentLabel + " => "
                    + ConvertLanguageData.getLanguageCodeAndName(newValue);
                newContent.put(parent, newValue);
            } else if (newValue == null) {
                if (!SUPPRESS_CHANGES) {
                    message = "Removing "
                        + parentLabel + " => "
                        + ConvertLanguageData.getLanguageCodeAndName(oldValue);
                    // NOTE(review): newContent is keyed by parent, yet a value is passed as
                    // the key to remove here - confirm this is intended.
                    newContent.remove(oldValue);
                } else {
                    message = "Suppressing removal of "
                        + parentLabel + " => "
                        + ConvertLanguageData.getLanguageCodeAndName(oldValue);
                    newContent.put(parent, oldValue);
                }
            } else {
                if (!SUPPRESS_CHANGES) {
                    message = "Changing "
                        + parentLabel + " => "
                        + ConvertLanguageData.getLanguageCodeAndName(oldValue) + " to "
                        + ConvertLanguageData.getLanguageCodeAndName(newValue);
                    // NOTE(review): as above, remove() receives a value, not a parent key.
                    newContent.remove(oldValue);
                    newContent.put(parent, newValue);
                } else {
                    message = "Suppressing change of "
                        + parentLabel + " => "
                        + ConvertLanguageData.getLanguageCodeAndName(oldValue) + " to "
                        + ConvertLanguageData.getLanguageCodeAndName(newValue);
                    // NOTE(review): as above, remove() receives a value, not a parent key.
                    newContent.remove(newValue);
                    newContent.put(parent, oldValue);
                }
            }
            changes.add(title + message);
        }
        return changes;
    }

    /**
     * Orders locale identifiers by language, then script, then region, with the language
     * {@code "und"} sorted after every other language. Identifiers that tie on all three
     * fields are ordered by plain string comparison.
     * <p>
     * Each instance reuses two parsers held in fields, so comparisons mutate instance state.
     */
    public static class LocaleStringComparator implements Comparator<String> {
        LanguageTagParser ltp0 = new LanguageTagParser();
        LanguageTagParser ltp1 = new LanguageTagParser();

        public int compare(String arg0, String arg1) {
            ltp0.set(arg0);
            ltp1.set(arg1);

            String language0 = ltp0.getLanguage();
            String language1 = ltp1.getLanguage();
            int order = language0.compareTo(language1);
            if (order != 0) {
                // "und" goes last, whatever its alphabetical position
                if (language0.equals("und")) {
                    return 1;
                }
                if (language1.equals("und")) {
                    return -1;
                }
                return order;
            }

            String script0 = ltp0.getScript();
            String script1 = ltp1.getScript();
            order = script0.compareTo(script1);
            if (order != 0) {
                return order;
            }

            String region0 = ltp0.getRegion();
            String region1 = ltp1.getRegion();
            order = region0.compareTo(region1);
            if (order != 0) {
                return order;
            }
            return arg0.compareTo(arg1); // just in case
        }

    }
}