1 package org.unicode.cldr.tool; 2 3 import java.io.BufferedReader; 4 import java.io.File; 5 import java.io.IOException; 6 import java.io.PrintWriter; 7 import java.nio.file.Files; 8 import java.util.Arrays; 9 import java.util.BitSet; 10 import java.util.Collection; 11 import java.util.Comparator; 12 import java.util.HashMap; 13 import java.util.HashSet; 14 import java.util.LinkedHashSet; 15 import java.util.List; 16 import java.util.Map; 17 import java.util.Map.Entry; 18 import java.util.Set; 19 import java.util.TreeMap; 20 import java.util.TreeSet; 21 22 import org.unicode.cldr.draft.FileUtilities; 23 import org.unicode.cldr.draft.ScriptMetadata; 24 import org.unicode.cldr.draft.ScriptMetadata.Info; 25 import org.unicode.cldr.util.Builder; 26 import org.unicode.cldr.util.CLDRConfig; 27 import org.unicode.cldr.util.CLDRFile; 28 import org.unicode.cldr.util.CLDRLocale; 29 import org.unicode.cldr.util.CLDRPaths; 30 import org.unicode.cldr.util.CldrUtility; 31 import org.unicode.cldr.util.Containment; 32 import org.unicode.cldr.util.Counter; 33 import org.unicode.cldr.util.Factory; 34 import org.unicode.cldr.util.Iso639Data; 35 import org.unicode.cldr.util.Iso639Data.Scope; 36 import org.unicode.cldr.util.LanguageTagParser; 37 import org.unicode.cldr.util.LocaleIDParser; 38 import org.unicode.cldr.util.Organization; 39 import org.unicode.cldr.util.PatternCache; 40 import org.unicode.cldr.util.SimpleFactory; 41 import org.unicode.cldr.util.StandardCodes; 42 import org.unicode.cldr.util.StandardCodes.LstrType; 43 import org.unicode.cldr.util.SupplementalDataInfo; 44 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData; 45 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData.Type; 46 import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus; 47 import org.unicode.cldr.util.SupplementalDataInfo.PopulationData; 48 import org.unicode.cldr.util.Validity; 49 import org.unicode.cldr.util.Validity.Status; 50 51 import 
com.google.common.base.Joiner; 52 import com.google.common.collect.ImmutableList; 53 import com.google.common.collect.ImmutableMap; 54 import com.google.common.collect.ImmutableSet; 55 import com.ibm.icu.impl.Relation; 56 import com.ibm.icu.impl.Row; 57 import com.ibm.icu.impl.Row.R2; 58 import com.ibm.icu.impl.Row.R3; 59 import com.ibm.icu.impl.Row.R4; 60 import com.ibm.icu.lang.UScript; 61 import com.ibm.icu.text.Collator; 62 import com.ibm.icu.text.NumberFormat; 63 import com.ibm.icu.text.UTF16; 64 import com.ibm.icu.text.UnicodeSet; 65 import com.ibm.icu.text.UnicodeSetIterator; 66 import com.ibm.icu.util.ULocale; 67 68 /** 69 * Problems: 70 * "und_Hani", "zh_Hani" 71 * "und_Sinh", "si_Sinh" 72 * 73 * @author markdavis 74 * 75 */ 76 public class GenerateMaximalLocales { 77 78 private static final Map<String, Status> LANGUAGE_CODE_TO_STATUS = Validity.getInstance().getCodeToStatus(LstrType.language); 79 80 private static final String TEMP_UNKNOWN_REGION = "XZ"; 81 82 private static final String DEBUG_ADD_KEY = "und_Latn_ZA"; 83 84 private static final boolean SHOW_ADD = CldrUtility.getProperty("GenerateMaximalLocalesDebug", false); 85 private static final boolean SUPPRESS_CHANGES = CldrUtility.getProperty("GenerateMaximalLocalesSuppress", false); 86 private static final boolean SHOW_CONTAINERS = false; 87 88 private static final boolean SHOW_ALL_LANGUAGE_CODES = false; 89 private static final boolean SHOW_DETAILED = false; 90 private static final boolean SHOW_INCLUDED_EXCLUDED = false; 91 enum OutputStyle { 92 PLAINTEXT, C, C_ALT, XML 93 } 94 95 private static OutputStyle OUTPUT_STYLE = OutputStyle.valueOf(CldrUtility.getProperty("OutputStyle", "XML", "XML") 96 .toUpperCase()); 97 98 // set based on above 99 private static final String SEPARATOR = OUTPUT_STYLE == OutputStyle.C || OUTPUT_STYLE == OutputStyle.C_ALT ? CldrUtility.LINE_SEPARATOR 100 : "\t"; 101 private static final String TAG_SEPARATOR = OUTPUT_STYLE == OutputStyle.C_ALT ? 
"-" : "_"; 102 // private static final boolean FAVOR_REGION = true; // OUTPUT_STYLE == OutputStyle.C_ALT; 103 104 private static final boolean tryDifferent = true; 105 106 private static final File list[] = { 107 new File(CLDRPaths.MAIN_DIRECTORY), 108 new File(CLDRPaths.SEED_DIRECTORY), 109 new File(CLDRPaths.EXEMPLARS_DIRECTORY) }; 110 111 private static Factory factory = SimpleFactory.make(list, ".*"); 112 private static Factory mainFactory = CLDRConfig.getInstance().getCldrFactory(); 113 private static SupplementalDataInfo supplementalData = SupplementalDataInfo 114 .getInstance(CLDRPaths.SUPPLEMENTAL_DIRECTORY); 115 private static StandardCodes standardCodes = StandardCodes.make(); 116 private static CLDRFile english = factory.make("en", false); 117 static Relation<String, String> cldrContainerToLanguages = Relation.of(new HashMap<String, Set<String>>(), HashSet.class); 118 static { 119 for (CLDRLocale locale : ToolConfig.getToolInstance().getCldrFactory().getAvailableCLDRLocales()) { 120 String region = locale.getCountry(); 121 if (region == null || region.isEmpty() || Containment.isLeaf(region)) { 122 continue; 123 } cldrContainerToLanguages.put(region, locale.getLanguage())124 cldrContainerToLanguages.put(region, locale.getLanguage()); 125 } cldrContainerToLanguages.freeze()126 cldrContainerToLanguages.freeze(); 127 System.out.println("Keep containers " + cldrContainerToLanguages); 128 } 129 130 private static final List<String> KEEP_TARGETS = Arrays.asList( 131 "und_Arab_PK", 132 "und_Latn_ET", 133 "hi_Latn" 134 ); 135 private static final ImmutableSet<String> deprecatedISONotInLST = ImmutableSet.of("scc", "scr"); 136 137 /** 138 * This is the simplest way to override, by supplying the max value. 139 * It gets a very low weight, so doesn't override any stronger value. 
*
     * Entries are also scanned in order to build FALLBACK_SCRIPTS (language to script), so when a
     * language is listed more than once the last entry supplies its fallback script. For that
     * reason the repeated "rhg_Arab_MM" at the end must stay where it is.
     */
    private static final String[] MAX_ADDITIONS = {
        "bss_Latn_CM",
        "gez_Ethi_ET",
        "ken_Latn_CM",
        "und_Arab_PK",
        "wa_Latn_BE",

        "fub_Arab_CM",
        "fuf_Latn_GN",
        "kby_Arab_NE",
        "kdh_Latn_TG",
        "apd_Arab_TG",
        "zlm_Latn_TG",

        "cr_Cans_CA",
        "hif_Latn_FJ",
        "gon_Telu_IN",
        "lzz_Latn_TR",
        "lif_Deva_NP",
        "unx_Beng_IN",
        "unr_Beng_IN",
        "ttt_Latn_AZ",
        "pnt_Grek_GR",
        "tly_Latn_AZ",
        "tkr_Latn_AZ",
        "bsq_Bass_LR",
        "ccp_Cakm_BD",
        "blt_Tavt_VN",
        "rhg_Arab_MM",
        "rhg_Rohg_MM",
        "clc_Latn_CA",
        "crg_Latn_CA",
        "hur_Latn_CA",
        "kwk_Latn_CA",
        "lil_Latn_CA",
        "ojs_Cans_CA",
        "oka_Latn_CA",
        "pqm_Latn_CA",

        "hi_Latn_IN",
        "no_Latn_NO",
        "und_Cpmn_CY",

        "hnj_Hmnp_US",
        "rhg_Arab_MM"
    };

    /**
     * The following overrides MASH the final values, so they may not result in consistent results. Safer is to add to MAX_ADDITIONS.
     * However, if you add, add both the language and language+script mappings.
     */
    // Many of the overrides below can be removed once the language/pop/country data is updated.
193 private static final Map<String, String> LANGUAGE_OVERRIDES = CldrUtility.asMap(new String[][] { 194 { "cic", "cic_Latn_US" }, 195 { "cic_Latn", "cic_Latn_US" }, 196 { "eo", "eo_Latn_001" }, 197 { "eo_Latn", "eo_Latn_001" }, 198 { "es", "es_Latn_ES" }, 199 { "es_Latn", "es_Latn_ES" }, 200 { "ff_BF", "ff_Latn_BF" }, 201 { "ff_GM", "ff_Latn_GM" }, 202 { "ff_GH", "ff_Latn_GH" }, 203 { "ff_GW", "ff_Latn_GW" }, 204 { "ff_LR", "ff_Latn_LR" }, 205 { "ff_NE", "ff_Latn_NE" }, 206 { "ff_NG", "ff_Latn_NG" }, 207 { "ff_SL", "ff_Latn_SL" }, 208 { "ff_Adlm", "ff_Adlm_GN" }, 209 { "ia", "ia_Latn_001" }, 210 { "ia_Latn", "ia_Latn_001" }, 211 { "io", "io_Latn_001" }, 212 { "io_Latn", "io_Latn_001" }, 213 { "jbo", "jbo_Latn_001" }, 214 { "jbo_Latn", "jbo_Latn_001" }, 215 { "ku_Arab", "ku_Arab_IQ" }, 216 { "lrc", "lrc_Arab_IR" }, 217 { "lrc_Arab", "lrc_Arab_IR" }, 218 { "man", "man_Latn_GM" }, 219 { "man_Latn", "man_Latn_GM" }, 220 { "mas", "mas_Latn_KE" }, 221 { "mas_Latn", "mas_Latn_KE" }, 222 { "mn", "mn_Cyrl_MN" }, 223 { "mn_Cyrl", "mn_Cyrl_MN" }, 224 { "mro", "mro_Mroo_BD" }, 225 { "mro_BD", "mro_Mroo_BD" }, 226 { "ms_Arab", "ms_Arab_MY" }, 227 { "pap", "pap_Latn_AW" }, 228 { "pap_Latn", "pap_Latn_AW" }, 229 { "prg", "prg_Latn_001" }, 230 { "prg_Latn", "prg_Latn_001" }, 231 { "rif", "rif_Tfng_MA" }, 232 { "rif_Latn", "rif_Latn_MA" }, 233 { "rif_Tfng", "rif_Tfng_MA" }, 234 { "rif_MA", "rif_Tfng_MA" }, 235 { "shi", "shi_Tfng_MA" }, 236 { "shi_Tfng", "shi_Tfng_MA" }, 237 { "shi_MA", "shi_Tfng_MA" }, 238 { "sr_Latn", "sr_Latn_RS" }, 239 { "ss", "ss_Latn_ZA" }, 240 { "ss_Latn", "ss_Latn_ZA" }, 241 { "swc", "swc_Latn_CD" }, 242 { "ti", "ti_Ethi_ET" }, 243 { "ti_Ethi", "ti_Ethi_ET" }, 244 { "und", "en_Latn_US" }, 245 { "und_Adlm", "ff_Adlm_GN" }, 246 { "und_Adlm_GN", "ff_Adlm_GN" }, 247 { "und_Arab", "ar_Arab_EG" }, 248 { "und_Arab_PK", "ur_Arab_PK" }, 249 { "und_Bopo", "zh_Bopo_TW" }, 250 { "und_Deva_FJ", "hif_Deva_FJ" }, 251 { "und_EZ", "de_Latn_EZ" }, 252 { "und_Hani", 
"zh_Hani_CN" }, 253 { "und_Hani_CN", "zh_Hani_CN" }, 254 { "und_Kana", "ja_Kana_JP" }, 255 { "und_Kana_JP", "ja_Kana_JP" }, 256 { "und_Latn", "en_Latn_US" }, 257 { "und_Latn_ET", "en_Latn_ET" }, 258 { "und_Latn_NE", "ha_Latn_NE" }, 259 { "und_Latn_PH", "fil_Latn_PH" }, 260 { "und_ML", "bm_Latn_ML" }, 261 { "und_Latn_ML", "bm_Latn_ML" }, 262 { "und_MU", "mfe_Latn_MU" }, 263 { "und_NE", "ha_Latn_NE" }, 264 { "und_PH", "fil_Latn_PH" }, 265 { "und_PK", "ur_Arab_PK" }, 266 { "und_SO", "so_Latn_SO" }, 267 { "und_SS", "en_Latn_SS" }, 268 { "und_TK", "tkl_Latn_TK" }, 269 { "und_UN", "en_Latn_UN" }, 270 { "und_005", "pt_Latn_BR" }, 271 { "vo", "vo_Latn_001" }, 272 { "vo_Latn", "vo_Latn_001" }, 273 { "yi", "yi_Hebr_001" }, 274 { "yi_Hebr", "yi_Hebr_001" }, 275 { "yue", "yue_Hant_HK" }, 276 { "yue_Hant", "yue_Hant_HK" }, 277 { "yue_Hans", "yue_Hans_CN" }, 278 { "yue_CN", "yue_Hans_CN" }, 279 { "zh_Hani", "zh_Hani_CN" }, 280 281 { "zh_Bopo", "zh_Bopo_TW" }, 282 { "ccp", "ccp_Cakm_BD" }, 283 { "ccp_Cakm", "ccp_Cakm_BD" }, 284 { "und_Cakm", "ccp_Cakm_BD" }, 285 { "cu_Glag", "cu_Glag_BG" }, 286 { "sd_Khoj", "sd_Khoj_IN" }, 287 { "lif_Limb", "lif_Limb_IN" }, 288 { "grc_Linb", "grc_Linb_GR" }, 289 { "arc_Nbat", "arc_Nbat_JO" }, 290 { "arc_Palm", "arc_Palm_SY" }, 291 { "pal_Phlp", "pal_Phlp_CN" }, 292 { "en_Shaw", "en_Shaw_GB" }, 293 { "sd_Sind", "sd_Sind_IN" }, 294 { "und_Brai", "fr_Brai_FR" }, // hack 295 { "und_Hanb", "zh_Hanb_TW" }, // Special script code 296 { "zh_Hanb", "zh_Hanb_TW" }, // Special script code 297 { "und_Jamo", "ko_Jamo_KR" }, // Special script code 298 299 //{"und_Cyrl_PL", "be_Cyrl_PL"}, 300 301 // {"cr", "cr_Cans_CA"}, 302 // {"hif", "hif_Latn_FJ"}, 303 // {"gon", "gon_Telu_IN"}, 304 // {"lzz", "lzz_Latn_TR"}, 305 // {"lif", "lif_Deva_NP"}, 306 // {"unx", "unx_Beng_IN"}, 307 // {"unr", "unr_Beng_IN"}, 308 // {"ttt", "ttt_Latn_AZ"}, 309 // {"pnt", "pnt_Grek_GR"}, 310 // {"tly", "tly_Latn_AZ"}, 311 // {"tkr", "tkr_Latn_AZ"}, 312 // {"bsq", "bsq_Bass_LR"}, 313 
// {"ccp", "ccp_Cakm_BD"}, 314 // {"blt", "blt_Tavt_VN"}, 315 // { "mis_Medf", "mis_Medf_NG" }, 316 317 { "ku_Yezi", "ku_Yezi_GE" }, 318 { "und_EU", "en_Latn_IE" }, 319 }); 320 321 /** 322 * The following supplements the suppress-script. It overrides info from exemplars and the locale info. 323 */ 324 private static String[][] SpecialScripts = { 325 { "zh", "Hans" }, // Hans (not Hani) 326 { "yue", "Hant" }, // Hans (not Hani) 327 { "chk", "Latn" }, // Chuukese (Micronesia) 328 { "fil", "Latn" }, // Filipino (Philippines)" 329 { "ko", "Kore" }, // Korean (North Korea) 330 { "ko_KR", "Kore" }, // Korean (North Korea) 331 { "pap", "Latn" }, // Papiamento (Netherlands Antilles) 332 { "pau", "Latn" }, // Palauan (Palau) 333 { "su", "Latn" }, // Sundanese (Indonesia) 334 { "tet", "Latn" }, // Tetum (East Timor) 335 { "tk", "Latn" }, // Turkmen (Turkmenistan) 336 { "ty", "Latn" }, // Tahitian (French Polynesia) 337 { "ja", "Jpan" }, // Special script for japan 338 { "und", "Latn" }, // Ultimate fallback 339 }; 340 341 private static Map<String, String> localeToScriptCache = new TreeMap<>(); 342 static { 343 for (String language : standardCodes.getAvailableCodes("language")) { 344 Map<String, String> info = standardCodes.getLangData("language", language); 345 String script = info.get("Suppress-Script"); 346 if (script != null) { localeToScriptCache.put(language, script)347 localeToScriptCache.put(language, script); 348 } 349 } 350 for (String[] pair : SpecialScripts) { localeToScriptCache.put(pair[0], pair[1])351 localeToScriptCache.put(pair[0], pair[1]); 352 } 353 } 354 355 private static Map<String, String> FALLBACK_SCRIPTS; 356 static { 357 LanguageTagParser additionLtp = new LanguageTagParser(); 358 Map<String, String> _FALLBACK_SCRIPTS = new TreeMap<>(); 359 for (String addition : MAX_ADDITIONS) { 360 additionLtp.set(addition); 361 String lan = additionLtp.getLanguage(); _FALLBACK_SCRIPTS.put(lan, additionLtp.getScript())362 _FALLBACK_SCRIPTS.put(lan, 
additionLtp.getScript()); 363 } 364 FALLBACK_SCRIPTS = ImmutableMap.copyOf(_FALLBACK_SCRIPTS); 365 } 366 367 private static int errorCount; 368 main(String[] args)369 public static void main(String[] args) throws IOException { 370 371 printDefaultLanguagesAndScripts(); 372 373 Map<String, String> toMaximized = new TreeMap<>(); 374 375 tryDifferentAlgorithm(toMaximized); 376 377 minimize(toMaximized); 378 379 // HACK TEMP_UNKNOWN_REGION 380 // this is to get around the removal of items with ZZ in minimize. 381 // probably cleaner way to do it, but this provides control over just those we want to retain. 382 Set<String> toRemove = new TreeSet<>(); 383 Map<String, String> toFix = new TreeMap<>(); 384 for (Entry<String, String> entry : toMaximized.entrySet()) { 385 String key = entry.getKey(); 386 String value = entry.getValue(); 387 if (key.contains(TEMP_UNKNOWN_REGION)) { 388 toRemove.add(key); 389 } else if (value.contains(TEMP_UNKNOWN_REGION)) { 390 toFix.put(key, value.replace(TEMP_UNKNOWN_REGION, UNKNOWN_REGION)); 391 } 392 } 393 for (String key : toRemove) { 394 toMaximized.remove(key); 395 } 396 toMaximized.putAll(toFix); 397 398 Map<String, String> oldLikely = SupplementalDataInfo.getInstance().getLikelySubtags(); 399 Set<String> changes = compareMapsAndFixNew("*WARNING* Likely Subtags: ", oldLikely, toMaximized, "ms_Arab", 400 "ms_Arab_ID"); 401 System.out.println(Joiner.on("\n").join(changes)); 402 403 if (OUTPUT_STYLE == OutputStyle.C_ALT) { 404 doAlt(toMaximized); 405 } 406 407 if (SHOW_ADD) 408 System.out 409 .println("/*" 410 + CldrUtility.LINE_SEPARATOR 411 + " To Maximize:" 412 + 413 CldrUtility.LINE_SEPARATOR 414 + " If using raw strings, make sure the input language/locale uses the right separator, and has the right casing." 415 + 416 CldrUtility.LINE_SEPARATOR 417 + " Remove the script Zzzz and the region ZZ if they occur; change an empty language subtag to 'und'." 
418 + 419 CldrUtility.LINE_SEPARATOR 420 + " Get the language, region, and script from the cleaned-up tag, plus any variants/extensions" 421 + 422 CldrUtility.LINE_SEPARATOR 423 + " Try each of the following in order (where the field exists)" 424 + 425 CldrUtility.LINE_SEPARATOR 426 + " Lookup language-script-region. If in the table, return the result + variants" 427 + 428 CldrUtility.LINE_SEPARATOR 429 + " Lookup language-script. If in the table, return the result (substituting the original region if it exists) + variants" 430 + 431 CldrUtility.LINE_SEPARATOR 432 + " Lookup language-region. If in the table, return the result (substituting the original script if it exists) + variants" 433 + 434 CldrUtility.LINE_SEPARATOR 435 + " Lookup language. If in the table, return the result (substituting the original region and script if either or both exist) + variants" 436 + 437 CldrUtility.LINE_SEPARATOR 438 + 439 CldrUtility.LINE_SEPARATOR 440 + " Example: Input is zh-ZZZZ-SG." 441 + 442 CldrUtility.LINE_SEPARATOR 443 + " Normalize to zh-SG. Lookup in table. No match." 444 + 445 CldrUtility.LINE_SEPARATOR 446 + " Remove SG, but remember it. Lookup zh, and get the match (zh-Hans-CN). Substitute SG, and return zh-Hans-SG." 447 + 448 CldrUtility.LINE_SEPARATOR 449 + 450 CldrUtility.LINE_SEPARATOR 451 + " To Minimize:" 452 + 453 CldrUtility.LINE_SEPARATOR 454 + " First get max = maximize(input)." 455 + 456 CldrUtility.LINE_SEPARATOR 457 + " Then for trial in {language, language-region, language-script}" 458 + 459 CldrUtility.LINE_SEPARATOR 460 + " If maximize(trial) == max, then return trial." 461 + 462 CldrUtility.LINE_SEPARATOR 463 + " If you don't get a match, return max." 464 + 465 CldrUtility.LINE_SEPARATOR 466 + 467 CldrUtility.LINE_SEPARATOR 468 + " Example: Input is zh-Hant. Maximize to get zh-Hant-TW." 469 + 470 CldrUtility.LINE_SEPARATOR 471 + " zh => zh-Hans-CN. No match, so continue." 472 + 473 CldrUtility.LINE_SEPARATOR 474 + " zh-TW => zh-Hans-TW. 
Match, so return zh-TW." 475 + 476 CldrUtility.LINE_SEPARATOR 477 + 478 CldrUtility.LINE_SEPARATOR 479 + " (A variant of this uses {language, language-script, language-region}): that is, tries script before language." 480 + 481 CldrUtility.LINE_SEPARATOR + " toMaximal size:\t" + toMaximized.size() + 482 CldrUtility.LINE_SEPARATOR + "*/"); 483 484 final File newLikelySubtags = printLikelySubtags(toMaximized); 485 486 printDefaultContent(toMaximized); 487 488 // Do this here so the two "Copying…" messages show up together. 489 if (OUTPUT_STYLE == OutputStyle.XML) { 490 final File oldLikelySubtags = CLDRConfig.getInstance().getEnglish().getSupplementalFile("likelySubtags.xml"); 491 System.out.println("Copying " + newLikelySubtags + " to " + oldLikelySubtags); 492 oldLikelySubtags.delete(); 493 Files.copy(newLikelySubtags.toPath(), oldLikelySubtags.toPath()); 494 } 495 496 System.out.println(CldrUtility.LINE_SEPARATOR + "ERRORS:\t" + errorCount + CldrUtility.LINE_SEPARATOR); 497 498 System.exit(errorCount > 0 ? 1 : 0); 499 } 500 501 static class RowData implements Comparable<RowData> { 502 OfficialStatus os; 503 String name; 504 Long pop; 505 RowData(OfficialStatus os, String name, Long pop)506 public RowData(OfficialStatus os, String name, Long pop) { 507 this.os = os; 508 this.name = name; 509 this.pop = pop; 510 } 511 getStatus()512 public OfficialStatus getStatus() { 513 // TODO Auto-generated method stub 514 return os; 515 } 516 getName()517 public CharSequence getName() { 518 // TODO Auto-generated method stub 519 return name; 520 } 521 getLiteratePopulation()522 public Long getLiteratePopulation() { 523 // TODO Auto-generated method stub 524 return pop; 525 } 526 527 @Override compareTo(RowData o)528 public int compareTo(RowData o) { 529 // TODO Auto-generated method stub 530 int result = os.compareTo(o.os); 531 if (result != 0) return -result; 532 long result2 = pop - o.pop; 533 if (result2 != 0) return result2 < 0 ? 
1 : -1; 534 return name.compareTo(o.name); 535 } 536 537 @Override equals(Object o)538 public boolean equals(Object o) { 539 return 0 == compareTo((RowData) o); 540 } 541 542 @Override hashCode()543 public int hashCode() { 544 throw new UnsupportedOperationException(); 545 } 546 } 547 printDefaultLanguagesAndScripts()548 private static void printDefaultLanguagesAndScripts() { 549 550 final int minTotalPopulation = 10000000; 551 final int minTerritoryPopulation = 1000000; 552 final double minTerritoryPercent = 1.0 / 3; 553 Map<String, Set<RowData>> languageToReason = new TreeMap<>(); 554 Counter<String> languageToLiteratePopulation = new Counter<>(); 555 NumberFormat nf = NumberFormat.getIntegerInstance(ULocale.ENGLISH); 556 nf.setGroupingUsed(true); 557 LanguageTagParser ltp = new LanguageTagParser(); 558 LikelySubtags likelySubtags = new LikelySubtags(); 559 /* 560 * A. X is a qualified language**, and at least one of the following is true: 561 * 562 * 1. X is has official status* in any country 563 * 2. X exceeds a threshold population† of literate users worldwide: 1M 564 * 3. X exceeds a threshold population† in some country Z: 100K and 20% of Z's population†. 565 * 566 * B. X is an exception explicitly approved by the committee or X has minimal 567 * language coverage‡ in CLDR itself. 568 * C. 
The language is in the CLDR-target locales 569 */ 570 OfficialStatus minimalStatus = OfficialStatus.official_regional; // OfficialStatus.de_facto_official; 571 Map<String, String> languages = new TreeMap<>(); 572 for (String language : standardCodes.getAvailableCodes("language")) { 573 String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, language); 574 String result = english.getStringValue(path); 575 if (result != null) { 576 languages.put(language, result); 577 } 578 } 579 580 if (SHOW_ALL_LANGUAGE_CODES) { 581 for (String language : languages.keySet()) { 582 System.out.println(language + "\t" + languages.get(language)); 583 } 584 } else { 585 System.out.println("- GenerateMaximalLocales.java: SHOW_ALL_LANGUAGE_CODES=true to show all language codes"); 586 } 587 588 // also CLDR-target locales 589 final Set<String> CLDRMainLanguages = new TreeSet<>(StandardCodes.make().getLocaleCoverageLocales(Organization.cldr)); 590 591 for (String territory : supplementalData.getTerritoriesWithPopulationData()) { 592 PopulationData territoryPop = supplementalData.getPopulationDataForTerritory(territory); 593 double territoryPopulation = territoryPop.getLiteratePopulation(); 594 for (String languageScript : supplementalData.getLanguagesForTerritoryWithPopulationData(territory)) { 595 PopulationData popData = supplementalData.getLanguageAndTerritoryPopulationData(languageScript, 596 territory); 597 ltp.set(languageScript); 598 String language = ltp.getLanguage(); 599 // if (ltp.getScript().isEmpty()) { 600 // String max = likelySubtags.maximize(languageScript); 601 // if (max != null) { 602 // ltp.set(max).setRegion(""); 603 // languageScript = ltp.toString(); 604 // } 605 // } 606 boolean add = false; 607 // #1 608 OfficialStatus status = popData.getOfficialStatus(); 609 if (status.compareTo(minimalStatus) >= 0) { 610 add = true; 611 } 612 long literatePopulation = getWritingPopulation(popData); 613 // #2 614 languageToLiteratePopulation.add(language, literatePopulation); 615 
// #3 616 if (literatePopulation > minTerritoryPopulation 617 && literatePopulation > minTerritoryPercent * territoryPopulation) { 618 add = true; 619 } 620 if (add == false && CLDRMainLanguages.contains(language)) { 621 add = true; 622 } 623 if (add) { 624 add(languageToReason, language, territory, status, literatePopulation); 625 // Add the containing regions 626 for (String container : Containment.leafToContainer(territory)) { 627 add(languageToReason, language, container, OfficialStatus.unknown, literatePopulation); 628 } 629 } 630 } 631 } 632 // #2, now that we have the data 633 for (String language : languageToLiteratePopulation.keySet()) { 634 long totalPop = languageToLiteratePopulation.getCount(language); 635 if (totalPop > minTotalPopulation) { 636 add(languageToReason, language, "001", OfficialStatus.unknown, totalPop); 637 } 638 } 639 640 // Specials 641 add(languageToReason, "und", "001", OfficialStatus.unknown, 0); 642 643 // for (String language : Iso639Data.getAvailable()) { 644 // Scope scope = Iso639Data.getScope(language); 645 // Type type = Iso639Data.getType(language); 646 // if (scope == Scope.Special) { 647 // add(languageToReason, language, "001", OfficialStatus.unknown, -1); 648 // } 649 // } 650 // print them 651 652 System.out.println("Detailed - Including:\t" + languageToReason.size()); 653 654 if (!SHOW_DETAILED) { 655 System.out.println("- GenerateMaximalLocales.java: SHOW_DETAILED=true to show more details"); 656 } else { 657 for (String language : languageToReason.keySet()) { 658 Set<RowData> reasons = languageToReason.get(language); 659 660 RowData lastReason = reasons.iterator().next(); 661 662 System.out.append(language) 663 .append("\t") 664 .append(english.getName(language)) 665 .append("\t") 666 .append(lastReason.getStatus().toShortString()) 667 .append("\t") 668 .append(nf.format(languageToLiteratePopulation.getCount(language))); 669 for (RowData reason : reasons) { 670 String status = reason.getStatus().toShortString(); 671 
System.out.append("\t") 672 .append(status) 673 .append("-") 674 .append(reason.getName()) 675 .append("-") 676 .append(nf.format(reason.getLiteratePopulation())); 677 } 678 System.out.append("\n"); 679 } 680 } 681 682 // now list them 683 684 Set<String> others = new TreeSet<>(); 685 others.addAll(standardCodes.getGoodAvailableCodes("language")); 686 others.removeAll(languageToReason.keySet()); 687 System.out.println("\nIncluded Languages:\t" + languageToReason.keySet().size()); 688 if (SHOW_INCLUDED_EXCLUDED) { 689 showLanguages(languageToReason.keySet(), languageToReason); 690 } 691 System.out.println("\nExcluded Languages:\t" + others.size()); 692 if (SHOW_INCLUDED_EXCLUDED) { 693 showLanguages(others, languageToReason); 694 } else { 695 System.out.println(" - GenerateMaximalLocales.java: set SHOW_INCLUDED_EXCLUDED=true to show reason details"); 696 } 697 } 698 getWritingPopulation(PopulationData popData)699 private static long getWritingPopulation(PopulationData popData) { 700 final double writingPopulation = popData.getWritingPopulation(); 701 if (!Double.isNaN(writingPopulation)) { 702 return (long) writingPopulation; 703 } 704 return (long) popData.getLiteratePopulation(); 705 } 706 showLanguages(Set<String> others, Map<String, Set<RowData>> languageToReason)707 private static void showLanguages(Set<String> others, Map<String, Set<RowData>> languageToReason) { 708 Set<String> sorted = new TreeSet<>(Collator.getInstance(ULocale.ENGLISH)); 709 for (String language : others) { 710 sorted.add(getLanguageName(language, languageToReason)); 711 } 712 char last = 0; 713 for (String language : sorted) { 714 final char curr = language.charAt(0); 715 if (last != curr) { 716 System.out.println(); 717 } else if (last != '\u0000') { 718 System.out.print(", "); 719 } 720 System.out.print(language); 721 last = curr; 722 } 723 System.out.println(); 724 } 725 getLanguageName(String language, Map<String, Set<RowData>> languageToReason)726 private static String 
getLanguageName(String language, 727 Map<String, Set<RowData>> languageToReason) { 728 OfficialStatus best = OfficialStatus.unknown; 729 Set<RowData> reasons = languageToReason.get(language); 730 if (reasons != null) { 731 for (RowData reason : reasons) { 732 final OfficialStatus currentStatus = reason.getStatus(); 733 if (best.compareTo(currentStatus) < 0) { 734 best = currentStatus; 735 } 736 } 737 } 738 String status = best.toShortString(); 739 Scope scope = Iso639Data.getScope(language); 740 if (scope == Scope.Special) { 741 status = "S"; 742 } 743 String languageFormatted = english.getName(language) + " [" + language + "]-" + status; 744 return languageFormatted; 745 } 746 add(Map<String, Set<RowData>> languageToReason, String language, String territoryRaw, OfficialStatus status, long population)747 private static void add(Map<String, Set<RowData>> languageToReason, String language, 748 String territoryRaw, OfficialStatus status, long population) { 749 String territory = english.getName("territory", territoryRaw) + " [" + territoryRaw + "]"; 750 Set<RowData> set = languageToReason.get(language); 751 if (set == null) { 752 languageToReason.put(language, set = new TreeSet<>()); 753 } 754 set.add(new RowData(status, territory, population)); 755 } 756 757 /** 758 * In computing the defaultContents, no and nb require special handling. 759 */ 760 static final Map<String, String> SPECIAL_CHILD_TO_PARENT = ImmutableMap.of("nb", "no", "nb_NO", "nb"); 761 762 /* 763 * Compute the defaultContent values for supplemental data. 764 * It uses the maximization data and the simpleParent (truncation). 
765 * We can't use the normal "getParent" because that messes up the logic 766 * used to handle inconsistencies in scripts in CLDR.<br> 767 * That is, there are three situations: <ul> 768 * <li>all children have explicit scripts; </li> 769 * <li>no children have scripts; and </li> 770 * <li>some do and some don't</li></ul> 771 */ 772 printDefaultContent(Map<String, String> toMaximized)773 private static void printDefaultContent(Map<String, String> toMaximized) throws IOException { 774 775 Set<String> defaultLocaleContent = new TreeSet<>(); 776 777 // go through all the cldr locales, and add default contents 778 // now computed from toMaximized 779 Set<String> available = factory.getAvailable(); 780 Relation<String, String> toSimpleChildren = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class); 781 LanguageTagParser ltp = new LanguageTagParser(); 782 783 // System.out.println(maximize("az_Latn_AZ", toMaximized)); 784 Set<String> hasSimpleChildWithScript = new TreeSet<>(); 785 786 // first get a mapping to children 787 for (String locale : available) { 788 if (locale.equals("root")) { 789 continue; 790 } 791 if (ltp.set(locale).getVariants().size() != 0) { 792 continue; 793 } 794 String parent = SPECIAL_CHILD_TO_PARENT.get(locale); 795 if (parent == null) { 796 parent = LocaleIDParser.getSimpleParent(locale); // we can't use the regular getParent (see above) 797 } 798 799 if (ltp.getScript().length() != 0) { 800 hasSimpleChildWithScript.add(parent); 801 } 802 if (parent.equals("root")) { 803 continue; 804 } 805 toSimpleChildren.put(parent, locale); 806 } 807 808 // Suppress script for locales for which we only have one locale in common/main. See ticket #7834. 
809 Set<String> suppressScriptLocales = new HashSet<>(Arrays.asList( 810 "bm_ML", "en_US", "ha_NG", "iu_CA", "ms_MY", "mn_MN", 811 "byn_ER", "ff_SN", "dyo_SN", "kk_KZ", "ku_TR", "ky_KG", "ml_IN", "so_SO", "sw_TZ", "wo_SN", "yo_NG", "dje_NE", 812 "blt_VN", 813 "hi_IN", 814 "nv_US", 815 "doi_IN" 816 )); 817 818 // if any have a script, then throw out any that don't have a script (unless they're specifically included.) 819 Set<String> toRemove = new TreeSet<>(); 820 for (String locale : hasSimpleChildWithScript) { 821 toRemove.clear(); 822 Set<String> children = toSimpleChildren.getAll(locale); 823 for (String child : children) { 824 if (ltp.set(child).getScript().length() == 0 && !suppressScriptLocales.contains(child)) { 825 toRemove.add(child); 826 } 827 } 828 if (toRemove.size() != 0) { 829 System.out.println("\tRemoving:\t" + locale + "\t" + toRemove + "\tfrom\t" + children); 830 toSimpleChildren.removeAll(locale, toRemove); 831 } 832 } 833 834 // we add a child as a default locale if it has the same maximization 835 main: for (String locale : toSimpleChildren.keySet()) { 836 String maximized = maximize(locale, toMaximized); 837 if (maximized == null) { 838 if (SHOW_ADD) System.out.println("Missing maximized:\t" + locale); 839 continue; 840 } 841 Set<String> children = toSimpleChildren.getAll(locale); 842 Map<String, String> debugStuff = new TreeMap<>(); 843 for (String child : children) { 844 String maximizedChild = maximize(child, toMaximized); 845 if (maximized.equals(maximizedChild)) { 846 defaultLocaleContent.add(child); 847 continue main; 848 } 849 debugStuff.put(child, maximizedChild); 850 } 851 if (SHOW_ADD) System.out.println("Can't find maximized: " + locale + "=" + maximized 852 + "\tin\t" + debugStuff); 853 } 854 855 for (String specialChild : SPECIAL_CHILD_TO_PARENT.keySet()) { 856 defaultLocaleContent.add(specialChild); 857 } 858 defaultLocaleContent.remove("und_ZZ"); // und_ZZ isn't ever a real locale. 
        defaultLocaleContent.remove("mul_ZZ"); // mul_ZZ isn't ever a real locale.

        showDefaultContentDifferencesAndFix(defaultLocaleContent);

        // The regenerated supplementalMetadata.xml is first written under GEN_DIRECTORY, then copied
        // over the checked-in file in SUPPLEMENTAL_DIRECTORY.
        final File genSuppDir = new File(CLDRPaths.GEN_DIRECTORY, "supplemental");
        final File genSuppMetadataFile = new File(genSuppDir, "supplementalMetadata.xml");
        final File oldSuppMetadataFile = new File(CLDRPaths.SUPPLEMENTAL_DIRECTORY, "supplementalMetadata.xml");

        try (
            PrintWriter genFile = FileUtilities.openUTF8Writer(genSuppMetadataFile);
            BufferedReader oldFile = FileUtilities.openUTF8Reader(oldSuppMetadataFile);) {
            // copy the old file up to the <defaultContent locales="..."> element
            CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*<defaultContent locales=\"\\s*"), genFile, false);

            // emit the new locale list, wrapped at roughly 80 columns
            String sep = CldrUtility.LINE_SEPARATOR + "\t\t\t";
            String broken = CldrUtility.breakLines(CldrUtility.join(defaultLocaleContent, " "), sep,
                PatternCache.get("(\\S)\\S*").matcher(""), 80);

            genFile.println("\t\t<defaultContent locales=\"" + broken + "\"");
            genFile.println("\t\t/>");

            // genFile.println("</supplementalData>");
            CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*/>\\s*(<!--.*)?"), null, true); // skip to matching >
            CldrUtility.copyUpTo(oldFile, null, genFile, true); // copy the rest
        }

        // Move it into place
        System.out.println("Copying generated " + genSuppMetadataFile + " to " + oldSuppMetadataFile);
        // NOTE(review): the result of delete() is ignored; if the delete fails, Files.copy (called without
        // REPLACE_EXISTING) will presumably throw because the target still exists -- confirm.
        oldSuppMetadataFile.delete();
        Files.copy(genSuppMetadataFile.toPath(), oldSuppMetadataFile.toPath());
    }

    /**
     * Accumulates (language, script, region) triples together with an "order" value, indexed by
     * every single component and every pair of components, plus by containing region.
     * Each index keeps its rows in a TreeSet, so the rows for a key are kept sorted.
     * NOTE(review): presumably rows sort by their leading Double, so the first row of a key is the
     * one with the smallest order value -- confirm against Row's comparison.
     */
    private static class MaxData {
        // language => (order, script, region)
        Relation<String, Row.R3<Double, String, String>> languages = Relation.of(new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class);
        // NOTE(review): the addCounter(...) calls that would fill the *To* Counter maps are commented out
        // in add(), so within this class these maps are never populated.
        Map<String, Counter<String>> languagesToScripts = new TreeMap<>();
        Map<String, Counter<String>> languagesToRegions = new TreeMap<>();

        // script => (order, language, region)
        Relation<String, Row.R3<Double, String, String>> scripts = Relation.of(new TreeMap<String, Set<Row.R3<Double, String, String>>>(),
            TreeSet.class);
        Map<String, Counter<String>> scriptsToLanguages = new TreeMap<>();
        Map<String, Counter<String>> scriptsToRegions = new TreeMap<>();

        // region => (order, language, script)
        Relation<String, Row.R3<Double, String, String>> regions = Relation.of(new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class);
        Map<String, Counter<String>> regionsToLanguages = new TreeMap<>();
        Map<String, Counter<String>> regionsToScripts = new TreeMap<>();

        // container region => summed order per (language, script)
        Map<String, Counter<Row.R2<String, String>>> containersToLanguage = new TreeMap<>();
        // container region => (order, language, script, leaf region)
        Relation<String, Row.R4<Double, String, String, String>> containersToLangRegion = Relation.of(
            new TreeMap<String, Set<Row.R4<Double, String, String, String>>>(), TreeSet.class);

        // (language, script) => (order, region)
        Relation<Row.R2<String, String>, Row.R2<Double, String>> languageScripts = Relation.of(
            new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(),
            TreeSet.class);
        // (script, region) => (order, language)
        Relation<Row.R2<String, String>, Row.R2<Double, String>> scriptRegions = Relation.of(
            new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(),
            TreeSet.class);
        // (language, region) => (order, script)
        Relation<Row.R2<String, String>, Row.R2<Double, String>> languageRegions = Relation.of(
            new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(),
            TreeSet.class);

        /**
         * Add population information. "order" is the negative of the population (makes the first be the highest).
         * The triple is recorded under each single component, each pair of components, and each
         * container returned by {@code Containment.leafToContainer(region)}.
         *
         * @param language language subtag of the triple
         * @param script script subtag of the triple
         * @param region region subtag of the triple
         * @param order sort weight for the triple; callers pass the negated population
         */
        void add(String language, String script, String region, Double order) {
            if (SHOW_ADD && language.equals("mis")) {
                System.out.println(language + "\t" + script + "\t" + region + "\t" + -order);
            }
            languages.put(language, Row.of(order, script, region));
            // addCounter(languagesToScripts, language, script, order);
            // addCounter(languagesToRegions, language, region, order);

            scripts.put(script, Row.of(order, language, region));
            // addCounter(scriptsToLanguages, script, language, order);
            // addCounter(scriptsToRegions, script, region, order);

            regions.put(region, Row.of(order, language, script));
            // addCounter(regionsToLanguages, region, language, order);
            // addCounter(regionsToScripts, region, script, order);

            languageScripts.put(Row.of(language, script), Row.of(order, region));
            scriptRegions.put(Row.of(script, region), Row.of(order, language));
            languageRegions.put(Row.of(language, region), Row.of(order, script));

            Set<String> containerSet = Containment.leafToContainer(region);
            if (containerSet != null) {
                for (String container : containerSet) {

                    containersToLangRegion.put(container, Row.of(order, language, script, region));
                    Counter<R2<String, String>> data = containersToLanguage.get(container);
                    if (data == null) {
                        containersToLanguage.put(container, data = new Counter<>());
                    }
                    // the Counter takes a long, so the fractional part of order is truncated here
                    data.add(Row.of(language, script), (long) (double) order);

                }
            }

            if (SHOW_ADD) System.out.println("Data:\t" + language + "\t" + script + "\t" + region + "\t" + order);
        }
        //        private void addCounter(Map<String, Counter<String>> map, String key, String key2, Double count) {
        //            Counter<String> counter = map.get(key);
        //            if (counter == null) {
        //                map.put(key, counter = new Counter<String>());
        //            }
        //            counter.add(key2, count.longValue());
        //        }
    }

    // Thresholds applied to languages whose official status in a territory is unknown.
    private static final double MIN_UNOFFICIAL_LANGUAGE_SIZE = 10000000;
    private static final double MIN_UNOFFICIAL_LANGUAGE_PROPORTION = 0.20;
    private static final double MIN_UNOFFICIAL_CLDR_LANGUAGE_SIZE = 100000;
    // Factor applied to the order value of unknown-status languages (shrinks their weight).
    private static final double UNOFFICIAL_SCALE_DOWN = 0.2;

    private static NumberFormat percent = NumberFormat.getPercentInstance();
    private static NumberFormat number = NumberFormat.getIntegerInstance();

    /**
     * Fills {@code toMaximized} with likely-subtag mappings derived from territory/language
     * population data, then layers on fixed additions, derived scripts, language aliases,
     * script metadata fallbacks and explicit overrides, and finally drops mappings that are
     * not idempotent.
     *
     * The order of the steps matters: most steps add with {@code REPLACE_EXISTING}, so a later
     * step overrides an earlier one for the same key.
     *
     * @param toMaximized map from a partial locale ID to its maximized form; modified in place
     */
    private static void tryDifferentAlgorithm(Map<String, String> toMaximized) {
        // we are going to try a different approach.
        // first gather counts for maximized values
        // Set<Row.R3<String,String,String>,Double> rowsToCounts = new TreeMap();
        MaxData maxData = new MaxData();
        Set<String> cldrLocales = factory.getAvailable();
        // starts as all territory codes; those with population data are removed in the loop below
        Set<String> otherTerritories = new TreeSet<>(standardCodes.getGoodAvailableCodes("territory"));

        // process all the information to get the top values for each triple.
        // each of the combinations of 1 or 2 components gets to be a key.
        for (String region : supplementalData.getTerritoriesWithPopulationData()) {
            otherTerritories.remove(region);
            PopulationData regionData = supplementalData.getPopulationDataForTerritory(region);
            final double literateTerritoryPopulation = regionData.getLiteratePopulation();
            // we need any unofficial language to meet a certain absolute size requirement and proportion size
            // requirement.
            // so the bar is x percent of the population, reset up to y absolute size.
            double minimalLiteratePopulation = literateTerritoryPopulation * MIN_UNOFFICIAL_LANGUAGE_PROPORTION;
            if (minimalLiteratePopulation < MIN_UNOFFICIAL_LANGUAGE_SIZE) {
                minimalLiteratePopulation = MIN_UNOFFICIAL_LANGUAGE_SIZE;
            }

            for (String writtenLanguage : supplementalData.getLanguagesForTerritoryWithPopulationData(region)) {
                PopulationData data = supplementalData.getLanguageAndTerritoryPopulationData(writtenLanguage, region);
                final double literatePopulation = getWritingPopulation(data); //data.getLiteratePopulation();
                double order = -literatePopulation; // negative so we get the inverse order

                if (data.getOfficialStatus() == OfficialStatus.unknown) {
                    final String locale = writtenLanguage + "_" + region;
                    // All three branches below are currently empty (the "continue" is commented out), so
                    // every unknown-status language is retained and only scaled down afterwards.
                    if (literatePopulation >= minimalLiteratePopulation) {
                        // ok, skip
                    } else if (literatePopulation >= MIN_UNOFFICIAL_CLDR_LANGUAGE_SIZE && cldrLocales.contains(locale)) {
                        // ok, skip
                    } else {
                        // if (SHOW_ADD)
                        // System.out.println("Skipping:\t" + writtenLanguage + "\t" + region + "\t"
                        // + english.getName(locale)
                        // + "\t-- too small:\t" + number.format(literatePopulation));
                        // continue;
                    }
                    order *= UNOFFICIAL_SCALE_DOWN;
                    if (SHOW_ADD)
                        System.out.println("Retaining\t" + writtenLanguage + "\t" + region + "\t"
                            + english.getName(locale)
                            + "\t" + number.format(literatePopulation)
                            + "\t" + percent.format(literatePopulation / literateTerritoryPopulation)
                            + (cldrLocales.contains(locale) ? "\tin-CLDR" : ""));
                }
                // writtenLanguage may be "lang" or "lang_Script"; split it, or look the script up
                String script;
                String language = writtenLanguage;
                final int pos = writtenLanguage.indexOf('_');
                if (pos > 0) {
                    language = writtenLanguage.substring(0, pos);
                    script = writtenLanguage.substring(pos + 1);
                } else {
                    script = getScriptForLocale2(language);
                }
                maxData.add(language, script, region, order);
            }
        }

        LanguageTagParser additionLtp = new LanguageTagParser();

        // fixed additions are only used for languages that have no population-derived data;
        // they are added with order 1.0, whereas population rows have negative order
        for (String addition : MAX_ADDITIONS) {
            additionLtp.set(addition);
            String lan = additionLtp.getLanguage();
            Set<R3<Double, String, String>> key = maxData.languages.get(lan);
            if (key == null) {
                maxData.add(lan, additionLtp.getScript(), additionLtp.getRegion(), 1.0);
            } else {
                int debug = 0; // no-op; breakpoint anchor
            }
        }

        // languages with exactly one derived script get that script with a placeholder region
        for (Entry<String, Collection<String>> entry : DeriveScripts.getLanguageToScript().asMap().entrySet()) {
            String language = entry.getKey();
            final Collection<String> values = entry.getValue();
            if (values.size() != 1) {
                continue; // skip, no either way
            }
            Set<R3<Double, String, String>> old = maxData.languages.get(language); // unused
            if (!maxData.languages.containsKey(language)) {
                maxData.add(language, values.iterator().next(), TEMP_UNKNOWN_REGION, 1.0);
            }
        }

        // add others, with English default
        for (String region : otherTerritories) {
            if (region.length() == 3) continue; // FIX ONCE WE ADD REGIONS
            maxData.add("en", "Latn", region, 1.0);
        }

        // get a reverse mapping, so that we can add the aliases

        Map<String, R2<List<String>, String>> languageAliases = SupplementalDataInfo.getInstance().getLocaleAliasInfo()
            .get("language");
        for (Entry<String, R2<List<String>, String>> str : languageAliases.entrySet()) {
            String reason = str.getValue().get1();
            if ("overlong".equals(reason) || "bibliographic".equals(reason) || "macrolanguage".equals(reason)) {
                continue;
            }
            List<String> replacements = str.getValue().get0();
            if (replacements == null) {
                continue;
            }
            // NOTE(review): assumes a non-null replacement list is never empty -- confirm
            String goodLanguage = replacements.get(0);

            String badLanguage = str.getKey();
            if (badLanguage.contains("_")) {
                continue;
            }
            if (deprecatedISONotInLST.contains(badLanguage)) {
                continue;
            }
            Set<R3<Double, String, String>> goodLanguageData = maxData.languages.getAll(goodLanguage);
            if (goodLanguageData == null) {
                continue;
            }
            // the alias inherits the script and region of the replacement's first row
            R3<Double, String, String> value = goodLanguageData.iterator().next();
            final String script = value.get1();
            final String region = value.get2();
            maxData.add(badLanguage, script, region, 1.0);
            System.out.println("Adding aliases: " + badLanguage + ", " + script + ", " + region + ", " + reason);
        }

        // now, get the best for each one
        // (each loop takes the first row of the sorted set for its key)
        for (String language : maxData.languages.keySet()) {
            R3<Double, String, String> value = maxData.languages.getAll(language).iterator().next();
            final Comparable<String> script = value.get1();
            final Comparable<String> region = value.get2();
            add(language, language + "_" + script + "_" + region, toMaximized, "L->SR", LocaleOverride.REPLACE_EXISTING,
                SHOW_ADD);
        }
        // NOTE(review): the *To* Counter maps iterated in the six keyset loops of this section look empty,
        // because the addCounter calls in MaxData.add are commented out; these loops appear to be no-ops -- confirm.
        for (String language : maxData.languagesToScripts.keySet()) {
            String script = maxData.languagesToScripts.get(language).getKeysetSortedByCount(true).iterator().next();
            add(language, language + "_" + script, toMaximized, "L->S", LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
        }
        for (String language : maxData.languagesToRegions.keySet()) {
            String region = maxData.languagesToRegions.get(language).getKeysetSortedByCount(true).iterator().next();
            add(language, language + "_" + region, toMaximized, "L->R", LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
        }

        for (String script : maxData.scripts.keySet()) {
            R3<Double, String, String> value = maxData.scripts.getAll(script).iterator().next();
            final Comparable<String> language = value.get1();
            final Comparable<String> region = value.get2();
            add("und_" + script, language + "_" + script + "_" + region, toMaximized, "S->LR",
                LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
        }
        for (String script : maxData.scriptsToLanguages.keySet()) {
            String language = maxData.scriptsToLanguages.get(script).getKeysetSortedByCount(true).iterator().next();
            add("und_" + script, language + "_" + script, toMaximized, "S->L", LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
        }
        for (String script : maxData.scriptsToRegions.keySet()) {
            String region = maxData.scriptsToRegions.get(script).getKeysetSortedByCount(true).iterator().next();
            add("und_" + script, "und_" + script + "_" + region, toMaximized, "S->R", LocaleOverride.REPLACE_EXISTING,
                SHOW_ADD);
        }

        for (String region : maxData.regions.keySet()) {
            R3<Double, String, String> value = maxData.regions.getAll(region).iterator().next();
            final Comparable<String> language = value.get1();
            final Comparable<String> script = value.get2();
            add("und_" + region, language + "_" + script + "_" + region, toMaximized, "R->LS",
                LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
        }
        for (String region : maxData.regionsToLanguages.keySet()) {
            String language = maxData.regionsToLanguages.get(region).getKeysetSortedByCount(true).iterator().next();
            add("und_" + region, language + "_" + region, toMaximized, "R->L", LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
        }
        for (String region : maxData.regionsToScripts.keySet()) {
            String script = maxData.regionsToScripts.get(region).getKeysetSortedByCount(true).iterator().next();
            add("und_" + region, "und_" + script + "_" + region, toMaximized, "R->S", LocaleOverride.REPLACE_EXISTING,
                SHOW_ADD);
        }

        // container regions (every container except "001"): pick the top (language, script) by summed order
        for (Entry<String, Counter<R2<String, String>>> containerAndInfo : maxData.containersToLanguage.entrySet()) {
            String region = containerAndInfo.getKey();
            if (region.equals("001")) {
                continue;
            }
            Counter<R2<String, String>> data = containerAndInfo.getValue();
            Set<R2<String, String>> keysetSortedByCount = data.getKeysetSortedByCount(true);
            if (SHOW_CONTAINERS) { // debug
                System.out.println("Container2L:\t" + region + "\t" + shorten(data.getEntrySetSortedByCount(true, null)));
                System.out.println("Container2LR:\t" + region + "\t" + maxData.containersToLangRegion.get(region));
            }
            R2<String, String> value = keysetSortedByCount.iterator().next(); // will get most negative
            final Comparable<String> language = value.get0();
            final Comparable<String> script = value.get1();

            // fix special cases like es-419, where a locale exists.
            // for those cases, what we add as output is the container. Otherwise the region.
            Set<String> skipLanguages = cldrContainerToLanguages.get(region);
            if (skipLanguages != null
                && skipLanguages.contains(language)) {
                add("und_" + region, language + "_" + script + "_" + region, toMaximized, "R*->LS",
                    LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
                continue;
            }

            // we now have the best language and script. Find the best region for that
            for (R4<Double, String, String, String> e : maxData.containersToLangRegion.get(region)) {
                final Comparable<String> language2 = e.get1();
                final Comparable<String> script2 = e.get2();
                if (language2.equals(language) && script2.equals(script)) {
                    add("und_" + region, language + "_" + script + "_" + e.get3(), toMaximized, "R*->LS",
                        LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
                    break;
                }
            }
        }

        // pairs of known components => the missing third component
        for (R2<String, String> languageScript : maxData.languageScripts.keySet()) {
            R2<Double, String> value = maxData.languageScripts.getAll(languageScript).iterator().next();
            final Comparable<String> language = languageScript.get0();
            final Comparable<String> script = languageScript.get1();
            final Comparable<String> region = value.get1();
            add(language + "_" + script, language + "_" + script + "_" + region, toMaximized, "LS->R",
                LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
        }

        for (R2<String, String> scriptRegion : maxData.scriptRegions.keySet()) {
            R2<Double, String> value = maxData.scriptRegions.getAll(scriptRegion).iterator().next();
            final Comparable<String> script = scriptRegion.get0();
            final Comparable<String> region = scriptRegion.get1();
            final Comparable<String> language = value.get1();
            add("und_" + script + "_" + region, language + "_" + script + "_" + region, toMaximized, "SR->L",
                LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
        }

        for (R2<String, String> languageRegion : maxData.languageRegions.keySet()) {
            R2<Double, String> value = maxData.languageRegions.getAll(languageRegion).iterator().next();
            final Comparable<String> language = languageRegion.get0();
            final Comparable<String> region = languageRegion.get1();
            final Comparable<String> script = value.get1();
            add(language + "_" + region, language + "_" + script + "_" + region, toMaximized, "LR->S",
                LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
        }

        // get the script info from metadata as fallback
        // (KEEP_EXISTING: these never replace a mapping produced above)


        TreeSet<String> sorted = new TreeSet<>(ScriptMetadata.getScripts());
        for (String script : sorted) {
            Info i = ScriptMetadata.getInfo(script);
            String likelyLanguage = i.likelyLanguage;
            if (LANGUAGE_CODE_TO_STATUS.get(likelyLanguage) == Status.special) {
                likelyLanguage = "und";
            }
            String originCountry = i.originCountry;
            final String result = likelyLanguage + "_" + script + "_" + originCountry;
            add("und_" + script, result, toMaximized, "S->LR•",
                LocaleOverride.KEEP_EXISTING, SHOW_ADD);
            add(likelyLanguage, result, toMaximized, "L->SR•",
                LocaleOverride.KEEP_EXISTING, SHOW_ADD);
        }

        // add overrides
        for (String key : LANGUAGE_OVERRIDES.keySet()) {
            add(key, LANGUAGE_OVERRIDES.get(key), toMaximized, "OVERRIDE", LocaleOverride.REPLACE_EXISTING, true);
        }

        // Make sure that the mapping is Idempotent. If we have A ==> B, we must never have B ==> C
        // We run this check until we get no problems.
1242 Set<List<String>> problems = new HashSet<>(); 1243 1244 while (true) { 1245 problems.clear(); 1246 for (Entry<String, String> entry : toMaximized.entrySet()) { 1247 String source = entry.getKey(); 1248 String target = entry.getValue(); 1249 if (target.contains("_Zzzz") || target.contains("_ZZ")) { // these are special cases 1250 continue; 1251 } 1252 String idempotentCandidate = LikelySubtags.maximize(target, toMaximized); 1253 1254 if (idempotentCandidate == null) { 1255 System.out.println("Can't maximize " + target); 1256 } else if (!idempotentCandidate.equals(target)) { 1257 problems.add(ImmutableList.of(source, target, idempotentCandidate)); 1258 } 1259 } 1260 if (problems.isEmpty()) { 1261 break; 1262 } 1263 for (List<String> row : problems) { 1264 System.out.println("Idempotence: dropping mapping " + row.get(0) + " to " + row.get(1) + " since the target maps further to " + row.get(2)); 1265 toMaximized.remove(row.get(0)); 1266 } 1267 } 1268 } 1269 shorten(Object data)1270 public static String shorten(Object data) { 1271 String info = data.toString(); 1272 if (info.length() > 255) { 1273 info = info.substring(0, 127) + "…"; 1274 } 1275 return info; 1276 } 1277 doAlt(Map<String, String> toMaximized)1278 private static void doAlt(Map<String, String> toMaximized) { 1279 // TODO Auto-generated method stub 1280 Map<String, String> temp = new TreeMap<>(); 1281 for (String locale : toMaximized.keySet()) { 1282 String target = toMaximized.get(locale); 1283 temp.put(toAlt(locale, true), toAlt(target, true)); 1284 } 1285 toMaximized.clear(); 1286 toMaximized.putAll(temp); 1287 } 1288 maximize(String languageTag, Map<String, String> toMaximized)1289 public static String maximize(String languageTag, Map<String, String> toMaximized) { 1290 LanguageTagParser ltp = new LanguageTagParser(); 1291 1292 // clean up the input by removing Zzzz, ZZ, and changing "" into und. 
1293 ltp.set(languageTag); 1294 String language = ltp.getLanguage(); 1295 String region = ltp.getRegion(); 1296 String script = ltp.getScript(); 1297 boolean changed = false; 1298 if (language.equals("")) { 1299 ltp.setLanguage(language = "und"); 1300 changed = true; 1301 } 1302 if (region.equals(UNKNOWN_SCRIPT)) { 1303 ltp.setScript(script = ""); 1304 changed = true; 1305 } 1306 if (ltp.getRegion().equals(UNKNOWN_REGION)) { 1307 ltp.setRegion(region = ""); 1308 changed = true; 1309 } 1310 if (changed) { 1311 languageTag = ltp.toString(); 1312 } 1313 // check whole 1314 String result = toMaximized.get(languageTag); 1315 if (result != null) { 1316 return result; 1317 } 1318 // try empty region 1319 if (region.length() != 0) { 1320 result = toMaximized.get(ltp.setRegion("").toString()); 1321 if (result != null) { 1322 return ltp.set(result).setRegion(region).toString(); 1323 } 1324 ltp.setRegion(region); // restore 1325 } 1326 // try empty script 1327 if (script.length() != 0) { 1328 result = toMaximized.get(ltp.setScript("").toString()); 1329 if (result != null) { 1330 return ltp.set(result).setScript(script).toString(); 1331 } 1332 // try empty script and region 1333 if (region.length() != 0) { 1334 result = toMaximized.get(ltp.setRegion("").toString()); 1335 if (result != null) { 1336 return ltp.set(result).setScript(script).setRegion(region).toString(); 1337 } 1338 } 1339 } 1340 if (!language.equals("und") && script.length() != 0 && region.length() != 0) { 1341 return languageTag; // it was ok, and we couldn't do anything with it 1342 } 1343 return null; // couldn't maximize 1344 } 1345 minimize(String input, Map<String, String> toMaximized, boolean favorRegion)1346 public static String minimize(String input, Map<String, String> toMaximized, boolean favorRegion) { 1347 if (input.equals("nb_Latn_SJ")) { 1348 System.out.print(""); // debug 1349 } 1350 String maximized = maximize(input, toMaximized); 1351 if (maximized == null) { 1352 return null; // failed 1353 } 
1354 LanguageTagParser ltp = new LanguageTagParser().set(maximized); 1355 String language = ltp.getLanguage(); 1356 String region = ltp.getRegion(); 1357 String script = ltp.getScript(); 1358 // try building up from shorter to longer, and find the first that matches 1359 // could be more optimized, but for this code we want simplest 1360 String[] trials = { language, 1361 language + TAG_SEPARATOR + (favorRegion ? region : script), 1362 language + TAG_SEPARATOR + (!favorRegion ? region : script) }; 1363 for (String trial : trials) { 1364 String newMaximized = maximize(trial, toMaximized); 1365 if (maximized.equals(newMaximized)) { 1366 return trial; 1367 } 1368 } 1369 return maximized; 1370 } 1371 1372 // /** 1373 // * Verify that we can map from each language, script, and country to something. 1374 // * @param toMaximized 1375 // */ 1376 // private static void checkConsistency(Map<String, String> toMaximized) { 1377 // Map<String,String> needMappings = new TreeMap(); 1378 // LanguageTagParser parser = new LanguageTagParser(); 1379 // for (String maximized : new TreeSet<String>(toMaximized.values())) { 1380 // parser.set(maximized); 1381 // final String language = parser.getLanguage(); 1382 // final String script = parser.getScript(); 1383 // final String region = parser.getRegion(); 1384 // if (language.length() == 0 || script.length() == 0 || region.length() == 0) { 1385 // failure(" { \"" + maximized + "\", \"" + maximized + "\" }, // " + english.getName(maximized) + 1386 // "\t\tFailed-Consistency"); 1387 // continue; 1388 // } 1389 // addIfNotIn(language, maximized, needMappings, toMaximized, "Consistency"); 1390 // addIfNotIn(language + "_" + script, maximized, needMappings, toMaximized, "Consistency"); 1391 // addIfNotIn(language + "_" + region, maximized, needMappings, toMaximized, "Consistency"); 1392 // addIfNotIn("und_" + script, maximized, needMappings, toMaximized, "Consistency"); 1393 // addIfNotIn("und_" + script + "_" + region, maximized, 
needMappings, toMaximized, "Consistency"); 1394 // addIfNotIn("und_" + region, maximized, needMappings, toMaximized, "Consistency"); 1395 // } 1396 // toMaximized.putAll(needMappings); 1397 // } 1398 1399 // private static void failure(String string) { 1400 // System.out.println(string); 1401 // errorCount++; 1402 // } 1403 1404 // private static void addIfNotIn(String key, String value, Map<String, String> toAdd, Map<String, String> 1405 // otherToCheck, String kind) { 1406 // addIfNotIn(key, value, toAdd, otherToCheck == null ? null : otherToCheck.keySet(), null, kind); 1407 // } 1408 1409 // private static void addIfNotIn(String key, String value, Map<String, String> toAdd, Set<String> skipKey, 1410 // Set<String> skipValue, String kind) { 1411 // if (!key.equals(value) 1412 // && !toAdd.containsKey(key) 1413 // && (skipKey == null || !skipKey.contains(key)) 1414 // && (skipValue == null || !skipValue.contains(value))) { 1415 // add(key, value, toAdd, kind); 1416 // } 1417 // } 1418 1419 enum LocaleOverride { 1420 KEEP_EXISTING, REPLACE_EXISTING 1421 } 1422 add(String key, String value, Map<String, String> toAdd, String kind, LocaleOverride override, boolean showAction)1423 private static void add(String key, String value, Map<String, String> toAdd, String kind, LocaleOverride override, 1424 boolean showAction) { 1425 if (SHOW_ADD && key.startsWith("mis")) { 1426 int debug = 1; 1427 } 1428 if (key.equals(DEBUG_ADD_KEY)) { 1429 System.out.println("*debug*"); 1430 } 1431 String oldValue = toAdd.get(key); 1432 if (oldValue == null) { 1433 if (showAction) { 1434 System.out.println("\tAdding:\t\t" + getName(key) + "\t=>\t" + getName(value) + "\t\t\t\t" + kind); 1435 } 1436 } else if (override == LocaleOverride.KEEP_EXISTING || value.equals(oldValue)) { 1437 // if (showAction) { 1438 // System.out.println("Skipping:\t" + key + "\t=>\t" + value + "\t\t\t\t" + kind); 1439 // } 1440 return; 1441 } else { 1442 if (showAction) { 1443 System.out.println("\tReplacing:\t" + 
getName(key) + "\t=>\t" + getName(value) + "\t, was\t" + getName(oldValue) + "\t\t" + kind); 1444 } 1445 } 1446 toAdd.put(key, value); 1447 } 1448 getName(String value)1449 private static String getName(String value) { 1450 return ConvertLanguageData.getLanguageCodeAndName(value); 1451 } 1452 printLikelySubtags(Map<String, String> fluffup)1453 private static File printLikelySubtags(Map<String, String> fluffup) throws IOException { 1454 final File genDir = new File(CLDRPaths.GEN_DIRECTORY, "supplemental"); 1455 final File genFile = new File(genDir, "likelySubtags" + (OUTPUT_STYLE == OutputStyle.XML ? ".xml" : ".txt")); 1456 System.out.println("Writing to " + genFile); 1457 1458 try(PrintWriter out = FileUtilities.openUTF8Writer(genFile)) { 1459 String spacing = OUTPUT_STYLE == OutputStyle.PLAINTEXT ? "\t" : " "; 1460 String header = OUTPUT_STYLE != OutputStyle.XML ? "const MapToMaximalSubtags default_subtags[] = {" 1461 : "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + CldrUtility.LINE_SEPARATOR 1462 + "<!DOCTYPE supplementalData SYSTEM \"../../common/dtd/ldmlSupplemental.dtd\">" 1463 + CldrUtility.LINE_SEPARATOR 1464 + "<!--" 1465 + CldrUtility.LINE_SEPARATOR 1466 + CldrUtility.getCopyrightString() 1467 + CldrUtility.LINE_SEPARATOR 1468 + "-->" 1469 + CldrUtility.LINE_SEPARATOR 1470 + "<!--" 1471 + CldrUtility.LINE_SEPARATOR 1472 + "Likely subtags data is generated programatically from CLDR's language/territory/population" + CldrUtility.LINE_SEPARATOR 1473 + "data using the GenerateMaximalLocales tool. Under normal circumstances, this file should" + CldrUtility.LINE_SEPARATOR 1474 + "not be patched by hand, as any changes made in that fashion may be lost." 1475 + CldrUtility.LINE_SEPARATOR 1476 + "-->" 1477 + CldrUtility.LINE_SEPARATOR 1478 + "<supplementalData>" + CldrUtility.LINE_SEPARATOR 1479 + " <version number=\"$" + 1480 "Revision$\"/>" + CldrUtility.LINE_SEPARATOR 1481 + " <likelySubtags>"; 1482 String footer = OUTPUT_STYLE != OutputStyle.XML ? 
SEPARATOR + "};" 1483 : " </likelySubtags>" + CldrUtility.LINE_SEPARATOR 1484 + "</supplementalData>"; 1485 out.println(header); 1486 boolean first = true; 1487 Set<String> keys = new TreeSet<>(new LocaleStringComparator()); 1488 keys.addAll(fluffup.keySet()); 1489 for (String printingLocale : keys) { 1490 String printingTarget = fluffup.get(printingLocale); 1491 String comment = printingName(printingLocale, spacing) + spacing + "=>" + spacing 1492 + printingName(printingTarget, spacing); 1493 1494 if (OUTPUT_STYLE == OutputStyle.XML) { 1495 out.println("\t\t<likelySubtag from=\"" + printingLocale + 1496 "\" to=\"" + printingTarget + "\"" + 1497 "/>" + CldrUtility.LINE_SEPARATOR + "\t\t" + "<!--" + comment + "-->"); 1498 } else { 1499 if (first) { 1500 first = false; 1501 } else { 1502 out.print(","); 1503 } 1504 if (comment.length() > 70 && SEPARATOR.equals(CldrUtility.LINE_SEPARATOR)) { 1505 comment = printingName(printingLocale, spacing) + SEPARATOR + " // " + spacing + "=>" + spacing 1506 + printingName(printingTarget, spacing); 1507 } 1508 out.print( 1509 " {" 1510 + SEPARATOR + " // " + comment 1511 + SEPARATOR + " \"" + printingLocale + "\"," 1512 + SEPARATOR + " \"" + printingTarget + "\"" 1513 + CldrUtility.LINE_SEPARATOR + " }"); 1514 } 1515 } 1516 out.println(footer); 1517 out.close(); 1518 } 1519 return genFile; 1520 } 1521 printingName(String locale, String spacing)1522 public static String printingName(String locale, String spacing) { 1523 if (locale == null) { 1524 return null; 1525 } 1526 LanguageTagParser parser = new LanguageTagParser().set(locale); 1527 String lang = parser.getLanguage(); 1528 String script = parser.getScript(); 1529 String region = parser.getRegion(); 1530 return "{" + spacing + 1531 (lang.equals("und") ? "?" : english.getName(CLDRFile.LANGUAGE_NAME, lang)) + ";" + spacing + 1532 (script == null || script.equals("") ? "?" 
: english.getName(CLDRFile.SCRIPT_NAME, script)) + ";" + spacing 1533 + 1534 (region == null || region.equals("") ? "?" : english.getName(CLDRFile.TERRITORY_NAME, region)) + spacing 1535 + "}"; 1536 } 1537 1538 private static final String[][] ALT_REVERSAL = { 1539 //{ "no", "nb" }, 1540 //{ "nb", "no" }, 1541 { "he", "iw" }, 1542 { "iw", "he" }, 1543 }; 1544 toAlt(String locale, boolean change)1545 public static String toAlt(String locale, boolean change) { 1546 if (!change || locale == null) { 1547 return locale; 1548 } 1549 String firstTag = getFirstTag(locale); 1550 for (String[] pair : ALT_REVERSAL) { 1551 if (firstTag.equals(pair[0])) { 1552 locale = pair[1] + locale.substring(pair[1].length()); 1553 break; 1554 } 1555 } 1556 locale = locale.replace("_", "-"); 1557 return locale; 1558 } 1559 getFirstTag(String locale)1560 private static String getFirstTag(String locale) { 1561 int pos = locale.indexOf('_'); 1562 return pos < 0 ? locale : locale.substring(0, pos); 1563 } 1564 1565 // private static Map<String, String> getBackMapping(Map<String, String> fluffup) { 1566 // Relation<String,String> backMap = new Relation(new TreeMap(), TreeSet.class, BEST_LANGUAGE_COMPARATOR); 1567 // for (String source : fluffup.keySet()) { 1568 // if (source.startsWith("und")) { 1569 // continue; 1570 // } 1571 // String maximized = fluffup.get(source); 1572 // backMap.put(maximized, source); // put in right order 1573 // } 1574 // Map<String,String> returnBackMap = new TreeMap(); 1575 // for (String maximized : backMap.keySet()) { 1576 // final Set<String> all = backMap.getAll(maximized); 1577 // final String minimized = all.iterator().next(); 1578 // returnBackMap.put(maximized, minimized); 1579 // } 1580 // return returnBackMap; 1581 // } 1582 1583 /** 1584 * Language tags are presumed to share the first language, except possibly "und". 
Best is least 1585 */ 1586 // private static Comparator BEST_LANGUAGE_COMPARATOR = new Comparator<String>() { 1587 // LanguageTagParser p1 = new LanguageTagParser(); 1588 // LanguageTagParser p2 = new LanguageTagParser(); 1589 // public int compare(String o1, String o2) { 1590 // if (o1.equals(o2)) return 0; 1591 // p1.set(o1); 1592 // p2.set(o2); 1593 // String lang1 = p1.getLanguage(); 1594 // String lang2 = p2.getLanguage(); 1595 // 1596 // // compare languages first 1597 // // put und at the end 1598 // int result = lang1.compareTo(lang2); 1599 // if (result != 0) { 1600 // if (lang1.equals("und")) return 1; 1601 // if (lang2.equals("und")) return -1; 1602 // return result; 1603 // } 1604 // 1605 // // now scripts and regions. 1606 // // if they have different numbers of fields, the shorter wins. 1607 // // If there are two fields, region is lowest. 1608 // // The simplest way is to just compare scripts first 1609 // // so zh-TW < zh-Hant, because we first compare "" to Hant 1610 // String script1 = p1.getScript(); 1611 // String script2 = p2.getScript(); 1612 // int scriptOrder = script1.compareTo(script2); 1613 // if (scriptOrder != 0) return scriptOrder; 1614 // 1615 // String region1 = p1.getRegion(); 1616 // String region2 = p2.getRegion(); 1617 // int regionOrder = region1.compareTo(region2); 1618 // if (regionOrder != 0) return regionOrder; 1619 // 1620 // return o1.compareTo(o2); 1621 // } 1622 // 1623 // }; 1624 minimize(Map<String, String> fluffup)1625 public static void minimize(Map<String, String> fluffup) { 1626 LanguageTagParser parser = new LanguageTagParser(); 1627 LanguageTagParser targetParser = new LanguageTagParser(); 1628 Set<String> removals = new TreeSet<>(); 1629 while (true) { 1630 removals.clear(); 1631 for (String locale : fluffup.keySet()) { 1632 String target = fluffup.get(locale); 1633 if (targetParser.set(target).getRegion().equals(UNKNOWN_REGION)) { 1634 removals.add(locale); 1635 if (SHOW_ADD) 1636 
System.out.println("Removing:\t" + getName(locale) + "\t=>\t" + getName(target) 1637 + "\t\t - Unknown Region in target"); 1638 continue; 1639 } 1640 if (targetParser.getScript().equals(UNKNOWN_SCRIPT)) { 1641 removals.add(locale); 1642 if (SHOW_ADD) 1643 System.out.println("Removing:\t" + getName(locale) + "\t=>\t" + getName(target) 1644 + "\t\t - Unknown Script in target"); 1645 continue; 1646 } 1647 1648 String region = parser.set(locale).getRegion(); 1649 if (region.length() != 0) { 1650 if (region.equals(UNKNOWN_REGION)) { 1651 removals.add(locale); 1652 if (SHOW_ADD) 1653 System.out.println("Removing:\t" + getName(locale) + "\t=>\t" + getName(target) 1654 + "\t\t - Unknown Region in source"); 1655 continue; 1656 } 1657 parser.setRegion(""); 1658 String newLocale = parser.toString(); 1659 String newTarget = fluffup.get(newLocale); 1660 if (newTarget != null) { 1661 newTarget = targetParser.set(newTarget).setRegion(region).toString(); 1662 if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) { 1663 removals.add(locale); 1664 if (SHOW_ADD) 1665 System.out.println("Removing:\t" + locale + "\t=>\t" + target + "\t\tRedundant with " 1666 + newLocale); 1667 continue; 1668 } 1669 } 1670 } 1671 String script = parser.set(locale).getScript(); 1672 if (locale.equals(DEBUG_ADD_KEY)) { 1673 System.out.println("*debug*"); 1674 } 1675 if (script.length() != 0) { 1676 if (script.equals(UNKNOWN_SCRIPT)) { 1677 removals.add(locale); 1678 if (SHOW_ADD) 1679 System.out.println("Removing:\t" + locale + "\t=>\t" + target + "\t\t - Unknown Script"); 1680 continue; 1681 } 1682 parser.setScript(""); 1683 String newLocale = parser.toString(); 1684 String newTarget = fluffup.get(newLocale); 1685 if (newTarget != null) { 1686 newTarget = targetParser.set(newTarget).setScript(script).toString(); 1687 if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) { 1688 removals.add(locale); 1689 if (SHOW_ADD) 1690 System.out.println("Removing:\t" + locale + "\t=>\t" + target + 
"\t\tRedundant with " 1691 + newLocale); 1692 continue; 1693 } 1694 } 1695 } 1696 } 1697 if (removals.size() == 0) { 1698 break; 1699 } 1700 for (String locale : removals) { 1701 fluffup.remove(locale); 1702 } 1703 } 1704 } 1705 1706 // private static void addLanguageScript(Map<String, String> fluffup, LanguageTagParser parser) { 1707 // // add script 1708 // Map<String, String> temp = new TreeMap<String, String>(); 1709 // while (true) { 1710 // temp.clear(); 1711 // for (String target : new TreeSet<String>(fluffup.values())) { 1712 // parser.set(target); 1713 // final String territory = parser.getRegion(); 1714 // if (territory.length() == 0) { 1715 // continue; 1716 // } 1717 // parser.setRegion(""); 1718 // String possibleSource = parser.toString(); 1719 // if (fluffup.containsKey(possibleSource)) { 1720 // continue; 1721 // } 1722 // String other = temp.get(possibleSource); 1723 // if (other != null) { 1724 // if (!target.equals(other)) { 1725 // System.out.println("**Failure with multiple sources in addLanguageScript: " 1726 // + possibleSource + "\t=>\t" + target + ", " + other); 1727 // } 1728 // continue; 1729 // } 1730 // temp.put(possibleSource, target); 1731 // if (SHOW_ADD) System.out.println("Adding:\t" + possibleSource + "\t=>\t" + target + "\t\tLanguage-Script"); 1732 // } 1733 // if (temp.size() == 0) { 1734 // break; 1735 // } 1736 // fluffup.putAll(temp); 1737 // } 1738 // 1739 // } 1740 1741 // private static void addLanguageCountry(Map<String, String> fluffup, LanguageTagParser parser) { 1742 // // add script 1743 // Map<String, String> temp = new TreeMap<String, String>(); 1744 // while (true) { 1745 // temp.clear(); 1746 // for (String target : new TreeSet<String>(fluffup.values())) { 1747 // parser.set(target); 1748 // String script = parser.getScript(); 1749 // if (script.length() == 0) { 1750 // continue; 1751 // } 1752 // parser.setScript(""); 1753 // String possibleSource = parser.toString(); 1754 // if 
(fluffup.containsKey(possibleSource)) { 1755 // continue; 1756 // } 1757 // String other = temp.get(possibleSource); 1758 // 1759 // if (other != null) { 1760 // if (!target.equals(other)) { 1761 // script = getScriptForLocale(possibleSource); 1762 // if (script == null) { 1763 // System.out.println("**Failure with multiple sources in addLanguageCountry: " 1764 // + possibleSource + "\t=>\t" + target + ", " + other); 1765 // continue; // error message in routine 1766 // } 1767 // parser.setScript(script); 1768 // target = parser.toString(); 1769 // } 1770 // } 1771 // 1772 // temp.put(possibleSource, target); 1773 // if (SHOW_ADD) System.out.println("Adding:\t" + possibleSource + "\t=>\t" + target + "\t\tLanguageCountry"); 1774 // } 1775 // if (temp.size() == 0) { 1776 // break; 1777 // } 1778 // fluffup.putAll(temp); 1779 // } 1780 // 1781 // } 1782 1783 // private static void addScript(Map<String, String> fluffup, LanguageTagParser parser) { 1784 // // add script 1785 // Map<String, String> temp = new TreeMap<String, String>(); 1786 // while (true) { 1787 // temp.clear(); 1788 // Set skipTarget = fluffup.keySet(); 1789 // for (String locale : fluffup.keySet()) { 1790 // String target = fluffup.get(locale); 1791 // parser.set(target); 1792 // if (parser.getScript().length() != 0) { 1793 // continue; 1794 // } 1795 // String script = getScriptForLocale(target); 1796 // 1797 // if (script == null) { 1798 // continue; // error message in routine 1799 // } 1800 // parser.setScript(script); 1801 // String furtherTarget = parser.toString(); 1802 // addIfNotIn(target, furtherTarget, temp, fluffup, "Script"); 1803 // } 1804 // if (temp.size() == 0) { 1805 // break; 1806 // } 1807 // fluffup.putAll(temp); 1808 // } 1809 // } 1810 1811 // private static String getScriptForLocale(String locale) { 1812 // String result = getScriptForLocale2(locale); 1813 // if (result != null) return result; 1814 // int pos = locale.indexOf('_'); 1815 // if (pos >= 0) { 1816 // result = 
getScriptForLocale2(locale.substring(0,pos)); 1817 // } 1818 // return result; 1819 // } 1820 1821 private static String UNKNOWN_SCRIPT = "Zzzz"; 1822 private static String UNKNOWN_REGION = "ZZ"; 1823 getScriptForLocale2(String locale)1824 private static String getScriptForLocale2(String locale) { 1825 String result = localeToScriptCache.get(locale); 1826 if (result != null) { 1827 return result; 1828 } 1829 if (locale.equals("ky")) { 1830 int debug = 0; 1831 } 1832 try { 1833 Map<Type, BasicLanguageData> data = supplementalData.getBasicLanguageDataMap(locale); 1834 if (data != null) { 1835 for (BasicLanguageData datum : data.values()) { 1836 final Set<String> scripts = datum.getScripts(); 1837 boolean isPrimary = datum.getType() == BasicLanguageData.Type.primary; 1838 if (scripts.size() != 1) { 1839 if (scripts.size() > 1 && isPrimary) { 1840 break; 1841 } 1842 continue; 1843 } 1844 String script = scripts.iterator().next(); 1845 if (isPrimary) { 1846 return result = script; 1847 } else if (result == null) { 1848 result = script; 1849 } 1850 } 1851 if (result != null) { 1852 return result; 1853 } 1854 } 1855 CLDRFile cldrFile; 1856 try { 1857 cldrFile = factory.make(locale, true); 1858 } catch (RuntimeException e) { 1859 result = FALLBACK_SCRIPTS.get(locale); 1860 if (result == null) { 1861 System.err.println("***Failed to find script in L-S-R or MAX_ADDITIONS for: " + locale + "\t" + english.getName(locale)); 1862 return result = UNKNOWN_SCRIPT; 1863 } else { 1864 return result; 1865 } 1866 } 1867 UnicodeSet exemplars = getExemplarSet(cldrFile, ""); 1868 Set<String> CLDRScripts = getScriptsFromUnicodeSet(exemplars); 1869 CLDRScripts.remove(UNKNOWN_SCRIPT); 1870 if (CLDRScripts.size() == 1) { 1871 return result = CLDRScripts.iterator().next(); 1872 } else if (CLDRScripts.size() == 0) { 1873 System.out.println("**Failed to get script for:\t" + locale); 1874 return result = UNKNOWN_SCRIPT; 1875 } else { 1876 System.out.println("**Failed, too many scripts for:\t" + 
locale + ", " + CLDRScripts); 1877 return result = UNKNOWN_SCRIPT; 1878 } 1879 } finally { 1880 if (result.equals(UNKNOWN_SCRIPT)) { 1881 String temp = LANGUAGE_OVERRIDES.get(locale); 1882 if (temp != null) { 1883 result = new LanguageTagParser().set(temp).getScript(); 1884 System.err.println("***Warning, Getting script from LANGUAGE_OVERRIDES for " + locale + " => " + result); 1885 } 1886 } 1887 localeToScriptCache.put(locale, result); 1888 if (SHOW_ADD) 1889 System.out.println("Script:\t" + locale + "\t" + english.getName(locale) + "\t=>\t" + result + "\t" 1890 + english.getName(CLDRFile.SCRIPT_NAME, result)); 1891 } 1892 } 1893 1894 // private static Map<String, String> closeMapping(Map<String, String> fluffup) { 1895 // if (SHOW_ADD) System.out.flush(); 1896 // Map<String,String> temp = new TreeMap<String,String>(); 1897 // while (true) { 1898 // temp.clear(); 1899 // for (String locale : fluffup.keySet()) { 1900 // String target = fluffup.get(locale); 1901 // if (target.equals("si_Sinh") || target.equals("zh-Hani")) { 1902 // System.out.println("????"); 1903 // } 1904 // String furtherTarget = fluffup.get(target); 1905 // if (furtherTarget == null) { 1906 // continue; 1907 // } 1908 // addIfNotIn(locale, furtherTarget, temp, null, "Close"); 1909 // } 1910 // if (temp.size() == 0) { 1911 // break; 1912 // } 1913 // fluffup.putAll(temp); 1914 // } 1915 // if (SHOW_ADD) System.out.flush(); 1916 // return temp; 1917 // } 1918 getScriptsFromUnicodeSet(UnicodeSet exemplars)1919 public static Set<String> getScriptsFromUnicodeSet(UnicodeSet exemplars) { 1920 // use bits first, since that's faster 1921 BitSet scriptBits = new BitSet(); 1922 boolean show = false; 1923 for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next();) { 1924 if (show) 1925 System.out.println(Integer.toHexString(it.codepoint)); 1926 if (it.codepoint != UnicodeSetIterator.IS_STRING) { 1927 scriptBits.set(UScript.getScript(it.codepoint)); 1928 } else { 1929 int cp; 1930 for (int i 
= 0; i < it.string.length(); i += UTF16.getCharCount(cp)) { 1931 scriptBits.set(UScript.getScript(cp = UTF16.charAt(it.string, i))); 1932 } 1933 } 1934 } 1935 scriptBits.clear(UScript.COMMON); 1936 scriptBits.clear(UScript.INHERITED); 1937 Set<String> scripts = new TreeSet<>(); 1938 for (int j = 0; j < scriptBits.size(); ++j) { 1939 if (scriptBits.get(j)) { 1940 scripts.add(UScript.getShortName(j)); 1941 } 1942 } 1943 return scripts; 1944 } 1945 getExemplarSet(CLDRFile cldrfile, String type)1946 public static UnicodeSet getExemplarSet(CLDRFile cldrfile, String type) { 1947 if (type.length() != 0) 1948 type = "[@type=\"" + type + "\"]"; 1949 String v = cldrfile.getStringValue("//ldml/characters/exemplarCharacters" 1950 + type); 1951 if (v == null) 1952 return new UnicodeSet(); 1953 return new UnicodeSet(v); 1954 } 1955 1956 // private static String[][] SpecialCases = { 1957 // { "zh_Hani", "zh_Hans_CN"}, 1958 // { "si_Sinh", "si_Sinh_LK"}, 1959 // { "ii", "ii_CN"}, // Sichuan Yi (Yi) 1960 // { "iu", "iu_CA"}, // Inuktitut (Unified Canadian Aboriginal Syllabics) 1961 // { "und", "en"}, // English default 1962 // }; 1963 showDefaultContentDifferencesAndFix(Set<String> defaultLocaleContent)1964 static void showDefaultContentDifferencesAndFix(Set<String> defaultLocaleContent) { 1965 Set<String> errors = new LinkedHashSet<>(); 1966 Map<String, String> oldDefaultContent = SupplementalDataInfo.makeLocaleToDefaultContents( 1967 ConvertLanguageData.supplementalData.getDefaultContentLocales(), new TreeMap<String, String>(), errors); 1968 if (!errors.isEmpty()) { 1969 System.out.println(Joiner.on("\n").join(errors)); 1970 errors.clear(); 1971 } 1972 Map<String, String> newDefaultContent = SupplementalDataInfo.makeLocaleToDefaultContents(defaultLocaleContent, 1973 new TreeMap<String, String>(), errors); 1974 if (!errors.isEmpty()) { 1975 System.out.println("Default Content errors: " + Joiner.on("\n").join(errors)); 1976 errors.clear(); 1977 } 1978 Set<String> changes = 
compareMapsAndFixNew("*WARNING* Default Content: ", oldDefaultContent, newDefaultContent, 1979 "ar", "ar_001"); 1980 System.out.println(Joiner.on("\n").join(changes)); 1981 defaultLocaleContent.clear(); 1982 defaultLocaleContent.addAll(newDefaultContent.values()); 1983 newDefaultContent = SupplementalDataInfo.makeLocaleToDefaultContents(defaultLocaleContent, 1984 new TreeMap<String, String>(), errors); 1985 if (!errors.isEmpty()) { 1986 System.out.println("***New Errors: " + Joiner.on("\n").join(errors)); 1987 } 1988 } 1989 compareMapsAndFixNew(String title, Map<String, String> oldContent, Map<String, String> newContent, String... allowedOverrideValues)1990 private static Set<String> compareMapsAndFixNew(String title, 1991 Map<String, String> oldContent, 1992 Map<String, String> newContent, String... allowedOverrideValues) { 1993 Map<String, String> allowedOverrideValuesTest = new HashMap<>(); 1994 for (int i = 0; i < allowedOverrideValues.length; i += 2) { 1995 allowedOverrideValuesTest.put(allowedOverrideValues[i], allowedOverrideValues[i + 1]); 1996 } 1997 Set<String> changes = new TreeSet<>(); 1998 for (String parent : Builder.with(new TreeSet<String>()).addAll(newContent.keySet()) 1999 .addAll(oldContent.keySet()).get()) { 2000 String oldValue = oldContent.get(parent); 2001 String newValue = newContent.get(parent); 2002 String overrideValue = allowedOverrideValuesTest.get(parent); 2003 if (overrideValue != null) { 2004 newContent.put(parent, overrideValue); 2005 newValue = overrideValue; 2006 } 2007 if (CldrUtility.equals(oldValue, newValue)) { 2008 continue; 2009 } 2010 String message; 2011 if (oldValue == null) { 2012 message = "Adding " + ConvertLanguageData.getLanguageCodeAndName(parent) + " => " 2013 + ConvertLanguageData.getLanguageCodeAndName(newValue); 2014 newContent.put(parent, newValue); 2015 } else if (newValue == null) { 2016 if (SUPPRESS_CHANGES) { 2017 message = "Suppressing removal of " 2018 + ConvertLanguageData.getLanguageCodeAndName(parent) 
+ " => " 2019 + ConvertLanguageData.getLanguageCodeAndName(oldValue); 2020 newContent.put(parent, oldValue); 2021 } else { 2022 message = "Removing " 2023 + ConvertLanguageData.getLanguageCodeAndName(parent) + " => " 2024 + ConvertLanguageData.getLanguageCodeAndName(oldValue); 2025 newContent.remove(oldValue); 2026 } 2027 } else { 2028 if (SUPPRESS_CHANGES) { 2029 message = "Suppressing change of " 2030 + ConvertLanguageData.getLanguageCodeAndName(parent) + " => " 2031 + ConvertLanguageData.getLanguageCodeAndName(oldValue) + " to " 2032 + ConvertLanguageData.getLanguageCodeAndName(newValue); 2033 newContent.remove(newValue); 2034 newContent.put(parent, oldValue); 2035 } else { 2036 message = "Changing " 2037 + ConvertLanguageData.getLanguageCodeAndName(parent) + " => " 2038 + ConvertLanguageData.getLanguageCodeAndName(oldValue) + " to " 2039 + ConvertLanguageData.getLanguageCodeAndName(newValue); 2040 newContent.remove(oldValue); 2041 newContent.put(parent, newValue); 2042 } 2043 } 2044 changes.add(title + message); 2045 } 2046 return changes; 2047 } 2048 2049 public static class LocaleStringComparator implements Comparator<String> { 2050 LanguageTagParser ltp0 = new LanguageTagParser(); 2051 LanguageTagParser ltp1 = new LanguageTagParser(); 2052 2053 @Override compare(String arg0, String arg1)2054 public int compare(String arg0, String arg1) { 2055 ltp0.set(arg0); 2056 ltp1.set(arg1); 2057 String s0 = ltp0.getLanguage(); 2058 String s1 = ltp1.getLanguage(); 2059 int result = s0.compareTo(s1); 2060 if (result != 0) { 2061 return s0.equals("und") ? 1 2062 : s1.equals("und") ? -1 2063 : result; 2064 } 2065 s0 = ltp0.getScript(); 2066 s1 = ltp1.getScript(); 2067 result = s0.compareTo(s1); 2068 if (result != 0) { 2069 return result; 2070 } 2071 s0 = ltp0.getRegion(); 2072 s1 = ltp1.getRegion(); 2073 result = s0.compareTo(s1); 2074 if (result != 0) { 2075 return result; 2076 } 2077 return arg0.compareTo(arg1); // just in case 2078 } 2079 2080 } 2081 } 2082