1 package org.unicode.cldr.tool; 2 3 import java.io.BufferedReader; 4 import java.io.File; 5 import java.io.IOException; 6 import java.io.PrintWriter; 7 import java.util.Arrays; 8 import java.util.BitSet; 9 import java.util.Collection; 10 import java.util.Comparator; 11 import java.util.HashMap; 12 import java.util.HashSet; 13 import java.util.LinkedHashSet; 14 import java.util.List; 15 import java.util.Map; 16 import java.util.Map.Entry; 17 import java.util.Set; 18 import java.util.TreeMap; 19 import java.util.TreeSet; 20 21 import org.unicode.cldr.draft.FileUtilities; 22 import org.unicode.cldr.draft.ScriptMetadata; 23 import org.unicode.cldr.draft.ScriptMetadata.Info; 24 import org.unicode.cldr.util.Builder; 25 import org.unicode.cldr.util.CLDRConfig; 26 import org.unicode.cldr.util.CLDRFile; 27 import org.unicode.cldr.util.CLDRLocale; 28 import org.unicode.cldr.util.CLDRPaths; 29 import org.unicode.cldr.util.CldrUtility; 30 import org.unicode.cldr.util.Containment; 31 import org.unicode.cldr.util.Counter; 32 import org.unicode.cldr.util.Factory; 33 import org.unicode.cldr.util.Iso639Data; 34 import org.unicode.cldr.util.Iso639Data.Scope; 35 import org.unicode.cldr.util.LanguageTagParser; 36 import org.unicode.cldr.util.LocaleIDParser; 37 import org.unicode.cldr.util.Log; 38 import org.unicode.cldr.util.Organization; 39 import org.unicode.cldr.util.PatternCache; 40 import org.unicode.cldr.util.SimpleFactory; 41 import org.unicode.cldr.util.StandardCodes; 42 import org.unicode.cldr.util.StandardCodes.LstrType; 43 import org.unicode.cldr.util.SupplementalDataInfo; 44 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData; 45 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData.Type; 46 import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus; 47 import org.unicode.cldr.util.SupplementalDataInfo.PopulationData; 48 import org.unicode.cldr.util.Validity; 49 import org.unicode.cldr.util.Validity.Status; 50 51 import com.google.common.base.Joiner; 52 import com.google.common.collect.ImmutableList; 53 import com.google.common.collect.ImmutableMap; 54 import com.google.common.collect.ImmutableSet; 55 import com.ibm.icu.impl.Relation; 56 import com.ibm.icu.impl.Row; 57 import com.ibm.icu.impl.Row.R2; 58 import com.ibm.icu.impl.Row.R3; 59 import com.ibm.icu.impl.Row.R4; 60 import com.ibm.icu.lang.UScript; 61 import com.ibm.icu.text.Collator; 62 import com.ibm.icu.text.NumberFormat; 63 import com.ibm.icu.text.UTF16; 64 import com.ibm.icu.text.UnicodeSet; 65 import com.ibm.icu.text.UnicodeSetIterator; 66 import com.ibm.icu.util.ULocale; 67 68 /** 69 * Problems: 70 * "und_Hani", "zh_Hani" 71 * "und_Sinh", "si_Sinh" 72 * 73 * @author markdavis 74 * 75 */ 76 public class GenerateMaximalLocales { 77 78 private static final Map<String, Status> LANGUAGE_CODE_TO_STATUS = Validity.getInstance().getCodeToStatus(LstrType.language); 79 80 private static final String TEMP_UNKNOWN_REGION = "XZ"; 81 82 private static final String DEBUG_ADD_KEY = "und_Latn_ZA"; 83 84 private static final boolean SHOW_ADD = CldrUtility.getProperty("GenerateMaximalLocalesDebug", false); 85 private static final boolean SUPPRESS_CHANGES = CldrUtility.getProperty("GenerateMaximalLocalesSuppress", false); 86 private static final boolean SHOW_CONTAINERS = false; 87 88 enum OutputStyle { 89 PLAINTEXT, C, C_ALT, XML 90 } 91 92 private static OutputStyle OUTPUT_STYLE = OutputStyle.valueOf(CldrUtility.getProperty("OutputStyle", "XML", "XML") 93 .toUpperCase()); 94 95 // set based on above 96 private static final String SEPARATOR = OUTPUT_STYLE == OutputStyle.C || OUTPUT_STYLE == OutputStyle.C_ALT ? CldrUtility.LINE_SEPARATOR 97 : "\t"; 98 private static final String TAG_SEPARATOR = OUTPUT_STYLE == OutputStyle.C_ALT ? "-" : "_"; 99 // private static final boolean FAVOR_REGION = true; // OUTPUT_STYLE == OutputStyle.C_ALT; 100 101 private static final boolean tryDifferent = true; 102 103 private static final File list[] = { 104 new File(CLDRPaths.MAIN_DIRECTORY), 105 new File(CLDRPaths.SEED_DIRECTORY), 106 new File(CLDRPaths.EXEMPLARS_DIRECTORY) }; 107 108 private static Factory factory = SimpleFactory.make(list, ".*"); 109 private static Factory mainFactory = CLDRConfig.getInstance().getCldrFactory(); 110 private static SupplementalDataInfo supplementalData = SupplementalDataInfo 111 .getInstance(CLDRPaths.SUPPLEMENTAL_DIRECTORY); 112 private static StandardCodes standardCodes = StandardCodes.make(); 113 private static CLDRFile english = factory.make("en", false); 114 static Relation<String, String> cldrContainerToLanguages = Relation.of(new HashMap<String, Set<String>>(), HashSet.class); 115 static { 116 for (CLDRLocale locale : ToolConfig.getToolInstance().getCldrFactory().getAvailableCLDRLocales()) { 117 String region = locale.getCountry(); 118 if (region == null || region.isEmpty() || Containment.isLeaf(region)) { 119 continue; 120 } cldrContainerToLanguages.put(region, locale.getLanguage())121 cldrContainerToLanguages.put(region, locale.getLanguage()); 122 } cldrContainerToLanguages.freeze()123 cldrContainerToLanguages.freeze(); 124 System.out.println("Keep containers " + cldrContainerToLanguages); 125 } 126 127 private static final List<String> KEEP_TARGETS = Arrays.asList("und_Arab_PK", "und_Latn_ET"); 128 private static final ImmutableSet<String> deprecatedISONotInLST = ImmutableSet.of("scc", "scr"); 129 130 /** 131 * This is the simplest way to override, by supplying the max value. 132 * It gets a very low weight, so doesn't override any stronger value. 133 */ 134 private static final String[] MAX_ADDITIONS = new String[] { 135 "bss_Latn_CM", 136 "gez_Ethi_ET", 137 "ken_Latn_CM", 138 "und_Arab_PK", 139 "wa_Latn_BE", 140 141 "fub_Arab_CM", 142 "fuf_Latn_GN", 143 "kby_Arab_NE", 144 "kdh_Latn_TG", 145 "apd_Arab_TG", 146 "zlm_Latn_TG", 147 148 "cr_Cans_CA", 149 "hif_Latn_FJ", 150 "gon_Telu_IN", 151 "lzz_Latn_TR", 152 "lif_Deva_NP", 153 "unx_Beng_IN", 154 "unr_Beng_IN", 155 "ttt_Latn_AZ", 156 "pnt_Grek_GR", 157 "tly_Latn_AZ", 158 "tkr_Latn_AZ", 159 "bsq_Bass_LR", 160 "ccp_Cakm_BD", 161 "blt_Tavt_VN", 162 "rhg_Arab_MM", 163 "rhg_Rohg_MM", 164 165 "no_Latn_NO", 166 "und_Cpmn_CY", 167 }; 168 169 /** 170 * The following overrides MASH the final values, so they may not result in consistent results. Safer is to add to MAX_ADDITIONS. 171 * However, if you add, add both the language and language+script mappings. 172 */ 173 // Many of the overrides below can be removed once the language/pop/country data is updated. 174 private static final Map<String, String> LANGUAGE_OVERRIDES = CldrUtility.asMap(new String[][] { 175 { "cic", "cic_Latn_US" }, 176 { "cic_Latn", "cic_Latn_US" }, 177 { "eo", "eo_Latn_001" }, 178 { "eo_Latn", "eo_Latn_001" }, 179 { "es", "es_Latn_ES" }, 180 { "es_Latn", "es_Latn_ES" }, 181 { "ff_BF", "ff_Latn_BF" }, 182 { "ff_GM", "ff_Latn_GM" }, 183 { "ff_GH", "ff_Latn_GH" }, 184 { "ff_GW", "ff_Latn_GW" }, 185 { "ff_LR", "ff_Latn_LR" }, 186 { "ff_NE", "ff_Latn_NE" }, 187 { "ff_NG", "ff_Latn_NG" }, 188 { "ff_SL", "ff_Latn_SL" }, 189 { "ff_Adlm", "ff_Adlm_GN" }, 190 { "ia", "ia_Latn_001" }, 191 { "ia_Latn", "ia_Latn_001" }, 192 { "io", "io_Latn_001" }, 193 { "io_Latn", "io_Latn_001" }, 194 { "jbo", "jbo_Latn_001" }, 195 { "jbo_Latn", "jbo_Latn_001" }, 196 { "ku_Arab", "ku_Arab_IQ" }, 197 { "lrc", "lrc_Arab_IR" }, 198 { "lrc_Arab", "lrc_Arab_IR" }, 199 { "man", "man_Latn_GM" }, 200 { "man_Latn", "man_Latn_GM" }, 201 { "mas", "mas_Latn_KE" }, 202 { "mas_Latn", "mas_Latn_KE" }, 203 { "mn", "mn_Cyrl_MN" }, 204 { "mn_Cyrl", "mn_Cyrl_MN" }, 205 { "mro", "mro_Mroo_BD" }, 206 { "mro_BD", "mro_Mroo_BD" }, 207 { "ms_Arab", "ms_Arab_MY" }, 208 { "pap", "pap_Latn_AW" }, 209 { "pap_Latn", "pap_Latn_AW" }, 210 { "prg", "prg_Latn_001" }, 211 { "prg_Latn", "prg_Latn_001" }, 212 { "rif", "rif_Tfng_MA" }, 213 { "rif_Latn", "rif_Latn_MA" }, 214 { "rif_Tfng", "rif_Tfng_MA" }, 215 { "rif_MA", "rif_Tfng_MA" }, 216 { "shi", "shi_Tfng_MA" }, 217 { "shi_Tfng", "shi_Tfng_MA" }, 218 { "shi_MA", "shi_Tfng_MA" }, 219 { "sr_Latn", "sr_Latn_RS" }, 220 { "ss", "ss_Latn_ZA" }, 221 { "ss_Latn", "ss_Latn_ZA" }, 222 { "swc", "swc_Latn_CD" }, 223 { "ti", "ti_Ethi_ET" }, 224 { "ti_Ethi", "ti_Ethi_ET" }, 225 { "und", "en_Latn_US" }, 226 { "und_Adlm", "ff_Adlm_GN" }, 227 { "und_Adlm_GN", "ff_Adlm_GN" }, 228 { "und_Arab", "ar_Arab_EG" }, 229 { "und_Arab_PK", "ur_Arab_PK" }, 230 { "und_Bopo", "zh_Bopo_TW" }, 231 { "und_Deva_FJ", "hif_Deva_FJ" }, 232 { "und_EZ", "de_Latn_EZ" }, 233 { "und_Hani", "zh_Hani_CN" }, 234 { "und_Hani_CN", "zh_Hani_CN" }, 235 { "und_Kana", "ja_Kana_JP" }, 236 { "und_Kana_JP", "ja_Kana_JP" }, 237 { "und_Latn", "en_Latn_US" }, 238 { "und_Latn_ET", "en_Latn_ET" }, 239 { "und_Latn_NE", "ha_Latn_NE" }, 240 { "und_Latn_PH", "fil_Latn_PH" }, 241 { "und_ML", "bm_Latn_ML" }, 242 { "und_Latn_ML", "bm_Latn_ML" }, 243 { "und_MU", "mfe_Latn_MU" }, 244 { "und_NE", "ha_Latn_NE" }, 245 { "und_PH", "fil_Latn_PH" }, 246 { "und_PK", "ur_Arab_PK" }, 247 { "und_SO", "so_Latn_SO" }, 248 { "und_SS", "en_Latn_SS" }, 249 { "und_TK", "tkl_Latn_TK" }, 250 { "und_UN", "en_Latn_UN" }, 251 { "und_005", "pt_Latn_BR" }, 252 { "vo", "vo_Latn_001" }, 253 { "vo_Latn", "vo_Latn_001" }, 254 { "yi", "yi_Hebr_001" }, 255 { "yi_Hebr", "yi_Hebr_001" }, 256 { "yue", "yue_Hant_HK" }, 257 { "yue_Hant", "yue_Hant_HK" }, 258 { "yue_Hans", "yue_Hans_CN" }, 259 { "yue_CN", "yue_Hans_CN" }, 260 { "zh_Hani", "zh_Hani_CN" }, 261 262 { "zh_Bopo", "zh_Bopo_TW" }, 263 { "ccp", "ccp_Cakm_BD" }, 264 { "ccp_Cakm", "ccp_Cakm_BD" }, 265 { "und_Cakm", "ccp_Cakm_BD" }, 266 { "cu_Glag", "cu_Glag_BG" }, 267 { "sd_Khoj", "sd_Khoj_IN" }, 268 { "lif_Limb", "lif_Limb_IN" }, 269 { "grc_Linb", "grc_Linb_GR" }, 270 { "arc_Nbat", "arc_Nbat_JO" }, 271 { "arc_Palm", "arc_Palm_SY" }, 272 { "pal_Phlp", "pal_Phlp_CN" }, 273 { "en_Shaw", "en_Shaw_GB" }, 274 { "sd_Sind", "sd_Sind_IN" }, 275 { "und_Brai", "fr_Brai_FR" }, // hack 276 { "und_Hanb", "zh_Hanb_TW" }, // Special script code 277 { "zh_Hanb", "zh_Hanb_TW" }, // Special script code 278 { "und_Jamo", "ko_Jamo_KR" }, // Special script code 279 280 //{"und_Cyrl_PL", "be_Cyrl_PL"}, 281 282 // {"cr", "cr_Cans_CA"}, 283 // {"hif", "hif_Latn_FJ"}, 284 // {"gon", "gon_Telu_IN"}, 285 // {"lzz", "lzz_Latn_TR"}, 286 // {"lif", "lif_Deva_NP"}, 287 // {"unx", "unx_Beng_IN"}, 288 // {"unr", "unr_Beng_IN"}, 289 // {"ttt", "ttt_Latn_AZ"}, 290 // {"pnt", "pnt_Grek_GR"}, 291 // {"tly", "tly_Latn_AZ"}, 292 // {"tkr", "tkr_Latn_AZ"}, 293 // {"bsq", "bsq_Bass_LR"}, 294 // {"ccp", "ccp_Cakm_BD"}, 295 // {"blt", "blt_Tavt_VN"}, 296 // { "mis_Medf", "mis_Medf_NG" }, 297 298 { "ku_Yezi", "ku_Yezi_GE" }, 299 { "und_EU", "en_Latn_IE" }, 300 }); 301 302 /** 303 * The following supplements the suppress-script. It overrides info from exemplars and the locale info. 304 */ 305 private static String[][] SpecialScripts = { 306 { "zh", "Hans" }, // Hans (not Hani) 307 { "yue", "Hant" }, // Hans (not Hani) 308 { "chk", "Latn" }, // Chuukese (Micronesia) 309 { "fil", "Latn" }, // Filipino (Philippines)" 310 { "ko", "Kore" }, // Korean (North Korea) 311 { "ko_KR", "Kore" }, // Korean (North Korea) 312 { "pap", "Latn" }, // Papiamento (Netherlands Antilles) 313 { "pau", "Latn" }, // Palauan (Palau) 314 { "su", "Latn" }, // Sundanese (Indonesia) 315 { "tet", "Latn" }, // Tetum (East Timor) 316 { "tk", "Latn" }, // Turkmen (Turkmenistan) 317 { "ty", "Latn" }, // Tahitian (French Polynesia) 318 { "ja", "Jpan" }, // Special script for japan 319 { "und", "Latn" }, // Ultimate fallback 320 }; 321 322 private static Map<String, String> localeToScriptCache = new TreeMap<>(); 323 static { 324 for (String language : standardCodes.getAvailableCodes("language")) { 325 Map<String, String> info = standardCodes.getLangData("language", language); 326 String script = info.get("Suppress-Script"); 327 if (script != null) { localeToScriptCache.put(language, script)328 localeToScriptCache.put(language, script); 329 } 330 } 331 for (String[] pair : SpecialScripts) { localeToScriptCache.put(pair[0], pair[1])332 localeToScriptCache.put(pair[0], pair[1]); 333 } 334 } 335 336 private static Map<String, String> FALLBACK_SCRIPTS; 337 static { 338 LanguageTagParser additionLtp = new LanguageTagParser(); 339 Map<String, String> _FALLBACK_SCRIPTS = new TreeMap<>(); 340 for (String addition : MAX_ADDITIONS) { 341 additionLtp.set(addition); 342 String lan = additionLtp.getLanguage(); _FALLBACK_SCRIPTS.put(lan, additionLtp.getScript())343 _FALLBACK_SCRIPTS.put(lan, additionLtp.getScript()); 344 } 345 FALLBACK_SCRIPTS = ImmutableMap.copyOf(_FALLBACK_SCRIPTS); 346 } 347 348 private static int errorCount; 349 main(String[] args)350 public static void main(String[] args) throws IOException { 351 352 printDefaultLanguagesAndScripts(); 353 354 Map<String, String> toMaximized = new TreeMap<>(); 355 356 tryDifferentAlgorithm(toMaximized); 357 358 minimize(toMaximized); 359 360 // HACK TEMP_UNKNOWN_REGION 361 // this is to get around the removal of items with ZZ in minimize. 362 // probably cleaner way to do it, but this provides control over just those we want to retain. 363 Set<String> toRemove = new TreeSet<>(); 364 Map<String, String> toFix = new TreeMap<>(); 365 for (Entry<String, String> entry : toMaximized.entrySet()) { 366 String key = entry.getKey(); 367 String value = entry.getValue(); 368 if (key.contains(TEMP_UNKNOWN_REGION)) { 369 toRemove.add(key); 370 } else if (value.contains(TEMP_UNKNOWN_REGION)) { 371 toFix.put(key, value.replace(TEMP_UNKNOWN_REGION, UNKNOWN_REGION)); 372 } 373 } 374 for (String key : toRemove) { 375 toMaximized.remove(key); 376 } 377 toMaximized.putAll(toFix); 378 379 Map<String, String> oldLikely = SupplementalDataInfo.getInstance().getLikelySubtags(); 380 Set<String> changes = compareMapsAndFixNew("*WARNING* Likely Subtags: ", oldLikely, toMaximized, "ms_Arab", 381 "ms_Arab_ID"); 382 System.out.println(Joiner.on("\n").join(changes)); 383 384 if (OUTPUT_STYLE == OutputStyle.C_ALT) { 385 doAlt(toMaximized); 386 } 387 388 if (SHOW_ADD) 389 System.out 390 .println("/*" 391 + CldrUtility.LINE_SEPARATOR 392 + " To Maximize:" 393 + 394 CldrUtility.LINE_SEPARATOR 395 + " If using raw strings, make sure the input language/locale uses the right separator, and has the right casing." 396 + 397 CldrUtility.LINE_SEPARATOR 398 + " Remove the script Zzzz and the region ZZ if they occur; change an empty language subtag to 'und'." 399 + 400 CldrUtility.LINE_SEPARATOR 401 + " Get the language, region, and script from the cleaned-up tag, plus any variants/extensions" 402 + 403 CldrUtility.LINE_SEPARATOR 404 + " Try each of the following in order (where the field exists)" 405 + 406 CldrUtility.LINE_SEPARATOR 407 + " Lookup language-script-region. If in the table, return the result + variants" 408 + 409 CldrUtility.LINE_SEPARATOR 410 + " Lookup language-script. If in the table, return the result (substituting the original region if it exists) + variants" 411 + 412 CldrUtility.LINE_SEPARATOR 413 + " Lookup language-region. If in the table, return the result (substituting the original script if it exists) + variants" 414 + 415 CldrUtility.LINE_SEPARATOR 416 + " Lookup language. If in the table, return the result (substituting the original region and script if either or both exist) + variants" 417 + 418 CldrUtility.LINE_SEPARATOR 419 + 420 CldrUtility.LINE_SEPARATOR 421 + " Example: Input is zh-ZZZZ-SG." 422 + 423 CldrUtility.LINE_SEPARATOR 424 + " Normalize to zh-SG. Lookup in table. No match." 425 + 426 CldrUtility.LINE_SEPARATOR 427 + " Remove SG, but remember it. Lookup zh, and get the match (zh-Hans-CN). Substitute SG, and return zh-Hans-SG." 428 + 429 CldrUtility.LINE_SEPARATOR 430 + 431 CldrUtility.LINE_SEPARATOR 432 + " To Minimize:" 433 + 434 CldrUtility.LINE_SEPARATOR 435 + " First get max = maximize(input)." 436 + 437 CldrUtility.LINE_SEPARATOR 438 + " Then for trial in {language, language-region, language-script}" 439 + 440 CldrUtility.LINE_SEPARATOR 441 + " If maximize(trial) == max, then return trial." 442 + 443 CldrUtility.LINE_SEPARATOR 444 + " If you don't get a match, return max." 445 + 446 CldrUtility.LINE_SEPARATOR 447 + 448 CldrUtility.LINE_SEPARATOR 449 + " Example: Input is zh-Hant. Maximize to get zh-Hant-TW." 450 + 451 CldrUtility.LINE_SEPARATOR 452 + " zh => zh-Hans-CN. No match, so continue." 453 + 454 CldrUtility.LINE_SEPARATOR 455 + " zh-TW => zh-Hans-TW. Match, so return zh-TW." 456 + 457 CldrUtility.LINE_SEPARATOR 458 + 459 CldrUtility.LINE_SEPARATOR 460 + " (A variant of this uses {language, language-script, language-region}): that is, tries script before language." 461 + 462 CldrUtility.LINE_SEPARATOR + " toMaximal size:\t" + toMaximized.size() + 463 CldrUtility.LINE_SEPARATOR + "*/"); 464 465 printLikelySubtags(toMaximized); 466 467 // if (OUTPUT_STYLE != OutputStyle.XML) { 468 // printMap("const MapToMinimalSubtags default_subtags[]", toMinimized, null); 469 // } 470 471 printDefaultContent(toMaximized); 472 473 System.out.println(CldrUtility.LINE_SEPARATOR + "ERRORS:\t" + errorCount + CldrUtility.LINE_SEPARATOR); 474 475 } 476 477 static class RowData implements Comparable<RowData> { 478 OfficialStatus os; 479 String name; 480 Long pop; 481 RowData(OfficialStatus os, String name, Long pop)482 public RowData(OfficialStatus os, String name, Long pop) { 483 this.os = os; 484 this.name = name; 485 this.pop = pop; 486 } 487 getStatus()488 public OfficialStatus getStatus() { 489 // TODO Auto-generated method stub 490 return os; 491 } 492 getName()493 public CharSequence getName() { 494 // TODO Auto-generated method stub 495 return name; 496 } 497 getLiteratePopulation()498 public Long getLiteratePopulation() { 499 // TODO Auto-generated method stub 500 return pop; 501 } 502 503 @Override compareTo(RowData o)504 public int compareTo(RowData o) { 505 // TODO Auto-generated method stub 506 int result = os.compareTo(o.os); 507 if (result != 0) return -result; 508 long result2 = pop - o.pop; 509 if (result2 != 0) return result2 < 0 ? 1 : -1; 510 return name.compareTo(o.name); 511 } 512 513 @Override equals(Object o)514 public boolean equals(Object o) { 515 return 0 == compareTo((RowData) o); 516 } 517 518 @Override hashCode()519 public int hashCode() { 520 throw new UnsupportedOperationException(); 521 } 522 } 523 printDefaultLanguagesAndScripts()524 private static void printDefaultLanguagesAndScripts() { 525 526 final int minTotalPopulation = 10000000; 527 final int minTerritoryPopulation = 1000000; 528 final double minTerritoryPercent = 1.0 / 3; 529 Map<String, Set<RowData>> languageToReason = new TreeMap<>(); 530 Counter<String> languageToLiteratePopulation = new Counter<>(); 531 NumberFormat nf = NumberFormat.getIntegerInstance(ULocale.ENGLISH); 532 nf.setGroupingUsed(true); 533 LanguageTagParser ltp = new LanguageTagParser(); 534 LikelySubtags likelySubtags = new LikelySubtags(); 535 /* 536 * A. X is a qualified language**, and at least one of the following is true: 537 * 538 * 1. X is has official status* in any country 539 * 2. X exceeds a threshold population† of literate users worldwide: 1M 540 * 3. X exceeds a threshold population† in some country Z: 100K and 20% of Z's population†. 541 * 542 * B. X is an exception explicitly approved by the committee or X has minimal 543 * language coverage‡ in CLDR itself. 544 * C. The language is in the CLDR-target locales 545 */ 546 OfficialStatus minimalStatus = OfficialStatus.official_regional; // OfficialStatus.de_facto_official; 547 Map<String, String> languages = new TreeMap<>(); 548 for (String language : standardCodes.getAvailableCodes("language")) { 549 String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, language); 550 String result = english.getStringValue(path); 551 if (result != null) { 552 languages.put(language, result); 553 } 554 } 555 for (String language : languages.keySet()) { 556 System.out.println(language + "\t" + languages.get(language)); 557 } 558 559 // also CLDR-target locales 560 final Set<String> CLDRMainLanguages = new TreeSet<>(StandardCodes.make().getLocaleCoverageLocales(Organization.cldr)); 561 562 for (String territory : supplementalData.getTerritoriesWithPopulationData()) { 563 PopulationData territoryPop = supplementalData.getPopulationDataForTerritory(territory); 564 double territoryPopulation = territoryPop.getLiteratePopulation(); 565 for (String languageScript : supplementalData.getLanguagesForTerritoryWithPopulationData(territory)) { 566 PopulationData popData = supplementalData.getLanguageAndTerritoryPopulationData(languageScript, 567 territory); 568 ltp.set(languageScript); 569 String language = ltp.getLanguage(); 570 // if (ltp.getScript().isEmpty()) { 571 // String max = likelySubtags.maximize(languageScript); 572 // if (max != null) { 573 // ltp.set(max).setRegion(""); 574 // languageScript = ltp.toString(); 575 // } 576 // } 577 boolean add = false; 578 // #1 579 OfficialStatus status = popData.getOfficialStatus(); 580 if (status.compareTo(minimalStatus) >= 0) { 581 add = true; 582 } 583 long literatePopulation = getWritingPopulation(popData); 584 // #2 585 languageToLiteratePopulation.add(language, literatePopulation); 586 // #3 587 if (literatePopulation > minTerritoryPopulation 588 && literatePopulation > minTerritoryPercent * territoryPopulation) { 589 add = true; 590 } 591 if (add == false && CLDRMainLanguages.contains(language)) { 592 add = true; 593 } 594 if (add) { 595 add(languageToReason, language, territory, status, literatePopulation); 596 // Add the containing regions 597 for (String container : Containment.leafToContainer(territory)) { 598 add(languageToReason, language, container, OfficialStatus.unknown, literatePopulation); 599 } 600 } 601 } 602 } 603 // #2, now that we have the data 604 for (String language : languageToLiteratePopulation.keySet()) { 605 long totalPop = languageToLiteratePopulation.getCount(language); 606 if (totalPop > minTotalPopulation) { 607 add(languageToReason, language, "001", OfficialStatus.unknown, totalPop); 608 } 609 } 610 611 // Specials 612 add(languageToReason, "und", "001", OfficialStatus.unknown, 0); 613 614 // for (String language : Iso639Data.getAvailable()) { 615 // Scope scope = Iso639Data.getScope(language); 616 // Type type = Iso639Data.getType(language); 617 // if (scope == Scope.Special) { 618 // add(languageToReason, language, "001", OfficialStatus.unknown, -1); 619 // } 620 // } 621 // print them 622 623 System.out.println("Detailed - Including:\t" + languageToReason.size()); 624 625 for (String language : languageToReason.keySet()) { 626 Set<RowData> reasons = languageToReason.get(language); 627 628 RowData lastReason = reasons.iterator().next(); 629 630 System.out.append(language) 631 .append("\t") 632 .append(english.getName(language)) 633 .append("\t") 634 .append(lastReason.getStatus().toShortString()) 635 .append("\t") 636 .append(nf.format(languageToLiteratePopulation.getCount(language))); 637 for (RowData reason : reasons) { 638 String status = reason.getStatus().toShortString(); 639 System.out.append("\t") 640 .append(status) 641 .append("-") 642 .append(reason.getName()) 643 .append("-") 644 .append(nf.format(reason.getLiteratePopulation())); 645 } 646 System.out.append("\n"); 647 } 648 649 // now list them 650 651 Set<String> others = new TreeSet<>(); 652 others.addAll(standardCodes.getGoodAvailableCodes("language")); 653 others.removeAll(languageToReason.keySet()); 654 System.out.println("\nIncluded Languages:\t" + languageToReason.keySet().size()); 655 showLanguages(languageToReason.keySet(), languageToReason); 656 System.out.println("\nExcluded Languages:\t" + others.size()); 657 showLanguages(others, languageToReason); 658 } 659 getWritingPopulation(PopulationData popData)660 private static long getWritingPopulation(PopulationData popData) { 661 final double writingPopulation = popData.getWritingPopulation(); 662 if (!Double.isNaN(writingPopulation)) { 663 return (long) writingPopulation; 664 } 665 return (long) popData.getLiteratePopulation(); 666 } 667 showLanguages(Set<String> others, Map<String, Set<RowData>> languageToReason)668 private static void showLanguages(Set<String> others, Map<String, Set<RowData>> languageToReason) { 669 Set<String> sorted = new TreeSet<>(Collator.getInstance(ULocale.ENGLISH)); 670 for (String language : others) { 671 sorted.add(getLanguageName(language, languageToReason)); 672 } 673 char last = 0; 674 for (String language : sorted) { 675 final char curr = language.charAt(0); 676 if (last != curr) { 677 System.out.println(); 678 } else if (last != '\u0000') { 679 System.out.print(", "); 680 } 681 System.out.print(language); 682 last = curr; 683 } 684 System.out.println(); 685 } 686 getLanguageName(String language, Map<String, Set<RowData>> languageToReason)687 private static String getLanguageName(String language, 688 Map<String, Set<RowData>> languageToReason) { 689 OfficialStatus best = OfficialStatus.unknown; 690 Set<RowData> reasons = languageToReason.get(language); 691 if (reasons != null) { 692 for (RowData reason : reasons) { 693 final OfficialStatus currentStatus = reason.getStatus(); 694 if (best.compareTo(currentStatus) < 0) { 695 best = currentStatus; 696 } 697 } 698 } 699 String status = best.toShortString(); 700 Scope scope = Iso639Data.getScope(language); 701 if (scope == Scope.Special) { 702 status = "S"; 703 } 704 String languageFormatted = english.getName(language) + " [" + language + "]-" + status; 705 return languageFormatted; 706 } 707 add(Map<String, Set<RowData>> languageToReason, String language, String territoryRaw, OfficialStatus status, long population)708 private static void add(Map<String, Set<RowData>> languageToReason, String language, 709 String territoryRaw, OfficialStatus status, long population) { 710 String territory = english.getName("territory", territoryRaw) + " [" + territoryRaw + "]"; 711 Set<RowData> set = languageToReason.get(language); 712 if (set == null) { 713 languageToReason.put(language, set = new TreeSet<>()); 714 } 715 set.add(new RowData(status, territory, population)); 716 } 717 718 /** 719 * In computing the defaultContents, no and nb require special handling. 720 */ 721 static final Map<String, String> SPECIAL_CHILD_TO_PARENT = ImmutableMap.of("nb", "no", "nb_NO", "nb"); 722 723 /* 724 * Compute the defaultContent values for supplemental data. 725 * It uses the maximization data and the simpleParent (truncation). 726 * We can't use the normal "getParent" because that messes up the logic 727 * used to handle inconsistencies in scripts in CLDR.<br> 728 * That is, there are three situations: <ul> 729 * <li>all children have explicit scripts; </li> 730 * <li>no children have scripts; and </li> 731 * <li>some do and some don't</li></ul> 732 */ 733 printDefaultContent(Map<String, String> toMaximized)734 private static void printDefaultContent(Map<String, String> toMaximized) throws IOException { 735 736 Set<String> defaultLocaleContent = new TreeSet<>(); 737 738 // go through all the cldr locales, and add default contents 739 // now computed from toMaximized 740 Set<String> available = factory.getAvailable(); 741 Relation<String, String> toSimpleChildren = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class); 742 LanguageTagParser ltp = new LanguageTagParser(); 743 744 // System.out.println(maximize("az_Latn_AZ", toMaximized)); 745 Set<String> hasSimpleChildWithScript = new TreeSet<>(); 746 747 // first get a mapping to children 748 for (String locale : available) { 749 if (locale.equals("root")) { 750 continue; 751 } 752 if (ltp.set(locale).getVariants().size() != 0) { 753 continue; 754 } 755 String parent = SPECIAL_CHILD_TO_PARENT.get(locale); 756 if (parent == null) { 757 parent = LocaleIDParser.getSimpleParent(locale); // we can't use the regular getParent (see above) 758 } 759 760 if (ltp.getScript().length() != 0) { 761 hasSimpleChildWithScript.add(parent); 762 } 763 if (parent.equals("root")) { 764 continue; 765 } 766 toSimpleChildren.put(parent, locale); 767 } 768 769 // Suppress script for locales for which we only have one locale in common/main. See ticket #7834. 770 Set<String> suppressScriptLocales = new HashSet<>(Arrays.asList( 771 "bm_ML", "en_US", "ha_NG", "iu_CA", "ms_MY", "mn_MN", 772 "byn_ER", "ff_SN", "dyo_SN", "kk_KZ", "ku_TR", "ky_KG", "ml_IN", "so_SO", "sw_TZ", "wo_SN", "yo_NG", "dje_NE", 773 "blt_VN", 774 "hi_IN", 775 "nv_US", 776 "doi_IN" 777 )); 778 779 // if any have a script, then throw out any that don't have a script (unless they're specifically included.) 780 Set<String> toRemove = new TreeSet<>(); 781 for (String locale : hasSimpleChildWithScript) { 782 toRemove.clear(); 783 Set<String> children = toSimpleChildren.getAll(locale); 784 for (String child : children) { 785 if (ltp.set(child).getScript().length() == 0 && !suppressScriptLocales.contains(child)) { 786 toRemove.add(child); 787 } 788 } 789 if (toRemove.size() != 0) { 790 System.out.println("\tRemoving:\t" + locale + "\t" + toRemove + "\tfrom\t" + children); 791 toSimpleChildren.removeAll(locale, toRemove); 792 } 793 } 794 795 // we add a child as a default locale if it has the same maximization 796 main: for (String locale : toSimpleChildren.keySet()) { 797 String maximized = maximize(locale, toMaximized); 798 if (maximized == null) { 799 if (SHOW_ADD) System.out.println("Missing maximized:\t" + locale); 800 continue; 801 } 802 Set<String> children = toSimpleChildren.getAll(locale); 803 Map<String, String> debugStuff = new TreeMap<>(); 804 for (String child : children) { 805 String maximizedChild = maximize(child, toMaximized); 806 if (maximized.equals(maximizedChild)) { 807 defaultLocaleContent.add(child); 808 continue main; 809 } 810 debugStuff.put(child, maximizedChild); 811 } 812 if (SHOW_ADD) System.out.println("Can't find maximized: " + locale + "=" + maximized 813 + "\tin\t" + debugStuff); 814 } 815 816 for (String specialChild : SPECIAL_CHILD_TO_PARENT.keySet()) { 817 defaultLocaleContent.add(specialChild); 818 } 819 defaultLocaleContent.remove("und_ZZ"); // und_ZZ isn't ever a real locale. (old sandbox) 820 defaultLocaleContent.remove("mul_ZZ"); // mul_ZZ isn't ever a real locale. 821 822 showDefaultContentDifferencesAndFix(defaultLocaleContent); 823 824 Log.setLogNoBOM(CLDRPaths.GEN_DIRECTORY + "/supplemental", "supplementalMetadata.xml"); 825 BufferedReader oldFile = FileUtilities.openUTF8Reader(CLDRPaths.SUPPLEMENTAL_DIRECTORY, "supplementalMetadata.xml"); 826 CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*<defaultContent locales=\"\\s*"), Log.getLog(), false); 827 828 String sep = CldrUtility.LINE_SEPARATOR + "\t\t\t"; 829 String broken = CldrUtility.breakLines(CldrUtility.join(defaultLocaleContent, " "), sep, 830 PatternCache.get("(\\S)\\S*").matcher(""), 80); 831 832 Log.println("\t\t<defaultContent locales=\"" + broken + "\""); 833 Log.println("\t\t/>"); 834 835 // Log.println("</supplementalData>"); 836 CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*/>\\s*(<!--.*)?"), null, true); // skip to matching > 837 CldrUtility.copyUpTo(oldFile, null, Log.getLog(), true); // copy the rest 838 839 Log.close(); 840 oldFile.close(); 841 } 842 843 // private static void oldAlgorithm(Map<String,String> toMaximized) { 844 // Set<String> defaultContentLocales = supplementalData.getDefaultContentLocales(); 845 // LanguageTagParser parser = new LanguageTagParser(); 846 // for (String locale : defaultContentLocales) { 847 // String parent = parser.getParent(locale); 848 // toMaximized.put(parent, locale); 849 // if (SHOW_ADD) System.out.println("Adding:\t" + parent + "\t=>\t" + locale + "\t\tDefaultContent"); 850 // } 851 // 852 // for (String[] specialCase : SpecialCases) { 853 // toMaximized.put(specialCase[0], specialCase[1]); 854 // if (SHOW_ADD) System.out.println("Adding:\t" + specialCase[0] + "\t=>\t" + specialCase[1] + "\t\tSpecial"); 855 // } 856 // 857 // // recurse and close 858 // closeMapping(toMaximized); 859 // 860 // addScript(toMaximized, parser); 861 // 862 // closeMapping(toMaximized); 863 // 864 // addLanguageScript(toMaximized, parser); 865 // 866 // closeMapping(toMaximized); 867 // 868 // addLanguageCountry(toMaximized, parser); 869 // 870 // closeMapping(toMaximized); 871 // 872 // addCountries(toMaximized); 873 // addScript(toMaximized, parser); 874 // closeMapping(toMaximized); 875 // closeUnd(toMaximized); 876 // 877 // addDeprecated(toMaximized); 878 // 879 // closeMapping(toMaximized); 880 // 881 // checkConsistency(toMaximized); 882 // } 883 884 private static class MaxData { 885 Relation<String, Row.R3<Double, String, String>> languages = Relation.of(new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class); 886 Map<String, Counter<String>> languagesToScripts = new TreeMap<>(); 887 Map<String, Counter<String>> languagesToRegions = new TreeMap<>(); 888 889 Relation<String, Row.R3<Double, String, String>> scripts = Relation.of(new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class); 890 Map<String, Counter<String>> scriptsToLanguages = new TreeMap<>(); 891 Map<String, Counter<String>> scriptsToRegions = new TreeMap<>(); 892 893 Relation<String, Row.R3<Double, String, String>> regions = Relation.of(new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class); 894 Map<String, Counter<String>> regionsToLanguages = new TreeMap<>(); 895 Map<String, Counter<String>> regionsToScripts = new TreeMap<>(); 896 897 Map<String, Counter<Row.R2<String, String>>> containersToLanguage = new TreeMap<>(); 898 Relation<String, Row.R4<Double, String, String, String>> containersToLangRegion = Relation.of( 899 new TreeMap<String, Set<Row.R4<Double, String, String, String>>>(), TreeSet.class); 900 901 Relation<Row.R2<String, String>, Row.R2<Double, String>> languageScripts = Relation.of( 902 new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(), 903 TreeSet.class); 904 Relation<Row.R2<String, String>, Row.R2<Double, String>> scriptRegions = Relation.of( 905 new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(), 906 TreeSet.class); 907 Relation<Row.R2<String, String>, Row.R2<Double, String>> languageRegions = Relation.of( 908 new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(), 909 TreeSet.class); 910 911 /** 912 * Add population information. "order" is the negative of the population (makes the first be the highest). 913 * @param language 914 * @param script 915 * @param region 916 * @param order 917 */ add(String language, String script, String region, Double order)918 void add(String language, String script, String region, Double order) { 919 if (SHOW_ADD && language.equals("mis")) { 920 System.out.println(language + "\t" + script + "\t" + region + "\t" + -order); 921 } 922 languages.put(language, Row.of(order, script, region)); 923 // addCounter(languagesToScripts, language, script, order); 924 // addCounter(languagesToRegions, language, region, order); 925 926 scripts.put(script, Row.of(order, language, region)); 927 // addCounter(scriptsToLanguages, script, language, order); 928 // addCounter(scriptsToRegions, script, region, order); 929 930 regions.put(region, Row.of(order, language, script)); 931 // addCounter(regionsToLanguages, region, language, order); 932 // addCounter(regionsToScripts, region, script, order); 933 934 languageScripts.put(Row.of(language, script), Row.of(order, region)); 935 scriptRegions.put(Row.of(script, region), Row.of(order, language)); 936 languageRegions.put(Row.of(language, region), Row.of(order, script)); 937 938 Set<String> containerSet = Containment.leafToContainer(region); 939 if (containerSet != null) { 940 for (String container : containerSet) { 941 942 containersToLangRegion.put(container, Row.of(order, language, script, region)); 943 Counter<R2<String, String>> data = containersToLanguage.get(container); 944 if (data == null) { 945 containersToLanguage.put(container, data = new Counter<>()); 946 } 947 data.add(Row.of(language, script), (long) (double) order); 948 949 } 950 } 951 952 if (SHOW_ADD) System.out.println("Data:\t" + language + "\t" + script + "\t" + region + "\t" + order); 953 } 954 // private void addCounter(Map<String, Counter<String>> map, String key, String key2, Double count) { 955 // Counter<String> counter = map.get(key); 956 // if (counter == null) { 957 // map.put(key, counter = new Counter<String>()); 958 // } 959 // counter.add(key2, count.longValue()); 960 // } 961 } 962 963 private static final double MIN_UNOFFICIAL_LANGUAGE_SIZE = 10000000; 964 private static final double MIN_UNOFFICIAL_LANGUAGE_PROPORTION = 0.20; 965 private static final double MIN_UNOFFICIAL_CLDR_LANGUAGE_SIZE = 100000; 966 private static final double UNOFFICIAL_SCALE_DOWN = 0.2; 967 968 private static NumberFormat percent = NumberFormat.getPercentInstance(); 969 private static NumberFormat number = NumberFormat.getIntegerInstance(); 970 tryDifferentAlgorithm(Map<String, String> toMaximized)971 private static void tryDifferentAlgorithm(Map<String, String> toMaximized) { 972 // we are going to try a different approach. 973 // first gather counts for maximized values 974 // Set<Row.R3<String,String,String>,Double> rowsToCounts = new TreeMap(); 975 MaxData maxData = new MaxData(); 976 Set<String> cldrLocales = factory.getAvailable(); 977 Set<String> otherTerritories = new TreeSet<>(standardCodes.getGoodAvailableCodes("territory")); 978 979 // process all the information to get the top values for each triple. 980 // each of the combinations of 1 or 2 components gets to be a key. 981 for (String region : supplementalData.getTerritoriesWithPopulationData()) { 982 otherTerritories.remove(region); 983 PopulationData regionData = supplementalData.getPopulationDataForTerritory(region); 984 final double literateTerritoryPopulation = regionData.getLiteratePopulation(); 985 // we need any unofficial language to meet a certain absolute size requirement and proportion size 986 // requirement. 987 // so the bar is x percent of the population, reset up to y absolute size. 988 double minimalLiteratePopulation = literateTerritoryPopulation * MIN_UNOFFICIAL_LANGUAGE_PROPORTION; 989 if (minimalLiteratePopulation < MIN_UNOFFICIAL_LANGUAGE_SIZE) { 990 minimalLiteratePopulation = MIN_UNOFFICIAL_LANGUAGE_SIZE; 991 } 992 993 for (String writtenLanguage : supplementalData.getLanguagesForTerritoryWithPopulationData(region)) { 994 PopulationData data = supplementalData.getLanguageAndTerritoryPopulationData(writtenLanguage, region); 995 final double literatePopulation = getWritingPopulation(data); //data.getLiteratePopulation(); 996 double order = -literatePopulation; // negative so we get the inverse order 997 998 if (data.getOfficialStatus() == OfficialStatus.unknown) { 999 final String locale = writtenLanguage + "_" + region; 1000 if (literatePopulation >= minimalLiteratePopulation) { 1001 // ok, skip 1002 } else if (literatePopulation >= MIN_UNOFFICIAL_CLDR_LANGUAGE_SIZE && cldrLocales.contains(locale)) { 1003 // ok, skip 1004 } else { 1005 // if (SHOW_ADD) 1006 // System.out.println("Skipping:\t" + writtenLanguage + "\t" + region + "\t" 1007 // + english.getName(locale) 1008 // + "\t-- too small:\t" + number.format(literatePopulation)); 1009 // continue; 1010 } 1011 order *= UNOFFICIAL_SCALE_DOWN; 1012 if (SHOW_ADD) 1013 System.out.println("Retaining\t" + writtenLanguage + "\t" + region + "\t" 1014 + english.getName(locale) 1015 + "\t" + number.format(literatePopulation) 1016 + "\t" + percent.format(literatePopulation / literateTerritoryPopulation) 1017 + (cldrLocales.contains(locale) ? "\tin-CLDR" : "")); 1018 } 1019 String script; 1020 String language = writtenLanguage; 1021 final int pos = writtenLanguage.indexOf('_'); 1022 if (pos > 0) { 1023 language = writtenLanguage.substring(0, pos); 1024 script = writtenLanguage.substring(pos + 1); 1025 } else { 1026 script = getScriptForLocale2(language); 1027 } 1028 maxData.add(language, script, region, order); 1029 } 1030 } 1031 1032 LanguageTagParser additionLtp = new LanguageTagParser(); 1033 1034 for (String addition : MAX_ADDITIONS) { 1035 additionLtp.set(addition); 1036 String lan = additionLtp.getLanguage(); 1037 Set<R3<Double, String, String>> key = maxData.languages.get(lan); 1038 if (key == null) { 1039 maxData.add(lan, additionLtp.getScript(), additionLtp.getRegion(), 1.0); 1040 } else { 1041 int debug = 0; 1042 } 1043 } 1044 1045 for (Entry<String, Collection<String>> entry : DeriveScripts.getLanguageToScript().asMap().entrySet()) { 1046 String language = entry.getKey(); 1047 final Collection<String> values = entry.getValue(); 1048 if (values.size() != 1) { 1049 continue; // skip, no either way 1050 } 1051 Set<R3<Double, String, String>> old = maxData.languages.get(language); 1052 if (!maxData.languages.containsKey(language)) { 1053 maxData.add(language, values.iterator().next(), TEMP_UNKNOWN_REGION, 1.0); 1054 } 1055 } 1056 1057 // add others, with English default 1058 for (String region : otherTerritories) { 1059 if (region.length() == 3) continue; // FIX ONCE WE ADD REGIONS 1060 maxData.add("en", "Latn", region, 1.0); 1061 } 1062 1063 // get a reverse mapping, so that we can add the aliases 1064 1065 Map<String, R2<List<String>, String>> languageAliases = SupplementalDataInfo.getInstance().getLocaleAliasInfo() 1066 .get("language"); 1067 for (Entry<String, R2<List<String>, String>> str : languageAliases.entrySet()) { 1068 String reason = str.getValue().get1(); 1069 if ("overlong".equals(reason) || "bibliographic".equals(reason) || "macrolanguage".equals(reason)) { 1070 continue; 1071 } 1072 List<String> replacements = str.getValue().get0(); 1073 if (replacements == null) { 1074 continue; 1075 } 1076 String goodLanguage = replacements.get(0); 1077 1078 String badLanguage = str.getKey(); 1079 if (badLanguage.contains("_")) { 1080 continue; 1081 } 1082 if (deprecatedISONotInLST.contains(badLanguage)) { 1083 continue; 1084 } 1085 Set<R3<Double, String, String>> goodLanguageData = maxData.languages.getAll(goodLanguage); 1086 if (goodLanguageData == null) { 1087 continue; 1088 } 1089 R3<Double, String, String> value = goodLanguageData.iterator().next(); 1090 final String script = value.get1(); 1091 final String region = value.get2(); 1092 maxData.add(badLanguage, script, region, 1.0); 1093 System.out.println("Adding aliases: " + badLanguage + ", " + script + ", " + region + ", " + reason); 1094 } 1095 1096 // now, get the best for each one 1097 for (String language : maxData.languages.keySet()) { 1098 R3<Double, String, String> value = maxData.languages.getAll(language).iterator().next(); 1099 final Comparable<String> script = value.get1(); 1100 final Comparable<String> region = value.get2(); 1101 add(language, language + "_" + script + "_" + region, toMaximized, "L->SR", LocaleOverride.REPLACE_EXISTING, 1102 SHOW_ADD); 1103 } 1104 for (String language : maxData.languagesToScripts.keySet()) { 1105 String script = maxData.languagesToScripts.get(language).getKeysetSortedByCount(true).iterator().next(); 1106 add(language, language + "_" + script, toMaximized, "L->S", LocaleOverride.REPLACE_EXISTING, SHOW_ADD); 1107 } 1108 for (String language : maxData.languagesToRegions.keySet()) { 1109 String region = maxData.languagesToRegions.get(language).getKeysetSortedByCount(true).iterator().next(); 1110 add(language, language + "_" + region, toMaximized, "L->R", LocaleOverride.REPLACE_EXISTING, SHOW_ADD); 1111 } 1112 1113 for (String script : maxData.scripts.keySet()) { 1114 R3<Double, String, String> value = maxData.scripts.getAll(script).iterator().next(); 1115 final Comparable<String> language = value.get1(); 1116 final Comparable<String> region = value.get2(); 1117 add("und_" + script, language + "_" + script + "_" + region, toMaximized, "S->LR", 1118 LocaleOverride.REPLACE_EXISTING, SHOW_ADD); 1119 } 1120 for (String script : maxData.scriptsToLanguages.keySet()) { 1121 String language = maxData.scriptsToLanguages.get(script).getKeysetSortedByCount(true).iterator().next(); 1122 add("und_" + script, language + "_" + script, toMaximized, "S->L", LocaleOverride.REPLACE_EXISTING, SHOW_ADD); 1123 } 1124 for (String script : maxData.scriptsToRegions.keySet()) { 1125 String region = maxData.scriptsToRegions.get(script).getKeysetSortedByCount(true).iterator().next(); 1126 add("und_" + script, "und_" + script + "_" + region, toMaximized, "S->R", LocaleOverride.REPLACE_EXISTING, 1127 SHOW_ADD); 1128 } 1129 1130 for (String region : maxData.regions.keySet()) { 1131 R3<Double, String, String> value = maxData.regions.getAll(region).iterator().next(); 1132 final Comparable<String> language = value.get1(); 1133 final Comparable<String> script = value.get2(); 1134 add("und_" + region, language + "_" + script + "_" + region, toMaximized, "R->LS", 1135 LocaleOverride.REPLACE_EXISTING, SHOW_ADD); 1136 } 1137 for (String region : maxData.regionsToLanguages.keySet()) { 1138 String language = maxData.regionsToLanguages.get(region).getKeysetSortedByCount(true).iterator().next(); 1139 add("und_" + region, language + "_" + region, toMaximized, "R->L", LocaleOverride.REPLACE_EXISTING, SHOW_ADD); 1140 } 1141 for (String region : maxData.regionsToScripts.keySet()) { 1142 String script = maxData.regionsToScripts.get(region).getKeysetSortedByCount(true).iterator().next(); 1143 add("und_" + region, "und_" + script + "_" + region, toMaximized, "R->S", LocaleOverride.REPLACE_EXISTING, 1144 SHOW_ADD); 1145 } 1146 1147 for (Entry<String, Counter<R2<String, String>>> containerAndInfo : maxData.containersToLanguage.entrySet()) { 1148 String region = containerAndInfo.getKey(); 1149 if (region.equals("001")) { 1150 continue; 1151 } 1152 Counter<R2<String, String>> data = containerAndInfo.getValue(); 1153 Set<R2<String, String>> keysetSortedByCount = data.getKeysetSortedByCount(true); 1154 if (SHOW_CONTAINERS) { // debug 1155 System.out.println("Container2L:\t" + region + "\t" + shorten(data.getEntrySetSortedByCount(true, null))); 1156 System.out.println("Container2LR:\t" + region + "\t" + maxData.containersToLangRegion.get(region)); 1157 } 1158 R2<String, String> value = keysetSortedByCount.iterator().next(); // will get most negative 1159 final Comparable<String> language = value.get0(); 1160 final Comparable<String> script = value.get1(); 1161 1162 // fix special cases like es-419, where a locale exists. 1163 // for those cases, what we add as output is the container. Otherwise the region. 1164 Set<String> skipLanguages = cldrContainerToLanguages.get(region); 1165 if (skipLanguages != null 1166 && skipLanguages.contains(language)) { 1167 add("und_" + region, language + "_" + script + "_" + region, toMaximized, "R*->LS", 1168 LocaleOverride.REPLACE_EXISTING, SHOW_ADD); 1169 continue; 1170 } 1171 1172 // we now have the best language and script. Find the best region for that 1173 for (R4<Double, String, String, String> e : maxData.containersToLangRegion.get(region)) { 1174 final Comparable<String> language2 = e.get1(); 1175 final Comparable<String> script2 = e.get2(); 1176 if (language2.equals(language) && script2.equals(script)) { 1177 add("und_" + region, language + "_" + script + "_" + e.get3(), toMaximized, "R*->LS", 1178 LocaleOverride.REPLACE_EXISTING, SHOW_ADD); 1179 break; 1180 } 1181 } 1182 } 1183 1184 for (R2<String, String> languageScript : maxData.languageScripts.keySet()) { 1185 R2<Double, String> value = maxData.languageScripts.getAll(languageScript).iterator().next(); 1186 final Comparable<String> language = languageScript.get0(); 1187 final Comparable<String> script = languageScript.get1(); 1188 final Comparable<String> region = value.get1(); 1189 add(language + "_" + script, language + "_" + script + "_" + region, toMaximized, "LS->R", 1190 LocaleOverride.REPLACE_EXISTING, SHOW_ADD); 1191 } 1192 1193 for (R2<String, String> scriptRegion : maxData.scriptRegions.keySet()) { 1194 R2<Double, String> value = maxData.scriptRegions.getAll(scriptRegion).iterator().next(); 1195 final Comparable<String> script = scriptRegion.get0(); 1196 final Comparable<String> region = scriptRegion.get1(); 1197 final Comparable<String> language = value.get1(); 1198 add("und_" + script + "_" + region, language + "_" + script + "_" + region, toMaximized, "SR->L", 1199 LocaleOverride.REPLACE_EXISTING, SHOW_ADD); 1200 } 1201 1202 for (R2<String, String> languageRegion : maxData.languageRegions.keySet()) { 1203 R2<Double, String> value = maxData.languageRegions.getAll(languageRegion).iterator().next(); 1204 final Comparable<String> language = languageRegion.get0(); 1205 final Comparable<String> region = languageRegion.get1(); 1206 final Comparable<String> script = value.get1(); 1207 add(language + "_" + region, language + "_" + script + "_" + region, toMaximized, "LR->S", 1208 LocaleOverride.REPLACE_EXISTING, SHOW_ADD); 1209 } 1210 1211 // get the script info from metadata as fallback 1212 1213 1214 TreeSet<String> sorted = new TreeSet<>(ScriptMetadata.getScripts()); 1215 for (String script : sorted) { 1216 Info i = ScriptMetadata.getInfo(script); 1217 String likelyLanguage = i.likelyLanguage; 1218 if (LANGUAGE_CODE_TO_STATUS.get(likelyLanguage) == Status.special) { 1219 likelyLanguage = "und"; 1220 } 1221 String originCountry = i.originCountry; 1222 final String result = likelyLanguage + "_" + script + "_" + originCountry; 1223 add("und_" + script, result, toMaximized, "S->LR•", 1224 LocaleOverride.KEEP_EXISTING, SHOW_ADD); 1225 add(likelyLanguage, result, toMaximized, "L->SR•", 1226 LocaleOverride.KEEP_EXISTING, SHOW_ADD); 1227 } 1228 1229 // add overrides 1230 for (String key : LANGUAGE_OVERRIDES.keySet()) { 1231 add(key, LANGUAGE_OVERRIDES.get(key), toMaximized, "OVERRIDE", LocaleOverride.REPLACE_EXISTING, true); 1232 } 1233 1234 // Make sure that the mapping is Idempotent. If we have A ==> B, we must never have B ==> C 1235 // We run this check until we get no problems. 1236 Set<List<String>> problems = new HashSet<>(); 1237 1238 while (true) { 1239 problems.clear(); 1240 for (Entry<String, String> entry : toMaximized.entrySet()) { 1241 String source = entry.getKey(); 1242 String target = entry.getValue(); 1243 if (target.contains("_Zzzz") || target.contains("_ZZ")) { // these are special cases 1244 continue; 1245 } 1246 String idempotentCandidate = LikelySubtags.maximize(target, toMaximized); 1247 1248 if (idempotentCandidate == null) { 1249 System.out.println("Can't maximize " + target); 1250 } else if (!idempotentCandidate.equals(target)) { 1251 problems.add(ImmutableList.of(source, target, idempotentCandidate)); 1252 } 1253 } 1254 if (problems.isEmpty()) { 1255 break; 1256 } 1257 for (List<String> row : problems) { 1258 System.out.println("Idempotence: dropping mapping " + row.get(0) + " to " + row.get(1) + " since the target maps further to " + row.get(2)); 1259 toMaximized.remove(row.get(0)); 1260 } 1261 } 1262 } 1263 shorten(Object data)1264 public static String shorten(Object data) { 1265 String info = data.toString(); 1266 if (info.length() > 255) { 1267 info = info.substring(0, 127) + "…"; 1268 } 1269 return info; 1270 } 1271 doAlt(Map<String, String> toMaximized)1272 private static void doAlt(Map<String, String> toMaximized) { 1273 // TODO Auto-generated method stub 1274 Map<String, String> temp = new TreeMap<>(); 1275 for (String locale : toMaximized.keySet()) { 1276 String target = toMaximized.get(locale); 1277 temp.put(toAlt(locale, true), toAlt(target, true)); 1278 } 1279 toMaximized.clear(); 1280 toMaximized.putAll(temp); 1281 } 1282 maximize(String languageTag, Map<String, String> toMaximized)1283 public static String maximize(String languageTag, Map<String, String> toMaximized) { 1284 LanguageTagParser ltp = new LanguageTagParser(); 1285 1286 // clean up the input by removing Zzzz, ZZ, and changing "" into und. 1287 ltp.set(languageTag); 1288 String language = ltp.getLanguage(); 1289 String region = ltp.getRegion(); 1290 String script = ltp.getScript(); 1291 boolean changed = false; 1292 if (language.equals("")) { 1293 ltp.setLanguage(language = "und"); 1294 changed = true; 1295 } 1296 if (region.equals(UNKNOWN_SCRIPT)) { 1297 ltp.setScript(script = ""); 1298 changed = true; 1299 } 1300 if (ltp.getRegion().equals(UNKNOWN_REGION)) { 1301 ltp.setRegion(region = ""); 1302 changed = true; 1303 } 1304 if (changed) { 1305 languageTag = ltp.toString(); 1306 } 1307 // check whole 1308 String result = toMaximized.get(languageTag); 1309 if (result != null) { 1310 return result; 1311 } 1312 // try empty region 1313 if (region.length() != 0) { 1314 result = toMaximized.get(ltp.setRegion("").toString()); 1315 if (result != null) { 1316 return ltp.set(result).setRegion(region).toString(); 1317 } 1318 ltp.setRegion(region); // restore 1319 } 1320 // try empty script 1321 if (script.length() != 0) { 1322 result = toMaximized.get(ltp.setScript("").toString()); 1323 if (result != null) { 1324 return ltp.set(result).setScript(script).toString(); 1325 } 1326 // try empty script and region 1327 if (region.length() != 0) { 1328 result = toMaximized.get(ltp.setRegion("").toString()); 1329 if (result != null) { 1330 return ltp.set(result).setScript(script).setRegion(region).toString(); 1331 } 1332 } 1333 } 1334 if (!language.equals("und") && script.length() != 0 && region.length() != 0) { 1335 return languageTag; // it was ok, and we couldn't do anything with it 1336 } 1337 return null; // couldn't maximize 1338 } 1339 minimize(String input, Map<String, String> toMaximized, boolean favorRegion)1340 public static String minimize(String input, Map<String, String> toMaximized, boolean favorRegion) { 1341 if (input.equals("nb_Latn_SJ")) { 1342 System.out.print(""); // debug 1343 } 1344 String maximized = maximize(input, toMaximized); 1345 if (maximized == null) { 1346 return null; // failed 1347 } 1348 LanguageTagParser ltp = new LanguageTagParser().set(maximized); 1349 String language = ltp.getLanguage(); 1350 String region = ltp.getRegion(); 1351 String script = ltp.getScript(); 1352 // try building up from shorter to longer, and find the first that matches 1353 // could be more optimized, but for this code we want simplest 1354 String[] trials = { language, 1355 language + TAG_SEPARATOR + (favorRegion ? region : script), 1356 language + TAG_SEPARATOR + (!favorRegion ? region : script) }; 1357 for (String trial : trials) { 1358 String newMaximized = maximize(trial, toMaximized); 1359 if (maximized.equals(newMaximized)) { 1360 return trial; 1361 } 1362 } 1363 return maximized; 1364 } 1365 1366 // /** 1367 // * Verify that we can map from each language, script, and country to something. 1368 // * @param toMaximized 1369 // */ 1370 // private static void checkConsistency(Map<String, String> toMaximized) { 1371 // Map<String,String> needMappings = new TreeMap(); 1372 // LanguageTagParser parser = new LanguageTagParser(); 1373 // for (String maximized : new TreeSet<String>(toMaximized.values())) { 1374 // parser.set(maximized); 1375 // final String language = parser.getLanguage(); 1376 // final String script = parser.getScript(); 1377 // final String region = parser.getRegion(); 1378 // if (language.length() == 0 || script.length() == 0 || region.length() == 0) { 1379 // failure(" { \"" + maximized + "\", \"" + maximized + "\" }, // " + english.getName(maximized) + 1380 // "\t\tFailed-Consistency"); 1381 // continue; 1382 // } 1383 // addIfNotIn(language, maximized, needMappings, toMaximized, "Consistency"); 1384 // addIfNotIn(language + "_" + script, maximized, needMappings, toMaximized, "Consistency"); 1385 // addIfNotIn(language + "_" + region, maximized, needMappings, toMaximized, "Consistency"); 1386 // addIfNotIn("und_" + script, maximized, needMappings, toMaximized, "Consistency"); 1387 // addIfNotIn("und_" + script + "_" + region, maximized, needMappings, toMaximized, "Consistency"); 1388 // addIfNotIn("und_" + region, maximized, needMappings, toMaximized, "Consistency"); 1389 // } 1390 // toMaximized.putAll(needMappings); 1391 // } 1392 1393 // private static void failure(String string) { 1394 // System.out.println(string); 1395 // errorCount++; 1396 // } 1397 1398 // private static void addIfNotIn(String key, String value, Map<String, String> toAdd, Map<String, String> 1399 // otherToCheck, String kind) { 1400 // addIfNotIn(key, value, toAdd, otherToCheck == null ? null : otherToCheck.keySet(), null, kind); 1401 // } 1402 1403 // private static void addIfNotIn(String key, String value, Map<String, String> toAdd, Set<String> skipKey, 1404 // Set<String> skipValue, String kind) { 1405 // if (!key.equals(value) 1406 // && !toAdd.containsKey(key) 1407 // && (skipKey == null || !skipKey.contains(key)) 1408 // && (skipValue == null || !skipValue.contains(value))) { 1409 // add(key, value, toAdd, kind); 1410 // } 1411 // } 1412 1413 enum LocaleOverride { 1414 KEEP_EXISTING, REPLACE_EXISTING 1415 } 1416 add(String key, String value, Map<String, String> toAdd, String kind, LocaleOverride override, boolean showAction)1417 private static void add(String key, String value, Map<String, String> toAdd, String kind, LocaleOverride override, 1418 boolean showAction) { 1419 if (SHOW_ADD && key.startsWith("mis")) { 1420 int debug = 1; 1421 } 1422 if (key.equals(DEBUG_ADD_KEY)) { 1423 System.out.println("*debug*"); 1424 } 1425 String oldValue = toAdd.get(key); 1426 if (oldValue == null) { 1427 if (showAction) { 1428 System.out.println("\tAdding:\t\t" + getName(key) + "\t=>\t" + getName(value) + "\t\t\t\t" + kind); 1429 } 1430 } else if (override == LocaleOverride.KEEP_EXISTING || value.equals(oldValue)) { 1431 // if (showAction) { 1432 // System.out.println("Skipping:\t" + key + "\t=>\t" + value + "\t\t\t\t" + kind); 1433 // } 1434 return; 1435 } else { 1436 if (showAction) { 1437 System.out.println("\tReplacing:\t" + getName(key) + "\t=>\t" + getName(value) + "\t, was\t" + getName(oldValue) + "\t\t" + kind); 1438 } 1439 } 1440 toAdd.put(key, value); 1441 } 1442 getName(String value)1443 private static String getName(String value) { 1444 return ConvertLanguageData.getLanguageCodeAndName(value); 1445 } 1446 1447 // private static void addCountries(Map<String, String> toMaximized) { 1448 // Map <String, Map<String, Double>> scriptToLanguageToSize = new TreeMap(); 1449 // 1450 // for (String territory : supplementalData.getTerritoriesWithPopulationData()) { 1451 // Set<String> languages = supplementalData.getLanguagesForTerritoryWithPopulationData(territory); 1452 // String biggestOfficial = null; 1453 // double biggest = -1; 1454 // for (String language : languages) { 1455 // PopulationData info = supplementalData.getLanguageAndTerritoryPopulationData(language, territory); 1456 // // add to info about script 1457 // 1458 // String script = getScriptForLocale(language); 1459 // if (script != null) { 1460 // Map<String, Double> languageInfo = scriptToLanguageToSize.get(script); 1461 // if (languageInfo == null) scriptToLanguageToSize.put(script, languageInfo = new TreeMap()); 1462 // String baseLanguage = language; 1463 // int pos = baseLanguage.indexOf('_'); 1464 // if (pos >= 0) { 1465 // baseLanguage = baseLanguage.substring(0,pos); 1466 // } 1467 // Double size = languageInfo.get(baseLanguage); 1468 // languageInfo.put(baseLanguage, (size == null ? 0 : size) + info.getLiteratePopulation()); 1469 // } 1470 // 1471 // 1472 // final OfficialStatus officialStatus = info.getOfficialStatus(); 1473 // if (officialStatus == OfficialStatus.de_facto_official || officialStatus == OfficialStatus.official) { 1474 // double size2 = info.getLiteratePopulation(); 1475 // if (biggest < size2) { 1476 // biggest = size2; 1477 // biggestOfficial = language; 1478 // } 1479 // } 1480 // } 1481 // if (biggestOfficial != null) { 1482 // final String replacementTag = "und_" + territory; 1483 // String maximized = biggestOfficial + "_" + territory; 1484 // toMaximized.put(replacementTag, maximized); 1485 // if (SHOW_ADD) System.out.println("Adding:\t" + replacementTag + "\t=>\t" + maximized + "\t\tLanguage-Territory"); 1486 // } 1487 // } 1488 // 1489 // for (String script : scriptToLanguageToSize.keySet()) { 1490 // String biggestOfficial = null; 1491 // double biggest = -1; 1492 // 1493 // final Map<String, Double> languageToSize = scriptToLanguageToSize.get(script); 1494 // for (String language : languageToSize.keySet()) { 1495 // double size = languageToSize.get(language); 1496 // if (biggest < size) { 1497 // biggest = size; 1498 // biggestOfficial = language; 1499 // } 1500 // } 1501 // if (biggestOfficial != null) { 1502 // final String replacementTag = "und_" + script; 1503 // String maximized = biggestOfficial + "_" + script; 1504 // toMaximized.put(replacementTag, maximized); 1505 // if (SHOW_ADD) System.out.println("Adding:\t" + replacementTag + "\t=>\t" + maximized + "\t\tUnd-Script"); 1506 // } 1507 // } 1508 // } 1509 1510 // private static void closeUnd(Map<String, String> toMaximized) { 1511 // Map<String,String> toAdd = new TreeMap<String,String>(); 1512 // for (String oldSource : toMaximized.keySet()) { 1513 // String maximized = toMaximized.get(oldSource); 1514 // if (!maximized.startsWith("und")) { 1515 // int pos = maximized.indexOf("_"); 1516 // if (pos >= 0) { 1517 // addIfNotIn( "und" + maximized.substring(pos), maximized, toAdd, toMaximized, "CloseUnd"); 1518 // } 1519 // } 1520 // } 1521 // toMaximized.putAll(toAdd); 1522 // } 1523 1524 /** 1525 * Generate tags where the deprecated values map to the expanded values 1526 * 1527 * @param toMaximized 1528 */ 1529 // private static void addDeprecated(Map<String, String> toMaximized) { 1530 // Map<String, Map<String, List<String>>> typeToTagToReplacement = supplementalData.getLocaleAliasInfo(); 1531 // LanguageTagParser temp = new LanguageTagParser(); 1532 // LanguageTagParser tagParsed = new LanguageTagParser(); 1533 // LanguageTagParser replacementParsed = new LanguageTagParser(); 1534 // Map<String,String> toAdd = new TreeMap<String,String>(); 1535 // while (true) { 1536 // toAdd.clear(); 1537 // for (String type : typeToTagToReplacement.keySet()) { 1538 // if (type.equals("variant") || type.equals("zone")) continue; 1539 // boolean addUnd = !type.equals("language"); 1540 // 1541 // Map<String, List<String>> tagToReplacement = typeToTagToReplacement.get(type); 1542 // System.out.println("*" + type + " = " + tagToReplacement); 1543 // 1544 // for (String tag: tagToReplacement.keySet()) { 1545 // 1546 // final List<String> list = tagToReplacement.get(tag); 1547 // if (list == null) continue; // we don't have any information 1548 // String replacement = list.get(0); 1549 // 1550 // // only do multiples 1551 // if (tag.contains("_") || !replacement.contains("_")) { 1552 // continue; 1553 // } 1554 // 1555 // // we now have a tag and a replacement value 1556 // // make parsers that we can use 1557 // try { 1558 // tagParsed.set(addUnd ? "und-" + tag : tag); 1559 // replacementParsed.set(addUnd ? "und-" + replacement : replacement); 1560 // } catch (RuntimeException e) { 1561 // continue; 1562 // } 1563 // addIfNotIn(tag, replacement, toAdd, toMaximized,"Deprecated"); 1564 // 1565 // for (String locale : toMaximized.keySet()) { 1566 // String maximized = toMaximized.get(locale); 1567 // addIfMatches(temp.set(locale), maximized, replacementParsed, tagParsed, toAdd, toMaximized); 1568 // addIfMatches(temp.set(maximized), maximized, replacementParsed, tagParsed, toAdd, toMaximized); 1569 // } 1570 // } 1571 // } 1572 // if (toAdd.size() == 0) { 1573 // break; 1574 // } 1575 // toMaximized.putAll(toAdd); 1576 // } 1577 // } 1578 1579 // private static void addIfMatches(LanguageTagParser locale, String maximized, LanguageTagParser tagParsed, 1580 // LanguageTagParser replacementParsed, Map<String, String> toAdd, Map<String, String> toMaximized) { 1581 // if (!tagParsed.getLanguage().equals(locale.getLanguage()) && !tagParsed.getLanguage().equals("und")) { 1582 // return; 1583 // } 1584 // if (!tagParsed.getScript().equals(locale.getScript()) && !tagParsed.getScript().equals("")) { 1585 // return; 1586 // } 1587 // if (!tagParsed.getRegion().equals(locale.getRegion()) && !tagParsed.getRegion().equals("")) { 1588 // return; 1589 // } 1590 // if (!replacementParsed.getLanguage().equals("und")) { 1591 // locale.setLanguage(replacementParsed.getLanguage()); 1592 // } 1593 // if (!replacementParsed.getScript().equals("")) { 1594 // locale.setScript(replacementParsed.getScript()); 1595 // } 1596 // if (!replacementParsed.getRegion().equals("")) { 1597 // locale.setRegion(replacementParsed.getRegion()); 1598 // } 1599 // addIfNotIn(locale.toString(), maximized, toAdd, toMaximized,"Deprecated"); 1600 // } 1601 1602 // private static int getSubtagPosition(String locale, String subtags) { 1603 // int pos = -1; 1604 // while (true) { 1605 // pos = locale.indexOf(subtags, pos + 1); 1606 // if (pos < 0) return -1; 1607 // // make sure boundaries are ok 1608 // if (pos != 0) { 1609 // char charBefore = locale.charAt(pos-1); 1610 // if (charBefore != '_' && charBefore != '_') return -1; 1611 // } 1612 // int limit = pos + subtags.length(); 1613 // if (limit != locale.length()) { 1614 // char charAfter = locale.charAt(limit); 1615 // if (charAfter != '_' && charAfter != '_') return -1; 1616 // } 1617 // return pos; 1618 // } 1619 // } 1620 1621 /* 1622 * Format 1623 * const DefaultSubtags default_subtags[] = { 1624 * { 1625 * // Afar => Afar (Latin, Ethiopia) 1626 * "aa", 1627 * "aa_Latn_ET" 1628 * },{ 1629 * // Afrikaans => Afrikaans (Latin, South Africa) 1630 * "af", 1631 * "af_Latn_ZA" 1632 * },{ 1633 */ 1634 printLikelySubtags(Map<String, String> fluffup)1635 private static void printLikelySubtags(Map<String, String> fluffup) throws IOException { 1636 1637 PrintWriter out = FileUtilities.openUTF8Writer(CLDRPaths.GEN_DIRECTORY, 1638 "/supplemental/likelySubtags" + (OUTPUT_STYLE == OutputStyle.XML ? ".xml" : ".txt")); 1639 String spacing = OUTPUT_STYLE == OutputStyle.PLAINTEXT ? "\t" : " "; 1640 String header = OUTPUT_STYLE != OutputStyle.XML ? "const MapToMaximalSubtags default_subtags[] = {" 1641 : "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + CldrUtility.LINE_SEPARATOR 1642 + "<!DOCTYPE supplementalData SYSTEM \"../../common/dtd/ldmlSupplemental.dtd\">" 1643 + CldrUtility.LINE_SEPARATOR 1644 + "<!--" 1645 + CldrUtility.LINE_SEPARATOR 1646 + CldrUtility.getCopyrightString() 1647 + CldrUtility.LINE_SEPARATOR 1648 + "-->" 1649 + CldrUtility.LINE_SEPARATOR 1650 + "<!--" 1651 + CldrUtility.LINE_SEPARATOR 1652 + "Likely subtags data is generated programatically from CLDR's language/territory/population" + CldrUtility.LINE_SEPARATOR 1653 + "data using the GenerateMaximalLocales tool. Under normal circumstances, this file should" + CldrUtility.LINE_SEPARATOR 1654 + "not be patched by hand, as any changes made in that fashion may be lost." 1655 + CldrUtility.LINE_SEPARATOR 1656 + "-->" 1657 + CldrUtility.LINE_SEPARATOR 1658 + "<supplementalData>" + CldrUtility.LINE_SEPARATOR 1659 + " <version number=\"$" + 1660 "Revision$\"/>" + CldrUtility.LINE_SEPARATOR 1661 + " <likelySubtags>"; 1662 String footer = OUTPUT_STYLE != OutputStyle.XML ? SEPARATOR + "};" 1663 : " </likelySubtags>" + CldrUtility.LINE_SEPARATOR 1664 + "</supplementalData>"; 1665 out.println(header); 1666 boolean first = true; 1667 Set<String> keys = new TreeSet<>(new LocaleStringComparator()); 1668 keys.addAll(fluffup.keySet()); 1669 for (String printingLocale : keys) { 1670 String printingTarget = fluffup.get(printingLocale); 1671 String comment = printingName(printingLocale, spacing) + spacing + "=>" + spacing 1672 + printingName(printingTarget, spacing); 1673 1674 if (OUTPUT_STYLE == OutputStyle.XML) { 1675 out.println("\t\t<likelySubtag from=\"" + printingLocale + 1676 "\" to=\"" + printingTarget + "\"" + 1677 "/>" + CldrUtility.LINE_SEPARATOR + "\t\t" + "<!--" + comment + "-->"); 1678 } else { 1679 if (first) { 1680 first = false; 1681 } else { 1682 out.print(","); 1683 } 1684 if (comment.length() > 70 && SEPARATOR.equals(CldrUtility.LINE_SEPARATOR)) { 1685 comment = printingName(printingLocale, spacing) + SEPARATOR + " // " + spacing + "=>" + spacing 1686 + printingName(printingTarget, spacing); 1687 } 1688 out.print( 1689 " {" 1690 + SEPARATOR + " // " + comment 1691 + SEPARATOR + " \"" + printingLocale + "\"," 1692 + SEPARATOR + " \"" + printingTarget + "\"" 1693 + CldrUtility.LINE_SEPARATOR + " }"); 1694 } 1695 } 1696 out.println(footer); 1697 out.close(); 1698 } 1699 printingName(String locale, String spacing)1700 public static String printingName(String locale, String spacing) { 1701 if (locale == null) { 1702 return null; 1703 } 1704 LanguageTagParser parser = new LanguageTagParser().set(locale); 1705 String lang = parser.getLanguage(); 1706 String script = parser.getScript(); 1707 String region = parser.getRegion(); 1708 return "{" + spacing + 1709 (lang.equals("und") ? "?" : english.getName(CLDRFile.LANGUAGE_NAME, lang)) + ";" + spacing + 1710 (script == null || script.equals("") ? "?" : english.getName(CLDRFile.SCRIPT_NAME, script)) + ";" + spacing 1711 + 1712 (region == null || region.equals("") ? "?" : english.getName(CLDRFile.TERRITORY_NAME, region)) + spacing 1713 + "}"; 1714 } 1715 1716 private static final String[][] ALT_REVERSAL = { 1717 //{ "no", "nb" }, 1718 //{ "nb", "no" }, 1719 { "he", "iw" }, 1720 { "iw", "he" }, 1721 }; 1722 toAlt(String locale, boolean change)1723 public static String toAlt(String locale, boolean change) { 1724 if (!change || locale == null) { 1725 return locale; 1726 } 1727 String firstTag = getFirstTag(locale); 1728 for (String[] pair : ALT_REVERSAL) { 1729 if (firstTag.equals(pair[0])) { 1730 locale = pair[1] + locale.substring(pair[1].length()); 1731 break; 1732 } 1733 } 1734 locale = locale.replace("_", "-"); 1735 return locale; 1736 } 1737 getFirstTag(String locale)1738 private static String getFirstTag(String locale) { 1739 int pos = locale.indexOf('_'); 1740 return pos < 0 ? locale : locale.substring(0, pos); 1741 } 1742 1743 // private static Map<String, String> getBackMapping(Map<String, String> fluffup) { 1744 // Relation<String,String> backMap = new Relation(new TreeMap(), TreeSet.class, BEST_LANGUAGE_COMPARATOR); 1745 // for (String source : fluffup.keySet()) { 1746 // if (source.startsWith("und")) { 1747 // continue; 1748 // } 1749 // String maximized = fluffup.get(source); 1750 // backMap.put(maximized, source); // put in right order 1751 // } 1752 // Map<String,String> returnBackMap = new TreeMap(); 1753 // for (String maximized : backMap.keySet()) { 1754 // final Set<String> all = backMap.getAll(maximized); 1755 // final String minimized = all.iterator().next(); 1756 // returnBackMap.put(maximized, minimized); 1757 // } 1758 // return returnBackMap; 1759 // } 1760 1761 /** 1762 * Language tags are presumed to share the first language, except possibly "und". Best is least 1763 */ 1764 // private static Comparator BEST_LANGUAGE_COMPARATOR = new Comparator<String>() { 1765 // LanguageTagParser p1 = new LanguageTagParser(); 1766 // LanguageTagParser p2 = new LanguageTagParser(); 1767 // public int compare(String o1, String o2) { 1768 // if (o1.equals(o2)) return 0; 1769 // p1.set(o1); 1770 // p2.set(o2); 1771 // String lang1 = p1.getLanguage(); 1772 // String lang2 = p2.getLanguage(); 1773 // 1774 // // compare languages first 1775 // // put und at the end 1776 // int result = lang1.compareTo(lang2); 1777 // if (result != 0) { 1778 // if (lang1.equals("und")) return 1; 1779 // if (lang2.equals("und")) return -1; 1780 // return result; 1781 // } 1782 // 1783 // // now scripts and regions. 1784 // // if they have different numbers of fields, the shorter wins. 1785 // // If there are two fields, region is lowest. 1786 // // The simplest way is to just compare scripts first 1787 // // so zh-TW < zh-Hant, because we first compare "" to Hant 1788 // String script1 = p1.getScript(); 1789 // String script2 = p2.getScript(); 1790 // int scriptOrder = script1.compareTo(script2); 1791 // if (scriptOrder != 0) return scriptOrder; 1792 // 1793 // String region1 = p1.getRegion(); 1794 // String region2 = p2.getRegion(); 1795 // int regionOrder = region1.compareTo(region2); 1796 // if (regionOrder != 0) return regionOrder; 1797 // 1798 // return o1.compareTo(o2); 1799 // } 1800 // 1801 // }; 1802 minimize(Map<String, String> fluffup)1803 public static void minimize(Map<String, String> fluffup) { 1804 LanguageTagParser parser = new LanguageTagParser(); 1805 LanguageTagParser targetParser = new LanguageTagParser(); 1806 Set<String> removals = new TreeSet<>(); 1807 while (true) { 1808 removals.clear(); 1809 for (String locale : fluffup.keySet()) { 1810 String target = fluffup.get(locale); 1811 if (targetParser.set(target).getRegion().equals(UNKNOWN_REGION)) { 1812 removals.add(locale); 1813 if (SHOW_ADD) 1814 System.out.println("Removing:\t" + getName(locale) + "\t=>\t" + getName(target) 1815 + "\t\t - Unknown Region in target"); 1816 continue; 1817 } 1818 if (targetParser.getScript().equals(UNKNOWN_SCRIPT)) { 1819 removals.add(locale); 1820 if (SHOW_ADD) 1821 System.out.println("Removing:\t" + getName(locale) + "\t=>\t" + getName(target) 1822 + "\t\t - Unknown Script in target"); 1823 continue; 1824 } 1825 1826 String region = parser.set(locale).getRegion(); 1827 if (region.length() != 0) { 1828 if (region.equals(UNKNOWN_REGION)) { 1829 removals.add(locale); 1830 if (SHOW_ADD) 1831 System.out.println("Removing:\t" + getName(locale) + "\t=>\t" + getName(target) 1832 + "\t\t - Unknown Region in source"); 1833 continue; 1834 } 1835 parser.setRegion(""); 1836 String newLocale = parser.toString(); 1837 String newTarget = fluffup.get(newLocale); 1838 if (newTarget != null) { 1839 newTarget = targetParser.set(newTarget).setRegion(region).toString(); 1840 if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) { 1841 removals.add(locale); 1842 if (SHOW_ADD) 1843 System.out.println("Removing:\t" + locale + "\t=>\t" + target + "\t\tRedundant with " 1844 + newLocale); 1845 continue; 1846 } 1847 } 1848 } 1849 String script = parser.set(locale).getScript(); 1850 if (locale.equals(DEBUG_ADD_KEY)) { 1851 System.out.println("*debug*"); 1852 } 1853 if (script.length() != 0) { 1854 if (script.equals(UNKNOWN_SCRIPT)) { 1855 removals.add(locale); 1856 if (SHOW_ADD) 1857 System.out.println("Removing:\t" + locale + "\t=>\t" + target + "\t\t - Unknown Script"); 1858 continue; 1859 } 1860 parser.setScript(""); 1861 String newLocale = parser.toString(); 1862 String newTarget = fluffup.get(newLocale); 1863 if (newTarget != null) { 1864 newTarget = targetParser.set(newTarget).setScript(script).toString(); 1865 if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) { 1866 removals.add(locale); 1867 if (SHOW_ADD) 1868 System.out.println("Removing:\t" + locale + "\t=>\t" + target + "\t\tRedundant with " 1869 + newLocale); 1870 continue; 1871 } 1872 } 1873 } 1874 } 1875 if (removals.size() == 0) { 1876 break; 1877 } 1878 for (String locale : removals) { 1879 fluffup.remove(locale); 1880 } 1881 } 1882 } 1883 1884 // private static void addLanguageScript(Map<String, String> fluffup, LanguageTagParser parser) { 1885 // // add script 1886 // Map<String, String> temp = new TreeMap<String, String>(); 1887 // while (true) { 1888 // temp.clear(); 1889 // for (String target : new TreeSet<String>(fluffup.values())) { 1890 // parser.set(target); 1891 // final String territory = parser.getRegion(); 1892 // if (territory.length() == 0) { 1893 // continue; 1894 // } 1895 // parser.setRegion(""); 1896 // String possibleSource = parser.toString(); 1897 // if (fluffup.containsKey(possibleSource)) { 1898 // continue; 1899 // } 1900 // String other = temp.get(possibleSource); 1901 // if (other != null) { 1902 // if (!target.equals(other)) { 1903 // System.out.println("**Failure with multiple sources in addLanguageScript: " 1904 // + possibleSource + "\t=>\t" + target + ", " + other); 1905 // } 1906 // continue; 1907 // } 1908 // temp.put(possibleSource, target); 1909 // if (SHOW_ADD) System.out.println("Adding:\t" + possibleSource + "\t=>\t" + target + "\t\tLanguage-Script"); 1910 // } 1911 // if (temp.size() == 0) { 1912 // break; 1913 // } 1914 // fluffup.putAll(temp); 1915 // } 1916 // 1917 // } 1918 1919 // private static void addLanguageCountry(Map<String, String> fluffup, LanguageTagParser parser) { 1920 // // add script 1921 // Map<String, String> temp = new TreeMap<String, String>(); 1922 // while (true) { 1923 // temp.clear(); 1924 // for (String target : new TreeSet<String>(fluffup.values())) { 1925 // parser.set(target); 1926 // String script = parser.getScript(); 1927 // if (script.length() == 0) { 1928 // continue; 1929 // } 1930 // parser.setScript(""); 1931 // String possibleSource = parser.toString(); 1932 // if (fluffup.containsKey(possibleSource)) { 1933 // continue; 1934 // } 1935 // String other = temp.get(possibleSource); 1936 // 1937 // if (other != null) { 1938 // if (!target.equals(other)) { 1939 // script = getScriptForLocale(possibleSource); 1940 // if (script == null) { 1941 // System.out.println("**Failure with multiple sources in addLanguageCountry: " 1942 // + possibleSource + "\t=>\t" + target + ", " + other); 1943 // continue; // error message in routine 1944 // } 1945 // parser.setScript(script); 1946 // target = parser.toString(); 1947 // } 1948 // } 1949 // 1950 // temp.put(possibleSource, target); 1951 // if (SHOW_ADD) System.out.println("Adding:\t" + possibleSource + "\t=>\t" + target + "\t\tLanguageCountry"); 1952 // } 1953 // if (temp.size() == 0) { 1954 // break; 1955 // } 1956 // fluffup.putAll(temp); 1957 // } 1958 // 1959 // } 1960 1961 // private static void addScript(Map<String, String> fluffup, LanguageTagParser parser) { 1962 // // add script 1963 // Map<String, String> temp = new TreeMap<String, String>(); 1964 // while (true) { 1965 // temp.clear(); 1966 // Set skipTarget = fluffup.keySet(); 1967 // for (String locale : fluffup.keySet()) { 1968 // String target = fluffup.get(locale); 1969 // parser.set(target); 1970 // if (parser.getScript().length() != 0) { 1971 // continue; 1972 // } 1973 // String script = getScriptForLocale(target); 1974 // 1975 // if (script == null) { 1976 // continue; // error message in routine 1977 // } 1978 // parser.setScript(script); 1979 // String furtherTarget = parser.toString(); 1980 // addIfNotIn(target, furtherTarget, temp, fluffup, "Script"); 1981 // } 1982 // if (temp.size() == 0) { 1983 // break; 1984 // } 1985 // fluffup.putAll(temp); 1986 // } 1987 // } 1988 1989 // private static String getScriptForLocale(String locale) { 1990 // String result = getScriptForLocale2(locale); 1991 // if (result != null) return result; 1992 // int pos = locale.indexOf('_'); 1993 // if (pos >= 0) { 1994 // result = getScriptForLocale2(locale.substring(0,pos)); 1995 // } 1996 // return result; 1997 // } 1998 1999 private static String UNKNOWN_SCRIPT = "Zzzz"; 2000 private static String UNKNOWN_REGION = "ZZ"; 2001 getScriptForLocale2(String locale)2002 private static String getScriptForLocale2(String locale) { 2003 String result = localeToScriptCache.get(locale); 2004 if (result != null) { 2005 return result; 2006 } 2007 if (locale.equals("ky")) { 2008 int debug = 0; 2009 } 2010 try { 2011 Map<Type, BasicLanguageData> data = supplementalData.getBasicLanguageDataMap(locale); 2012 if (data != null) { 2013 for (BasicLanguageData datum : data.values()) { 2014 final Set<String> scripts = datum.getScripts(); 2015 boolean isPrimary = datum.getType() == BasicLanguageData.Type.primary; 2016 if (scripts.size() != 1) { 2017 if (scripts.size() > 1 && isPrimary) { 2018 break; 2019 } 2020 continue; 2021 } 2022 String script = scripts.iterator().next(); 2023 if (isPrimary) { 2024 return result = script; 2025 } else if (result == null) { 2026 result = script; 2027 } 2028 } 2029 if (result != null) { 2030 return result; 2031 } 2032 } 2033 CLDRFile cldrFile; 2034 try { 2035 cldrFile = factory.make(locale, true); 2036 } catch (RuntimeException e) { 2037 result = FALLBACK_SCRIPTS.get(locale); 2038 if (result == null) { 2039 System.out.println("***Failed to find script for: " + locale + "\t" + english.getName(locale)); 2040 return result = UNKNOWN_SCRIPT; 2041 } else { 2042 return result; 2043 } 2044 } 2045 UnicodeSet exemplars = getExemplarSet(cldrFile, ""); 2046 Set<String> CLDRScripts = getScriptsFromUnicodeSet(exemplars); 2047 CLDRScripts.remove(UNKNOWN_SCRIPT); 2048 if (CLDRScripts.size() == 1) { 2049 return result = CLDRScripts.iterator().next(); 2050 } else if (CLDRScripts.size() == 0) { 2051 System.out.println("**Failed to get script for:\t" + locale); 2052 return result = UNKNOWN_SCRIPT; 2053 } else { 2054 System.out.println("**Failed, too many scripts for:\t" + locale + ", " + CLDRScripts); 2055 return result = UNKNOWN_SCRIPT; 2056 } 2057 } finally { 2058 if (result.equals(UNKNOWN_SCRIPT)) { 2059 String temp = LANGUAGE_OVERRIDES.get(locale); 2060 if (temp != null) { 2061 result = new LanguageTagParser().set(temp).getScript(); 2062 System.out.println("Getting script from LANGUAGE_OVERRIDES for " + locale + " => " + result); 2063 } 2064 } 2065 localeToScriptCache.put(locale, result); 2066 if (SHOW_ADD) 2067 System.out.println("Script:\t" + locale + "\t" + english.getName(locale) + "\t=>\t" + result + "\t" 2068 + english.getName(CLDRFile.SCRIPT_NAME, result)); 2069 } 2070 } 2071 2072 // private static Map<String, String> closeMapping(Map<String, String> fluffup) { 2073 // if (SHOW_ADD) System.out.flush(); 2074 // Map<String,String> temp = new TreeMap<String,String>(); 2075 // while (true) { 2076 // temp.clear(); 2077 // for (String locale : fluffup.keySet()) { 2078 // String target = fluffup.get(locale); 2079 // if (target.equals("si_Sinh") || target.equals("zh-Hani")) { 2080 // System.out.println("????"); 2081 // } 2082 // String furtherTarget = fluffup.get(target); 2083 // if (furtherTarget == null) { 2084 // continue; 2085 // } 2086 // addIfNotIn(locale, furtherTarget, temp, null, "Close"); 2087 // } 2088 // if (temp.size() == 0) { 2089 // break; 2090 // } 2091 // fluffup.putAll(temp); 2092 // } 2093 // if (SHOW_ADD) System.out.flush(); 2094 // return temp; 2095 // } 2096 getScriptsFromUnicodeSet(UnicodeSet exemplars)2097 public static Set<String> getScriptsFromUnicodeSet(UnicodeSet exemplars) { 2098 // use bits first, since that's faster 2099 BitSet scriptBits = new BitSet(); 2100 boolean show = false; 2101 for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next();) { 2102 if (show) 2103 System.out.println(Integer.toHexString(it.codepoint)); 2104 if (it.codepoint != UnicodeSetIterator.IS_STRING) { 2105 scriptBits.set(UScript.getScript(it.codepoint)); 2106 } else { 2107 int cp; 2108 for (int i = 0; i < it.string.length(); i += UTF16.getCharCount(cp)) { 2109 scriptBits.set(UScript.getScript(cp = UTF16.charAt(it.string, i))); 2110 } 2111 } 2112 } 2113 scriptBits.clear(UScript.COMMON); 2114 scriptBits.clear(UScript.INHERITED); 2115 Set<String> scripts = new TreeSet<>(); 2116 for (int j = 0; j < scriptBits.size(); ++j) { 2117 if (scriptBits.get(j)) { 2118 scripts.add(UScript.getShortName(j)); 2119 } 2120 } 2121 return scripts; 2122 } 2123 getExemplarSet(CLDRFile cldrfile, String type)2124 public static UnicodeSet getExemplarSet(CLDRFile cldrfile, String type) { 2125 if (type.length() != 0) 2126 type = "[@type=\"" + type + "\"]"; 2127 String v = cldrfile.getStringValue("//ldml/characters/exemplarCharacters" 2128 + type); 2129 if (v == null) 2130 return new UnicodeSet(); 2131 return new UnicodeSet(v); 2132 } 2133 2134 // private static String[][] SpecialCases = { 2135 // { "zh_Hani", "zh_Hans_CN"}, 2136 // { "si_Sinh", "si_Sinh_LK"}, 2137 // { "ii", "ii_CN"}, // Sichuan Yi (Yi) 2138 // { "iu", "iu_CA"}, // Inuktitut (Unified Canadian Aboriginal Syllabics) 2139 // { "und", "en"}, // English default 2140 // }; 2141 showDefaultContentDifferencesAndFix(Set<String> defaultLocaleContent)2142 static void showDefaultContentDifferencesAndFix(Set<String> defaultLocaleContent) { 2143 Set<String> errors = new LinkedHashSet<>(); 2144 Map<String, String> oldDefaultContent = SupplementalDataInfo.makeLocaleToDefaultContents( 2145 ConvertLanguageData.supplementalData.getDefaultContentLocales(), new TreeMap<String, String>(), errors); 2146 if (!errors.isEmpty()) { 2147 System.out.println(Joiner.on("\n").join(errors)); 2148 errors.clear(); 2149 } 2150 Map<String, String> newDefaultContent = SupplementalDataInfo.makeLocaleToDefaultContents(defaultLocaleContent, 2151 new TreeMap<String, String>(), errors); 2152 if (!errors.isEmpty()) { 2153 System.out.println("Default Content errors: " + Joiner.on("\n").join(errors)); 2154 errors.clear(); 2155 } 2156 Set<String> changes = compareMapsAndFixNew("*WARNING* Default Content: ", oldDefaultContent, newDefaultContent, 2157 "ar", "ar_001"); 2158 System.out.println(Joiner.on("\n").join(changes)); 2159 defaultLocaleContent.clear(); 2160 defaultLocaleContent.addAll(newDefaultContent.values()); 2161 newDefaultContent = SupplementalDataInfo.makeLocaleToDefaultContents(defaultLocaleContent, 2162 new TreeMap<String, String>(), errors); 2163 if (!errors.isEmpty()) { 2164 System.out.println("***New Errors: " + Joiner.on("\n").join(errors)); 2165 } 2166 } 2167 compareMapsAndFixNew(String title, Map<String, String> oldContent, Map<String, String> newContent, String... allowedOverrideValues)2168 private static Set<String> compareMapsAndFixNew(String title, 2169 Map<String, String> oldContent, 2170 Map<String, String> newContent, String... allowedOverrideValues) { 2171 Map<String, String> allowedOverrideValuesTest = new HashMap<>(); 2172 for (int i = 0; i < allowedOverrideValues.length; i += 2) { 2173 allowedOverrideValuesTest.put(allowedOverrideValues[i], allowedOverrideValues[i + 1]); 2174 } 2175 Set<String> changes = new TreeSet<>(); 2176 for (String parent : Builder.with(new TreeSet<String>()).addAll(newContent.keySet()) 2177 .addAll(oldContent.keySet()).get()) { 2178 String oldValue = oldContent.get(parent); 2179 String newValue = newContent.get(parent); 2180 String overrideValue = allowedOverrideValuesTest.get(parent); 2181 if (overrideValue != null) { 2182 newContent.put(parent, overrideValue); 2183 newValue = overrideValue; 2184 } 2185 if (CldrUtility.equals(oldValue, newValue)) { 2186 continue; 2187 } 2188 String message; 2189 if (oldValue == null) { 2190 message = "Adding " + ConvertLanguageData.getLanguageCodeAndName(parent) + " => " 2191 + ConvertLanguageData.getLanguageCodeAndName(newValue); 2192 newContent.put(parent, newValue); 2193 } else if (newValue == null) { 2194 if (SUPPRESS_CHANGES) { 2195 message = "Suppressing removal of " 2196 + ConvertLanguageData.getLanguageCodeAndName(parent) + " => " 2197 + ConvertLanguageData.getLanguageCodeAndName(oldValue); 2198 newContent.put(parent, oldValue); 2199 } else { 2200 message = "Removing " 2201 + ConvertLanguageData.getLanguageCodeAndName(parent) + " => " 2202 + ConvertLanguageData.getLanguageCodeAndName(oldValue); 2203 newContent.remove(oldValue); 2204 } 2205 } else { 2206 if (SUPPRESS_CHANGES) { 2207 message = "Suppressing change of " 2208 + ConvertLanguageData.getLanguageCodeAndName(parent) + " => " 2209 + ConvertLanguageData.getLanguageCodeAndName(oldValue) + " to " 2210 + ConvertLanguageData.getLanguageCodeAndName(newValue); 2211 newContent.remove(newValue); 2212 newContent.put(parent, oldValue); 2213 } else { 2214 message = "Changing " 2215 + ConvertLanguageData.getLanguageCodeAndName(parent) + " => " 2216 + ConvertLanguageData.getLanguageCodeAndName(oldValue) + " to " 2217 + ConvertLanguageData.getLanguageCodeAndName(newValue); 2218 newContent.remove(oldValue); 2219 newContent.put(parent, newValue); 2220 } 2221 } 2222 changes.add(title + message); 2223 } 2224 return changes; 2225 } 2226 2227 public static class LocaleStringComparator implements Comparator<String> { 2228 LanguageTagParser ltp0 = new LanguageTagParser(); 2229 LanguageTagParser ltp1 = new LanguageTagParser(); 2230 2231 @Override compare(String arg0, String arg1)2232 public int compare(String arg0, String arg1) { 2233 ltp0.set(arg0); 2234 ltp1.set(arg1); 2235 String s0 = ltp0.getLanguage(); 2236 String s1 = ltp1.getLanguage(); 2237 int result = s0.compareTo(s1); 2238 if (result != 0) { 2239 return s0.equals("und") ? 1 2240 : s1.equals("und") ? -1 2241 : result; 2242 } 2243 s0 = ltp0.getScript(); 2244 s1 = ltp1.getScript(); 2245 result = s0.compareTo(s1); 2246 if (result != 0) { 2247 return result; 2248 } 2249 s0 = ltp0.getRegion(); 2250 s1 = ltp1.getRegion(); 2251 result = s0.compareTo(s1); 2252 if (result != 0) { 2253 return result; 2254 } 2255 return arg0.compareTo(arg1); // just in case 2256 } 2257 2258 } 2259 } 2260