1 package org.unicode.cldr.unittest; 2 3 import java.io.IOException; 4 import java.util.ArrayList; 5 import java.util.Arrays; 6 import java.util.Collections; 7 import java.util.HashMap; 8 import java.util.HashSet; 9 import java.util.LinkedHashSet; 10 import java.util.List; 11 import java.util.Map; 12 import java.util.Map.Entry; 13 import java.util.Set; 14 import java.util.TreeMap; 15 import java.util.TreeSet; 16 import java.util.regex.Matcher; 17 18 import org.unicode.cldr.draft.ScriptMetadata; 19 import org.unicode.cldr.draft.ScriptMetadata.Info; 20 import org.unicode.cldr.tool.GenerateMaximalLocales; 21 import org.unicode.cldr.tool.LikelySubtags; 22 import org.unicode.cldr.util.Builder; 23 import org.unicode.cldr.util.CLDRConfig; 24 import org.unicode.cldr.util.CLDRFile; 25 import org.unicode.cldr.util.CLDRLocale; 26 import org.unicode.cldr.util.ChainedMap; 27 import org.unicode.cldr.util.ChainedMap.M3; 28 import org.unicode.cldr.util.CldrUtility; 29 import org.unicode.cldr.util.LanguageTagParser; 30 import org.unicode.cldr.util.LocaleIDParser; 31 import org.unicode.cldr.util.PatternCache; 32 import org.unicode.cldr.util.StandardCodes; 33 import org.unicode.cldr.util.SupplementalDataInfo; 34 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData; 35 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData.Type; 36 import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus; 37 import org.unicode.cldr.util.SupplementalDataInfo.PopulationData; 38 import org.unicode.cldr.util.XPathParts; 39 40 import com.ibm.icu.dev.test.TestFmwk; 41 import com.ibm.icu.dev.util.CollectionUtilities; 42 import com.ibm.icu.impl.Relation; 43 import com.ibm.icu.impl.Row.R2; 44 45 public class TestInheritance extends TestFmwk { 46 47 static CLDRConfig testInfo = CLDRConfig.getInstance(); 48 49 private static boolean DEBUG = CldrUtility.getProperty("DEBUG", false); 50 51 private static Matcher pathMatcher = PatternCache.get( 52 CldrUtility.getProperty("XPATH", ".*")).matcher(""); 53 main(String[] args)54 public static void main(String[] args) throws IOException { 55 new TestInheritance().run(args); 56 } 57 58 private static final SupplementalDataInfo dataInfo = SupplementalDataInfo 59 .getInstance(); 60 private static final Set<String> defaultContents = dataInfo 61 .getDefaultContentLocales(); 62 63 private static final boolean EXPECT_EQUALITY = false; 64 65 private static Set<String> availableLocales = testInfo.getFullCldrFactory().getAvailable(); 66 TestLocalesHaveOfficial()67 public void TestLocalesHaveOfficial() { 68 // If we have a language, we have all the region locales where the 69 // language is official 70 Set<String> SKIP_TERRITORIES = new HashSet<String>(Arrays.asList("001", 71 "150")); 72 for (Entry<String, R2<List<String>, String>> s : dataInfo 73 .getLocaleAliasInfo().get("territory").entrySet()) { 74 SKIP_TERRITORIES.add(s.getKey()); 75 } 76 77 LanguageTagParser ltp = new LanguageTagParser(); 78 79 Relation<String, String> languageLocalesSeen = Relation.of( 80 new TreeMap<String, Set<String>>(), TreeSet.class); 81 82 Set<String> testOrg = testInfo.getStandardCodes() 83 .getLocaleCoverageLocales("google"); 84 ChainedMap.M4<String, OfficialStatus, String, Boolean> languageToOfficialChildren = ChainedMap 85 .of(new TreeMap<String, Object>(), 86 new TreeMap<OfficialStatus, Object>(), 87 new TreeMap<String, Object>(), Boolean.class); 88 89 // gather the data 90 91 for (String language : dataInfo 92 .getLanguagesForTerritoriesPopulationData()) { 93 for (String territory : dataInfo 94 .getTerritoriesForPopulationData(language)) { 95 if (SKIP_TERRITORIES.contains(territory)) { 96 continue; 97 } 98 PopulationData data = dataInfo 99 .getLanguageAndTerritoryPopulationData(language, 100 territory); 101 OfficialStatus status = data.getOfficialStatus(); 102 if (data.getOfficialStatus() != OfficialStatus.unknown) { 103 String locale = removeScript(language + "_" + territory); 104 String lang = removeScript(ltp.set(locale).getLanguage()); 105 languageToOfficialChildren.put(lang, status, locale, 106 Boolean.TRUE); 107 languageLocalesSeen.put(lang, locale); 108 } 109 } 110 } 111 112 // flesh it out by adding 'clean' codes. 113 // also get the child locales in cldr. 114 115 Relation<String, String> languageToChildren = Relation.of( 116 new TreeMap<String, Set<String>>(), TreeSet.class); 117 for (String locale : testInfo.getCldrFactory().getAvailable()) { 118 String lang = ltp.set(locale).getLanguage(); 119 if (SKIP_TERRITORIES.contains(ltp.getRegion())) { 120 continue; 121 } 122 lang = removeScript(lang); 123 locale = removeScript(locale); 124 125 if (!lang.equals(locale)) { 126 languageToChildren.put(lang, locale); 127 Set<String> localesSeen = languageLocalesSeen.get(lang); 128 if (localesSeen == null || !localesSeen.contains(locale)) { 129 languageToOfficialChildren.put(lang, 130 OfficialStatus.unknown, locale, Boolean.TRUE); 131 } 132 } 133 } 134 135 for (Entry<String, Set<String>> languageAndChildren : languageToChildren 136 .keyValuesSet()) { 137 String language = languageAndChildren.getKey(); 138 Set<String> children = languageAndChildren.getValue(); 139 M3<OfficialStatus, String, Boolean> officalStatusToChildren = languageToOfficialChildren 140 .get(language); 141 for (Entry<OfficialStatus, Map<String, Boolean>> entry : officalStatusToChildren) { 142 OfficialStatus status = entry.getKey(); 143 if (status != OfficialStatus.official 144 && status != OfficialStatus.de_facto_official) { 145 continue; 146 } 147 Set<String> officalChildren = entry.getValue().keySet(); 148 if (!children.containsAll(officalChildren)) { 149 Set<String> missing = new TreeSet<String>(officalChildren); 150 missing.removeAll(children); 151 String message = "Missing CLDR locales for " + status 152 + " languages: " + missing; 153 errln(message); 154 } else { 155 logln("CLDR locales " + children + " cover " + status 156 + " locales " + officalChildren); 157 } 158 159 } 160 } 161 162 if (DEBUG) { 163 Set<String> languages = new TreeSet<String>( 164 languageToChildren.keySet()); 165 languages.addAll(languageToOfficialChildren.keySet()); 166 System.out.print("\ncode\tlanguage"); 167 for (OfficialStatus status : OfficialStatus.values()) { 168 System.out.print("\tNo\t" + status); 169 } 170 System.out.println(); 171 for (String language : languages) { 172 if (!testOrg.contains(language)) { 173 continue; 174 } 175 System.out.print(language + "\t" 176 + testInfo.getEnglish().getName(language)); 177 178 M3<OfficialStatus, String, Boolean> officialChildren = languageToOfficialChildren 179 .get(language); 180 for (OfficialStatus status : OfficialStatus.values()) { 181 Map<String, Boolean> children = officialChildren 182 .get(status); 183 if (children == null) { 184 System.out.print("\t" + 0 + "\t"); 185 } else { 186 System.out.print("\t" + children.size() + "\t" 187 + show(children.keySet(), false)); 188 } 189 } 190 System.out.println(); 191 } 192 } 193 } 194 show(Set<String> joint, boolean showStatus)195 private String show(Set<String> joint, boolean showStatus) { 196 StringBuffer b = new StringBuffer(); 197 for (String s : joint) { 198 if (b.length() != 0) { 199 b.append(", "); 200 } 201 LanguageTagParser ltp = new LanguageTagParser().set(s); 202 String script = ltp.getScript(); 203 if (script.length() != 0) { 204 b.append(testInfo.getEnglish().getName(CLDRFile.SCRIPT_NAME, 205 script)); 206 } 207 String region = ltp.getRegion(); 208 if (region.length() != 0) { 209 if (script.length() != 0) { 210 b.append("-"); 211 } 212 b.append(testInfo.getEnglish().getName(CLDRFile.TERRITORY_NAME, 213 region)); 214 } 215 b.append(" [").append(s); 216 if (showStatus) { 217 PopulationData data = dataInfo 218 .getLanguageAndTerritoryPopulationData( 219 ltp.getLanguage(), region); 220 if (data == null) { 221 data = dataInfo.getLanguageAndTerritoryPopulationData( 222 ltp.getLanguageScript(), region); 223 } 224 b.append("; "); 225 b.append(data == null ? "?" : data.getOfficialStatus()); 226 } 227 b.append("]"); 228 229 } 230 return b.toString(); 231 } 232 removeScript(String lang)233 private String removeScript(String lang) { 234 if (!lang.contains("_")) { 235 return lang; 236 } 237 LanguageTagParser ltp = new LanguageTagParser().set(lang); 238 // String ls = ltp.getLanguageScript(); 239 // if (defaultContents.contains(ls)) { 240 ltp.setScript(""); 241 // } 242 return ltp.toString(); 243 } 244 TestLikelyAndDefaultConsistency()245 public void TestLikelyAndDefaultConsistency() { 246 LikelySubtags likelySubtags = new LikelySubtags(); 247 LanguageTagParser ltp = new LanguageTagParser(); 248 // find multiscript locales 249 Relation<String, String> base2scripts = Relation.of( 250 new TreeMap<String, Set<String>>(), TreeSet.class); 251 Map<String, String> parent2default = new TreeMap<String, String>(); 252 Map<String, String> default2parent = new TreeMap<String, String>(); 253 Relation<String, String> base2locales = Relation.of( 254 new TreeMap<String, Set<String>>(), TreeSet.class); 255 256 Set<String> knownMultiScriptLanguages = new HashSet<String>(Arrays.asList("bm", "ha")); 257 // get multiscript locales 258 for (String localeID : availableLocales) { 259 String script = ltp.set(localeID).getScript(); 260 final String base = ltp.getLanguage(); 261 if (!availableLocales.contains(base)) { 262 errln("Missing base locale for: " + localeID); 263 } 264 base2locales.put(base, localeID); 265 if (!script.isEmpty() && !base.equals("en")) { // HACK for en 266 base2scripts.put(base, script); 267 } 268 if (script.isEmpty() && knownMultiScriptLanguages.contains(base)) { 269 base2scripts.put(base, dataInfo.getDefaultScript(base)); 270 } 271 } 272 273 // get default contents 274 for (String localeID : defaultContents) { 275 checkLocale(localeID, false); 276 String simpleParent = LocaleIDParser.getSimpleParent(localeID); 277 parent2default.put(simpleParent, localeID); 278 default2parent.put(localeID, simpleParent); 279 // if (!available.contains(simpleParent)) { 280 // // verify that base language has locale in CLDR (we don't want 281 // others) 282 // errln("Default contents contains locale not in CLDR:\t" + 283 // simpleParent); 284 // } 285 } 286 287 // get likely 288 Map<String, String> likely2Maximized = likelySubtags.getToMaximized(); 289 for (Entry<String, String> likelyAndMaximized : likely2Maximized 290 .entrySet()) { 291 checkLocale(likelyAndMaximized.getKey(), true); 292 checkLocale(likelyAndMaximized.getValue(), true); 293 } 294 Map<String, String> exceptionDcLikely = new HashMap<String, String>(); 295 Map<String, String> exceptionLikelyDc = new HashMap<String, String>(); 296 for (String[] s : new String[][] { { "ar_001", "ar_Arab_EG" }, }) { 297 exceptionDcLikely.put(s[0], s[1]); 298 exceptionLikelyDc.put(s[1], s[0]); 299 } 300 301 verifyDefaultContentsImplicationsForLikelySubtags(ltp, parent2default, 302 likely2Maximized, exceptionDcLikely); 303 304 verifyLikelySubtagsImplicationsForDefaultContents(ltp, base2scripts, 305 parent2default, likely2Maximized, exceptionLikelyDc); 306 307 verifyScriptsWithDefaultContents(ltp, base2scripts, parent2default, 308 base2locales); 309 } 310 TestParentLocaleRelationships()311 public void TestParentLocaleRelationships() { 312 // Testing invariant relationships between locales - See 313 // http://unicode.org/cldr/trac/ticket/5758 314 Matcher langScript = PatternCache.get("^[a-z]{2,3}_[A-Z][a-z]{3}$") 315 .matcher(""); 316 for (String loc : availableLocales) { 317 if (langScript.reset(loc).matches()) { 318 String expectedParent = loc.split("_")[0]; 319 if (!defaultContents.contains(loc)) { 320 expectedParent = "root"; 321 } 322 String actualParent = dataInfo.getExplicitParentLocale(loc); 323 if (actualParent == null) { 324 actualParent = loc.split("_")[0]; 325 } 326 if (!actualParent.equals(expectedParent)) { 327 errln("Unexpected parent locale for locale " + loc 328 + ". Expected: " + expectedParent + " Got: " 329 + actualParent); 330 } 331 332 if (dataInfo.getExplicitParentLocale(loc) != null 333 && defaultContents.contains(loc)) { 334 errln("Locale " 335 + loc 336 + " can't have an explicit parent AND be a default content locale"); 337 } 338 } 339 } 340 } 341 TestParentLocaleInvariants()342 public void TestParentLocaleInvariants() { 343 // Testing invariant relationships in parent locales - See 344 // http://unicode.org/cldr/trac/ticket/7887 345 LocaleIDParser lp = new LocaleIDParser(); 346 for (String loc : availableLocales) { 347 String parentLocale = dataInfo.getExplicitParentLocale(loc); 348 if (parentLocale != null) { 349 if (!"root".equals(parentLocale) 350 && !lp.set(loc).getLanguage() 351 .equals(lp.set(parentLocale).getLanguage())) { 352 errln("Parent locale [" + parentLocale + "] for locale [" 353 + loc + "] cannot be a different language code."); 354 } 355 if (!"root".equals(parentLocale) 356 && !lp.set(loc).getScript() 357 .equals(lp.set(parentLocale).getScript())) { 358 errln("Parent locale [" + parentLocale + "] for locale [" 359 + loc + "] cannot be a different script code."); 360 } 361 lp.set(loc); 362 if (lp.getScript().length() == 0 && lp.getRegion().length() == 0) { 363 errln("Base language locale [" + loc + "] cannot have an explicit parent."); 364 } 365 366 } 367 } 368 } 369 TestParentLocalesForCycles()370 public void TestParentLocalesForCycles() { 371 // Testing for cyclic relationships in parent locales - See 372 // http://unicode.org/cldr/trac/ticket/7887 373 for (String loc : availableLocales) { 374 String currentLoc = loc; 375 boolean foundError = false; 376 List<String> inheritanceChain = new ArrayList<String>(Arrays.asList(loc)); 377 while (currentLoc != null && !foundError) { 378 currentLoc = LocaleIDParser.getParent(currentLoc); 379 if (inheritanceChain.contains(currentLoc)) { 380 foundError = true; 381 inheritanceChain.add(currentLoc); 382 errln("Inheritance chain for locale [" + loc + "] contains a cyclic relationship. " + inheritanceChain.toString()); 383 } 384 inheritanceChain.add(currentLoc); 385 } 386 } 387 } 388 verifyScriptsWithDefaultContents(LanguageTagParser ltp, Relation<String, String> base2scripts, Map<String, String> parent2default, Relation<String, String> base2locales)389 private void verifyScriptsWithDefaultContents(LanguageTagParser ltp, 390 Relation<String, String> base2scripts, 391 Map<String, String> parent2default, 392 Relation<String, String> base2locales) { 393 Set<String> skip = Builder.with(new HashSet<String>()) 394 .addAll("root", "und") 395 .freeze(); 396 Set<String> languagesWithOneOrLessLocaleScriptInCommon = new HashSet<String>(Arrays.asList("bm", "ha", "ms", "iu", "mn")); 397 // for each base we have to have, 398 // if multiscript, we have default contents for base+script, 399 // base+script+region; 400 // otherwise base+region. 401 for (String base : base2locales.keySet()) { 402 if (skip.contains(base)) { 403 continue; 404 } 405 String defaultContent = parent2default.get(base); 406 // Set<String> likely = base2likely.get(base); 407 // if (likely == null) { 408 // errln("Missing likely subtags for: " + base + " " + 409 // suggestLikelySubtagFor(base)); 410 // } 411 if (defaultContent == null) { 412 errln("Missing default content for: " + base + " " 413 + suggestLikelySubtagFor(base)); 414 continue; 415 } 416 Set<String> scripts = base2scripts.get(base); 417 ltp.set(defaultContent); 418 String script = ltp.getScript(); 419 String region = ltp.getRegion(); 420 if (scripts == null || languagesWithOneOrLessLocaleScriptInCommon.contains(base)) { 421 if (!script.isEmpty()) { 422 errln("Script should be empty in default content for: " 423 + base + "," + defaultContent); 424 } 425 if (region.isEmpty()) { 426 errln("Region must not be empty in default content for: " 427 + base + "," + defaultContent); 428 } 429 } else { 430 if (script.isEmpty()) { 431 errln("Script should not be empty in default content for: " 432 + base + "," + defaultContent); 433 } 434 if (!region.isEmpty()) { 435 errln("Region should be empty in default content for: " 436 + base + "," + defaultContent); 437 } 438 String defaultContent2 = parent2default.get(defaultContent); 439 if (defaultContent2 == null) { 440 errln("Missing default content for: " + defaultContent); 441 continue; 442 } 443 ltp.set(defaultContent2); 444 region = ltp.getRegion(); 445 if (region.isEmpty()) { 446 errln("Region must not be empty in default content for: " 447 + base + "," + defaultContent); 448 } 449 } 450 } 451 } 452 verifyLikelySubtagsImplicationsForDefaultContents( LanguageTagParser ltp, Relation<String, String> base2scripts, Map<String, String> parent2default, Map<String, String> likely2Maximized, Map<String, String> exceptionLikelyDc)453 private void verifyLikelySubtagsImplicationsForDefaultContents( 454 LanguageTagParser ltp, Relation<String, String> base2scripts, 455 Map<String, String> parent2default, 456 Map<String, String> likely2Maximized, 457 Map<String, String> exceptionLikelyDc) { 458 // Now check invariants for all LikelySubtags implications for Default 459 // Contents 460 // a) suppose likely max for la_Scrp => la_Scrp_RG 461 // Then default contents la_Scrp => la_Scrp_RG 462 // b) suppose likely max for la_RG => la_Scrp_RG 463 // Then we can draw no conclusions // was default contents la_Scrp => 464 // la_Scrp_RG 465 // c) suppose likely max for la => la_Scrp_RG 466 // Then default contents la => la_Scrp && la_Scrp => la_Scrp_RG 467 // or default contents la => la_RG && ! la_Scrp => la_Scrp_RG 468 469 TreeSet<String> additionalDefaultContents = new TreeSet<String>(); 470 471 for (Entry<String, String> entry : likely2Maximized.entrySet()) { 472 String source = entry.getKey(); 473 String likelyMax = entry.getValue(); 474 String sourceLang = ltp.set(source).getLanguage(); 475 if (sourceLang.equals("und") || source.equals("zh_Hani") 476 || source.equals("tl")) { 477 continue; 478 } 479 String sourceScript = ltp.getScript(); 480 String sourceRegion = ltp.getRegion(); 481 482 String likelyMaxLang = ltp.set(likelyMax).getLanguage(); 483 String likelyMaxScript = ltp.getScript(); 484 String likelyMaxRegion = ltp.getRegion(); 485 486 String dc = parent2default.get(source); 487 String possibleException = exceptionLikelyDc.get(likelyMax); 488 if (possibleException != null && possibleException.equals(dc)) { 489 continue; 490 } 491 String likelyLangScript = likelyMaxLang + "_" + likelyMaxScript; 492 String dcFromLangScript = parent2default.get(likelyLangScript); 493 494 boolean consistent = true; 495 String caseNumber = null; 496 if (consistent) { 497 if (!sourceScript.isEmpty()) { 498 caseNumber = "a"; 499 if (dc == null) { 500 if (EXPECT_EQUALITY) { 501 String expected = likelyMax; 502 errln("Default contents null for " + source 503 + ", expected:\t" + expected); 504 additionalDefaultContents.add(expected); 505 } 506 continue; 507 } 508 consistent = likelyMax.equals(dc); 509 } else if (!sourceRegion.isEmpty()) { // a 510 caseNumber = "b"; 511 // consistent = likelyMax.equals(dcFromLangScript); 512 } else { // c 513 caseNumber = "c"; 514 if (dc == null) { 515 if (EXPECT_EQUALITY) { 516 String expected = base2scripts.get(source) == null ? likelyMaxLang 517 + "_" + likelyMaxRegion 518 : likelyMaxLang + "_" + likelyMaxScript; 519 errln("Default contents null for " + source 520 + ", expected:\t" + expected); 521 additionalDefaultContents.add(expected); 522 } 523 continue; 524 } 525 String dcScript = ltp.set(dc).getScript(); 526 consistent = likelyLangScript.equals(dc) 527 && likelyMax.equals(dcFromLangScript) 528 || dcScript.isEmpty() 529 && !likelyMax.equals(dcFromLangScript); 530 // || dcScript.isEmpty() && dcRegion.equals(likelyMaxRegion) 531 // && dcFromLangScript == null; 532 } 533 } 534 if (!consistent) { 535 errln("default contents inconsistent with likely subtag: (" 536 + caseNumber + ")" + "\n\t" + source + " => (ls) " 537 + likelyMax + "\n\t" + source + " => (dc) " + dc 538 + "\n\t" + likelyLangScript + " => (dc) " 539 + dcFromLangScript); 540 } 541 } 542 if (additionalDefaultContents.size() != 0) { 543 errln("Suggested additions to supplementalMetadata/../defaultContent:\n" 544 + CollectionUtilities.join(additionalDefaultContents, " ")); 545 } 546 } 547 verifyDefaultContentsImplicationsForLikelySubtags( LanguageTagParser ltp, Map<String, String> parent2default, Map<String, String> likely2Maximized, Map<String, String> exceptionDcLikely)548 private void verifyDefaultContentsImplicationsForLikelySubtags( 549 LanguageTagParser ltp, Map<String, String> parent2default, 550 Map<String, String> likely2Maximized, 551 Map<String, String> exceptionDcLikely) { 552 // Now check invariants for all Default Contents implications for 553 // LikelySubtags 554 // a) suppose default contents la => la_Scrp. 555 // Then the likely contents for la => la_Scrp_* 556 // b) suppose default contents la => la_RG. 557 // Then the likely contents for la => la_*_RG 558 // c) suppose default contents la_Scrp => la_Scrp_RG. 559 // Then the likely contents of la_Scrp => la_Scrp_RG OR likely contents 560 // for la => la_*_* 561 for (Entry<String, String> parentAndDefault : parent2default.entrySet()) { 562 String source = parentAndDefault.getKey(); 563 String dc = parentAndDefault.getValue(); 564 String likelyMax = likely2Maximized.get(source); 565 566 // skip special exceptions 567 String possibleException = exceptionDcLikely.get(dc); 568 if (possibleException != null 569 && possibleException.equals(likelyMax)) { 570 continue; 571 } 572 573 String sourceLang = ltp.set(source).getLanguage(); 574 String sourceScript = ltp.getScript(); 575 // there cannot be a sourceRegion 576 577 String dcScript = ltp.set(dc).getScript(); 578 String dcRegion = ltp.getRegion(); 579 580 String likelyMaxLang = "", likelyMaxScript = "", likelyMaxRegion = ""; 581 if (likelyMax != null) { 582 likelyMaxLang = ltp.set(likelyMax).getLanguage(); 583 likelyMaxScript = ltp.getScript(); 584 likelyMaxRegion = ltp.getRegion(); 585 } 586 587 String likelyMax2 = likely2Maximized.get(sourceLang); 588 589 boolean consistent = true; 590 591 if (sourceScript.isEmpty()) { // a or b 592 if (!dcScript.isEmpty()) { // a 593 consistent = likelyMaxLang.equals(source) 594 && likelyMaxScript.equals(dcScript); 595 } else { // b 596 consistent = likelyMaxLang.equals(source) 597 && likelyMaxRegion.equals(dcRegion); 598 } 599 } else { // c 600 consistent = dc.equals(likelyMax) || likelyMax2 != null; 601 } 602 if (!consistent) { 603 errln("likely subtag inconsistent with default contents: " 604 + "\n\t" 605 + source 606 + " =>( dc) " 607 + dc 608 + "\n\t" 609 + source 610 + " => (ls) " 611 + likelyMax 612 + (source.equals(sourceLang) ? "" : "\n\t" + sourceLang 613 + " => (ls) " + likelyMax2)); 614 } 615 } 616 } 617 618 /** 619 * Suggest a likely subtag 620 * 621 * @param base 622 * @return 623 */ suggestLikelySubtagFor(String base)624 static String suggestLikelySubtagFor(String base) { 625 SupplementalDataInfo sdi = SupplementalDataInfo.getInstance(); 626 627 CLDRLocale loc = CLDRLocale.getInstance(base); 628 629 if (!loc.getLanguage().equals(base)) { 630 return " (no suggestion- not a simple language locale)"; // no 631 // suggestion 632 // unless 633 // just 634 // a 635 // language 636 // locale. 637 } 638 Set<BasicLanguageData> basicData = sdi.getBasicLanguageData(base); 639 640 for (BasicLanguageData bld : basicData) { 641 if (bld.getType() == org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData.Type.primary) { 642 Set<String> scripts = bld.getScripts(); 643 Set<String> territories = bld.getTerritories(); 644 645 if (scripts.size() == 1) { 646 if (territories.size() == 1) { 647 return createSuggestion( 648 loc, 649 CLDRLocale.getInstance(base + "_" 650 + scripts.iterator().next() + "_" 651 + territories.iterator().next())); 652 } 653 } 654 return "(no suggestion - multiple scripts or territories)"; 655 } 656 } 657 return ("(no suggestion- no data)"); 658 } 659 660 /** 661 * Format and return a suggested likelysubtag 662 */ createSuggestion(CLDRLocale loc, CLDRLocale toLoc)663 private static String createSuggestion(CLDRLocale loc, CLDRLocale toLoc) { 664 return " Suggest this to likelySubtags.xml: <likelySubtag from=\"" 665 + loc 666 + "\" to=\"" 667 + toLoc 668 + "\"/>\n" 669 + " <!--{ " 670 + loc.getDisplayName() 671 + "; ?; ? } => { " 672 + loc.getDisplayName() 673 + "; " 674 + toLoc.toULocale().getDisplayScript() 675 + "; " 676 + toLoc.toULocale().getDisplayCountry() + " }-->"; 677 678 } 679 TestDeprecatedTerritoryDataLocaleIds()680 public void TestDeprecatedTerritoryDataLocaleIds() { 681 HashSet<String> checked = new HashSet<String>(); 682 for (String language : dataInfo 683 .getLanguagesForTerritoriesPopulationData()) { 684 checkLocale(language, false); // checks la_Scrp and la 685 for (String region : dataInfo 686 .getTerritoriesForPopulationData(language)) { 687 if (!checked.contains(region)) { 688 checkValidCode(language + "_" + region, "territory", 689 region, false); 690 checked.add(region); 691 } 692 } 693 } 694 for (String language : dataInfo.getBasicLanguageDataLanguages()) { 695 checkLocale(language, false); // checks la_Scrp and la 696 Set<BasicLanguageData> data = dataInfo 697 .getBasicLanguageData(language); 698 for (BasicLanguageData datum : data) { 699 for (String script : datum.getScripts()) { 700 checkValidCode(language + "_" + script, "script", script, 701 false); 702 checked.add(script); 703 } 704 for (String region : datum.getTerritories()) { 705 checkValidCode(language + "_" + region, "territory", 706 region, false); 707 checked.add(region); 708 } 709 } 710 } 711 712 } 713 TestBasicLanguageDataAgainstScriptMetadata()714 public void TestBasicLanguageDataAgainstScriptMetadata() { 715 // the invariants are: 716 // if there is primary data, the script must be there 717 // otherwise it must be in the secondary 718 main: for (String script : ScriptMetadata.getScripts()) { 719 Info info = ScriptMetadata.getInfo(script); 720 String language = info.likelyLanguage; 721 if (language.equals("und")) { 722 continue; 723 } 724 Map<Type, BasicLanguageData> data = dataInfo 725 .getBasicLanguageDataMap(language); 726 if (data == null) { 727 logln("Warning: ScriptMetadata has " + language + " for " 728 + script + "," + " but " + language 729 + " is missing in language_script.txt"); 730 continue; 731 } 732 for (BasicLanguageData entry : data.values()) { 733 if (entry.getScripts().contains(script)) { 734 continue main; 735 } 736 continue; 737 } 738 logln("Warning: ScriptMetadata has " + language + " for " + script 739 + "," + " but " + language + " doesn't have " + script 740 + " in language_script.txt"); 741 } 742 } 743 TestCldrFileConsistency()744 public void TestCldrFileConsistency() { 745 boolean haveErrors = false; 746 for (String locale : testInfo.getCldrFactory().getAvailable()) { 747 CLDRFile cldrFileToCheck = testInfo.getCLDRFile(locale, 748 false); 749 int errors = 0; 750 for (String path : cldrFileToCheck) { 751 if (!pathMatcher.reset(path).find()) { 752 continue; 753 } 754 String fullPath = cldrFileToCheck.getFullXPath(path); 755 if (fullPath == null) { 756 // try again, for debugging 757 fullPath = cldrFileToCheck.getFullXPath(path); 758 String value = cldrFileToCheck.getStringValue(path); 759 if (DEBUG) { 760 errln("Invalid full path\t" + locale + ", " + path 761 + ", " + fullPath + ", " + value); 762 } 763 errors++; 764 haveErrors = true; 765 } 766 } 767 if (errors != 0) { 768 errln(locale 769 + (errors != 0 ? "\tinvalid getFullXPath() values:" 770 + errors : "")); 771 } else { 772 logln(locale); 773 } 774 } 775 if (haveErrors && !DEBUG) { 776 errln("Use -DDEBUG to see details"); 777 } 778 } 779 780 static SupplementalDataInfo info = SupplementalDataInfo.getInstance(); 781 LanguageTagParser ltp = new LanguageTagParser(); 782 783 // public void TestAliases() { 784 // Factory factory = Factory.make(CldrUtility.MAIN_DIRECTORY, fileMatcher); 785 // Set<String> allLocales = Factory.make(CldrUtility.MAIN_DIRECTORY, 786 // ".*").getAvailable(); 787 // 788 // LanguageTagCanonicalizer languageTagCanonicalizer = new 789 // LanguageTagCanonicalizer(); 790 // 791 // Set<String> defaultContents = info.getDefaultContentLocales(); 792 // 793 // Map<String, String> likelySubtags = info.getLikelySubtags(); 794 // 795 // XPathParts xpp = new XPathParts(); 796 // 797 // // get the top level aliases, and verify that they are consistent with 798 // // maximization 799 // Map<String, String> topLevelAliases = new TreeMap<String, String>(); 800 // Set<String> crossScriptSet = new TreeSet<String>(); 801 // Set<String> aliasPaths = new TreeSet<String>(); 802 // Set<String> locales = factory.getAvailable(); 803 // 804 // // get the languages that need scripts 805 // // TODO broaden to beyond CLDR 806 // Set<String> needScripts = new TreeSet<String>(); 807 // for (String locale : locales) { 808 // String script = ltp.set(locale).getScript(); 809 // if (script.length() != 0) { 810 // needScripts.add(ltp.getLanguage()); 811 // } 812 // } 813 // 814 // logln("Languages that have scripts:\t" + needScripts); 815 // 816 // for (String locale : locales) { 817 // 818 // // get alias locale 819 // String aliasLocale = locale; 820 // String explicitAlias = null; 821 // String aliasPathNew = null; 822 // CLDRFile cldrFileToCheck = factory.make(locale, false); 823 // aliasPaths.clear(); 824 // // examples: 825 // // in: <alias source="id" path="//ldml"/> 826 // // ar_IR: <alias source="az_Arab_IR" path="//ldml"/> 827 // 828 // cldrFileToCheck.getPaths("//ldml/alias", null, aliasPaths); 829 // if (aliasPaths.size() != 0) { 830 // String aliasPath = aliasPaths.iterator().next(); 831 // String fullPath = cldrFileToCheck.getFullXPath(aliasPath); 832 // explicitAlias = aliasLocale = xpp.set(fullPath).getAttributeValue(1, 833 // "source"); 834 // String aliasParent = LocaleIDParser.getParent(aliasLocale); 835 // if (!aliasParent.equals("root")) { 836 // topLevelAliases.put(locale, aliasParent); 837 // } 838 // aliasPathNew = xpp.set(fullPath).getAttributeValue(1, "path"); 839 // if ("//ldml/".equals(aliasPathNew)) { 840 // errln("Bad alias path:\t" + fullPath); 841 // } 842 // } 843 // 844 // checkAliasValues(cldrFileToCheck, allLocales); 845 // 846 // // get canonicalized 847 // String canonicalizedLocale = languageTagCanonicalizer.transform(locale); 848 // if (!locale.equals(canonicalizedLocale)) { 849 // logln("Locale\t" + locale + " => " + canonicalizedLocale); 850 // } 851 // 852 // String base = ltp.set(canonicalizedLocale).getLanguage(); 853 // String script = ltp.getScript(); 854 // if (canonicalizedLocale.equals(base)) { // eg, id, az 855 // continue; 856 // } 857 // 858 // // see if the locale's default script is the same as the base locale's 859 // 860 // String maximized = maximize(likelySubtags, canonicalizedLocale); 861 // if (maximized == null) { 862 // errln("Missing likely subtags for:\t" + locale + " " + 863 // suggestLikelySubtagFor(locale)); 864 // continue; 865 // } 866 // String maximizedScript = ltp.set(maximized).getScript(); 867 // 868 // String minimized = minimize(likelySubtags, canonicalizedLocale); 869 // 870 // String baseMaximized = maximize(likelySubtags, base); 871 // String baseScript = ltp.set(baseMaximized).getScript(); 872 // 873 // if (script.length() != 0 && !script.equals(baseScript)) { 874 // crossScriptSet.add(ltp.set(locale).getLanguageScript()); 875 // } 876 // 877 // // Finally, put together the expected alias for comparison. 878 // // It is the "best" alias, in that the default-content locales are 879 // skipped in favor of their parents 880 // 881 // String expectedAlias = 882 // !baseScript.equals(maximizedScript) ? minimized : 883 // !locale.equals(canonicalizedLocale) ? canonicalizedLocale : 884 // // needScripts.contains(base) ? ltp.getLanguageScript() : 885 // locale; 886 // 887 // if (!equals(aliasLocale, expectedAlias)) { 888 // String aliasMaximized = maximize(likelySubtags, aliasLocale); 889 // String expectedMaximized = maximize(likelySubtags, expectedAlias); 890 // if (!equals(aliasMaximized, expectedMaximized)) { 891 // errln("For locale:\t" + locale 892 // + ",\tbase-script:\t" + baseScript 893 // + ",\texpected alias Locale != actual alias Locale:\t" 894 // + expectedAlias + ", " + aliasLocale); 895 // } else if (explicitAlias == null) { 896 // // skip, we don't care in this case 897 // // but we emit warnings if the other conditions are true. The aliasing 898 // could be simpler. 899 // } else if (equals(expectedAlias, locale)) { 900 // logln("Warning; alias could be omitted. For locale:\t" + locale 901 // + ",\tbase-script:\t" + baseScript 902 // + ",\texpected alias Locale != actual alias Locale:\t" 903 // + expectedAlias + ", " + aliasLocale); 904 // } else { 905 // logln("Warning; alias could be minimized. For locale:\t" + locale 906 // + ",\tbase-script:\t" + baseScript 907 // + ",\texpected alias Locale != actual alias Locale:\t" 908 // + expectedAlias + ", " + aliasLocale); 909 // } 910 // } 911 // } 912 // 913 // // check the LocaleIDParser.TOP_LEVEL_ALIAS_LOCALES value and make sure 914 // it matches what is in the files in main/ 915 // 916 // if (!topLevelAliases.equals(LocaleIDParser.TOP_LEVEL_ALIAS_LOCALES) 917 // && locales.equals(allLocales)) { 918 // String diff = showDifferences(LocaleIDParser.TOP_LEVEL_ALIAS_LOCALES, 919 // topLevelAliases); 920 // if (!diff.isEmpty()) { 921 // errln("LocaleIDParser.TOP_LEVEL_ALIAS_LOCALES ≠ topLevelAliases: " + 922 // diff); 923 // } 924 // StringBuilder result = new StringBuilder( 925 // "Suggest changing LocaleIDParser.TOP_LEVEL_ALIAS_LOCALES to:\n"); 926 // for (Entry<String, String> entry : topLevelAliases.entrySet()) { 927 // result.append("\t.put(\"") 928 // .append(entry.getKey()) 929 // .append("\", \"") 930 // .append(entry.getValue()) 931 // .append("\")\n"); 932 // } 933 // errln(result.toString()); 934 // } else { 935 // logln("Top Level Aliases:\t" + topLevelAliases); 936 // } 937 // 938 // // verify that they are the same as what we would get if we were to 939 // maximize 940 // // all the locales and check against default_contents 941 // 942 // for (String locale : defaultContents) { 943 // CLDRFile cldrFileToCheck = null; 944 // try { 945 // cldrFileToCheck = factory.make(locale, false); 946 // } catch (Exception e) {} 947 // if (cldrFileToCheck == null) { 948 // logln("Present in default contents but has no XML file:\t" + locale); 949 // continue; 950 // } 951 // logln("Locale:\t" + locale); 952 // // verify empty, except for identity elements and alias 953 // for (String path : cldrFileToCheck) { 954 // if (path.contains("/identity/")) { 955 // continue; 956 // } 957 // errln("Default content locale not empty:\t" + locale + ", " + path); 958 // break; 959 // } 960 // } 961 // } 962 963 Matcher aliasMatcher = PatternCache.get("//ldml.*/alias.*").matcher(""); 964 checkAliasValues(CLDRFile cldrFileToCheck, Set<String> locales)965 private void checkAliasValues(CLDRFile cldrFileToCheck, Set<String> locales) { 966 Set<String> aliasPaths = new TreeSet<String>(); 967 Set<String> allAliasPaths = cldrFileToCheck.getPaths("//ldml/", 968 aliasMatcher, aliasPaths); 969 XPathParts xpp = new XPathParts(); 970 for (String aliasPath : allAliasPaths) { 971 if (aliasPath.startsWith("//ldml/alias")) { 972 continue; // we have different tests elsewhere 973 } 974 String fullPath = cldrFileToCheck.getFullXPath(aliasPath); 975 String aliasLocale = xpp.set(fullPath).getAttributeValue(-1, 976 "source"); 977 // just check to make sure that the alias is in the locales 978 if (aliasLocale != null && !aliasLocale.equals("locale")) { 979 if (!locales.contains(aliasLocale)) { 980 errln("Unknown Alias:\t" + aliasLocale + "\t in\t" 981 + fullPath); 982 } 983 } 984 String aliasPathNew = xpp.set(fullPath).getAttributeValue(-1, 985 "path"); 986 // just one check 987 if (".".equals(aliasPathNew)) { 988 errln("Illegal path, must not be .:\t" + aliasLocale 989 + "\t in\t" + fullPath); 990 } 991 992 } 993 } 994 minimize(Map<String, String> likelySubtags, String locale)995 private String minimize(Map<String, String> likelySubtags, String locale) { 996 String result = GenerateMaximalLocales.minimize(locale, likelySubtags, 997 false); 998 if (result == null) { 999 LanguageTagParser ltp3 = new LanguageTagParser().set(locale); 1000 List<String> variants = ltp3.getVariants(); 1001 Map<String, String> extensions = ltp3.getExtensions(); 1002 Set<String> emptySet = Collections.emptySet(); 1003 ltp3.setVariants(emptySet); 1004 Map<String, String> emptyMap = Collections.emptyMap(); 1005 ltp3.setExtensions(emptyMap); 1006 String newLocale = ltp3.toString(); 1007 result = GenerateMaximalLocales.minimize(newLocale, likelySubtags, 1008 false); 1009 if (result != null) { 1010 ltp3.set(result); 1011 ltp3.setVariants(variants); 1012 ltp3.setExtensions(extensions); 1013 result = ltp3.toString(); 1014 } 1015 } 1016 return result; 1017 } 1018 maximize(Map<String, String> likelySubtags, String locale)1019 private String maximize(Map<String, String> likelySubtags, String locale) { 1020 String result = GenerateMaximalLocales.maximize(locale, likelySubtags); 1021 if (result == null) { 1022 LanguageTagParser ltp3 = new LanguageTagParser().set(locale); 1023 List<String> variants = ltp3.getVariants(); 1024 Map<String, String> extensions = ltp3.getExtensions(); 1025 Set<String> emptySet = Collections.emptySet(); 1026 ltp3.setVariants(emptySet); 1027 Map<String, String> emptyMap = Collections.emptyMap(); 1028 ltp3.setExtensions(emptyMap); 1029 String newLocale = ltp3.toString(); 1030 result = GenerateMaximalLocales.maximize(newLocale, likelySubtags); 1031 if (result != null) { 1032 ltp3.set(result); 1033 ltp3.setVariants(variants); 1034 ltp3.setExtensions(extensions); 1035 result = ltp3.toString(); 1036 } 1037 } 1038 return result; 1039 } 1040 1041 // TODO move this into central utilities equals(CharSequence string, int codePoint)1042 public static boolean equals(CharSequence string, int codePoint) { 1043 if (string == null) { 1044 return false; 1045 } 1046 switch (string.length()) { 1047 case 1: 1048 return codePoint == string.charAt(0); 1049 case 2: 1050 return codePoint >= 0x10000 1051 && codePoint == Character.codePointAt(string, 0); 1052 default: 1053 return false; 1054 } 1055 } 1056 1057 // TODO move this into central utilities 1058 1059 private static final StandardCodes STANDARD_CODES = testInfo.getStandardCodes(); 1060 private static final Map<String, Map<String, R2<List<String>, String>>> DEPRECATED_INFO = dataInfo 1061 .getLocaleAliasInfo(); 1062 checkLocale(String localeID, boolean allowDeprecated)1063 private void checkLocale(String localeID, boolean allowDeprecated) { 1064 // verify that the localeID is valid 1065 LanguageTagParser ltp = new LanguageTagParser().set(localeID); 1066 String language = ltp.getLanguage(); 1067 String script = ltp.getScript(); 1068 String region = ltp.getRegion(); 1069 // TODO check variants, extensions also. 1070 checkValidCode(localeID, "language", language, allowDeprecated); 1071 checkValidCode(localeID, "script", script, allowDeprecated); 1072 checkValidCode(localeID, "territory", region, allowDeprecated); 1073 } 1074 checkValidCode(String localeID, String subtagType, String subtag, boolean allowDeprecated)1075 private void checkValidCode(String localeID, String subtagType, 1076 String subtag, boolean allowDeprecated) { 1077 if (subtagType.equals("language")) { 1078 if (subtag.equals("und")) { 1079 return; 1080 } 1081 } else { 1082 if (subtag.isEmpty()) { 1083 return; 1084 } 1085 } 1086 if (!STANDARD_CODES.getAvailableCodes(subtagType).contains(subtag)) { 1087 errln("Locale " + localeID + " contains illegal " 1088 + showCode(subtagType, subtag)); 1089 } else if (!allowDeprecated) { 1090 // "language" -> "sh" -> <{"sr_Latn"}, reason> 1091 R2<List<String>, String> deprecatedInfo = DEPRECATED_INFO.get( 1092 subtagType).get(subtag); 1093 if (deprecatedInfo != null) { 1094 errln("Locale " + localeID + " contains deprecated " 1095 + showCode(subtagType, subtag) + " " 1096 + deprecatedInfo.get1() + "; suggest " 1097 + showName(deprecatedInfo.get0(), subtagType)); 1098 } 1099 } 1100 } 1101 showName(List<String> deprecatedInfo, String subtagType)1102 private String showName(List<String> deprecatedInfo, String subtagType) { 1103 StringBuilder result = new StringBuilder(); 1104 for (String s : deprecatedInfo) { 1105 result.append(showName(subtagType, s)).append(" "); 1106 } 1107 return result.toString(); 1108 } 1109 showCode(String subtagType, String subtag)1110 private String showCode(String subtagType, String subtag) { 1111 return subtagType + " code: " + showName(subtagType, subtag); 1112 } 1113 showName(String subtagType, String subtag)1114 private String showName(String subtagType, String subtag) { 1115 return subtag + " (" + getName(subtagType, subtag) + ")"; 1116 } 1117 getName(String subtagType, String subtag)1118 private String getName(String subtagType, String subtag) { 1119 Map<String, String> data = STANDARD_CODES.getLangData(subtagType, 1120 subtag); 1121 if (data == null) { 1122 return "<no name>"; 1123 } 1124 return data.get("Description"); 1125 } 1126 1127 // TODO move this into central utilities equals(int codePoint, CharSequence string)1128 public static boolean equals(int codePoint, CharSequence string) { 1129 return equals(string, codePoint); 1130 } 1131 1132 // TODO move this into central utilities equals(Object a, Object b)1133 public static boolean equals(Object a, Object b) { 1134 return a == b ? true : a == null || b == null ? false : a.equals(b); 1135 } 1136 1137 // TODO move this into central utilities showDifferences(Map<K, V> a, Map<K, V> b)1138 private <K, V> String showDifferences(Map<K, V> a, Map<K, V> b) { 1139 StringBuilder result = new StringBuilder(); 1140 Set<K> keys = new LinkedHashSet<K>(); 1141 keys.addAll(a.keySet()); 1142 keys.addAll(b.keySet()); 1143 for (K key : keys) { 1144 if (!a.containsKey(key)) { 1145 result.append(key).append("→‹").append(a.get(key)) 1146 .append("›,∅; "); 1147 } else if (!b.containsKey(key)) { 1148 result.append(key).append("→∅,‹").append(b.get(key)) 1149 .append("›; "); 1150 } else { 1151 V aKey = a.get(key); 1152 V bKey = b.get(key); 1153 if (!equals(aKey, bKey)) { 1154 result.append(key).append("→‹").append(a.get(key)) 1155 .append("›,‹").append(b.get(key)).append("›; "); 1156 } 1157 } 1158 } 1159 return result.toString(); 1160 } 1161 TestLanguageTagParser()1162 public void TestLanguageTagParser() { 1163 LanguageTagParser ltp = new LanguageTagParser(); 1164 ltp.set("en-Cyrl-US"); 1165 assertEquals(null, "en", ltp.getLanguage()); 1166 assertEquals(null, "en_Cyrl", ltp.getLanguageScript()); 1167 assertEquals(null, "Cyrl", ltp.getScript()); 1168 assertEquals(null, "US", ltp.getRegion()); 1169 try { 1170 ltp.set("$"); 1171 assertFalse("expected exception", true); 1172 } catch (Exception e) { 1173 logln(e.getMessage()); 1174 } 1175 } 1176 } 1177