1 package org.unicode.cldr.unittest; 2 3 import com.google.common.base.Joiner; 4 import com.google.common.collect.ImmutableMap; 5 import com.google.common.collect.ImmutableSet; 6 import com.google.common.collect.ImmutableSortedSet; 7 import com.google.common.collect.Multimap; 8 import com.google.common.collect.Multimaps; 9 import com.google.common.collect.Sets; 10 import com.google.common.collect.TreeMultimap; 11 import com.ibm.icu.text.UnicodeSet; 12 import java.util.Collection; 13 import java.util.Collections; 14 import java.util.EnumSet; 15 import java.util.LinkedHashSet; 16 import java.util.Map; 17 import java.util.Map.Entry; 18 import java.util.Set; 19 import java.util.TreeMap; 20 import java.util.TreeSet; 21 import java.util.stream.Collectors; 22 import org.unicode.cldr.test.CoverageLevel2; 23 import org.unicode.cldr.tool.MinimizeRegex; 24 import org.unicode.cldr.util.CLDRConfig; 25 import org.unicode.cldr.util.CLDRFile; 26 import org.unicode.cldr.util.CLDRLocale; 27 import org.unicode.cldr.util.Counter; 28 import org.unicode.cldr.util.Factory; 29 import org.unicode.cldr.util.LanguageTagParser; 30 import org.unicode.cldr.util.Level; 31 import org.unicode.cldr.util.LocaleIDParser; 32 import org.unicode.cldr.util.LocaleNames; 33 import org.unicode.cldr.util.Organization; 34 import org.unicode.cldr.util.StandardCodes; 35 import org.unicode.cldr.util.StandardCodes.LstrType; 36 import org.unicode.cldr.util.SupplementalDataInfo; 37 import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus; 38 import org.unicode.cldr.util.SupplementalDataInfo.PopulationData; 39 import org.unicode.cldr.util.Validity; 40 import org.unicode.cldr.util.Validity.Status; 41 import org.unicode.cldr.util.VoteResolver; 42 43 public class TestCLDRLocaleCoverage extends TestFmwkPlus { 44 private static StandardCodes sc = StandardCodes.make(); 45 private static final CLDRConfig CLDRCONFIG = CLDRConfig.getInstance(); 46 private static final SupplementalDataInfo SDI = CLDRCONFIG.getSupplementalDataInfo(); 47 private static final CLDRFile ENGLISH = CLDRCONFIG.getEnglish(); 48 main(String[] args)49 public static void main(String[] args) { 50 new TestCLDRLocaleCoverage().run(args); 51 } 52 TestLanguageNameCoverage()53 public void TestLanguageNameCoverage() { 54 // mainLocales has the locales in common/main, which is basically the set in 55 // attributeValueValidity.xml $language.. 56 // We add in additionsToTranslate below the set in attributeValueValidity.xml 57 // $languageExceptions 58 // (both sets are included in SDI.getCLDRLanguageCodes() but we do not use that until 59 // later). 60 Set<String> additionsToTranslate = 61 ImmutableSortedSet.of( 62 LocaleNames.ZXX, 63 LocaleNames.MUL, 64 "ab", 65 "ace", 66 "ada", 67 "ady", 68 "ain", 69 "ale", 70 "alt", 71 "an", 72 "anp", 73 "arn", 74 "arp", 75 "ars", 76 "atj", 77 "av", 78 "awa", 79 "ay", 80 "ba", 81 "ban", 82 "bho", 83 "bi", 84 "bin", 85 "bla", 86 "bug", 87 "byn", 88 "cay", 89 "ch", 90 "chk", 91 "chm", 92 "cho", 93 "chp", 94 "chy", 95 "clc", 96 "co", 97 "crg", 98 "crj", 99 "crk", 100 "crl", 101 "crm", 102 "crr", 103 "csw", 104 "cv", 105 "dak", 106 "dar", 107 "dgr", 108 "dv", 109 "dzg", 110 "efi", 111 "eka", 112 "fj", 113 "fon", 114 "frc", 115 "gaa", 116 "gez", 117 "gil", 118 "gn", 119 "gor", 120 "gwi", 121 "hai", 122 "hax", 123 "hil", 124 "hmn", 125 "ht", 126 "hup", 127 "hur", 128 "hz", 129 "iba", 130 "ibb", 131 "ikt", 132 "ilo", 133 "inh", 134 "io", 135 "iu", 136 "jbo", 137 "kac", 138 "kaj", 139 "kbd", 140 "kcg", 141 "kfo", 142 "kha", 143 "kj", 144 "kmb", 145 "kpe", 146 "kr", 147 "krc", 148 "krl", 149 "kru", 150 "kum", 151 "kv", 152 "kwk", 153 "la", 154 "lad", 155 "lez", 156 "li", 157 "lil", 158 "lou", 159 "loz", 160 "lsm", 161 "lua", 162 "lun", 163 "lus", 164 "mad", 165 "mag", 166 "mak", 167 "mdf", 168 "men", 169 "mh", 170 "mic", 171 "min", 172 "moe", 173 "moh", 174 "mos", 175 "mus", 176 "mwl", 177 "myv", 178 "na", 179 "nap", 180 "new", 181 "ng", 182 "nia", 183 "niu", 184 "nog", 185 "nqo", 186 "nr", 187 "nso", 188 "nv", 189 "ny", 190 "oc", 191 "ojb", 192 "ojc", 193 "ojs", 194 "ojw", 195 "oka", 196 "pag", 197 "pam", 198 "pap", 199 "pau", 200 "pqm", 201 "rap", 202 "rar", 203 "rhg", 204 "rup", 205 "sad", 206 "sba", 207 "scn", 208 "sco", 209 "shn", 210 "slh", 211 "sm", 212 "snk", 213 "srn", 214 "ss", 215 "st", 216 "str", 217 "suk", 218 "swb", 219 "syr", 220 "tce", 221 "tem", 222 "tet", 223 "tgx", 224 "tht", 225 "tig", 226 "tlh", 227 "tli", 228 "tn", 229 "tpi", 230 "trv", 231 "ts", 232 "ttm", 233 "tum", 234 "tvl", 235 "ty", 236 "tyv", 237 "udm", 238 "umb", 239 "ve", 240 "wa", 241 "wal", 242 "war", 243 "wuu", 244 "xal", 245 "ybb", 246 "zun", 247 "zza"); 248 249 Set<String> removalsForLateBasics = 250 Set.of( 251 "blo", "eo", "ie", "kxv", "lij", "lmo", "nds", "prg", "szl", "tok", "vec", 252 "vmw", "xnr", "za"); 253 254 warnln( 255 "Locale names added for translation; revisit each release:\n" 256 + Joiner.on("\n") 257 .join( 258 additionsToTranslate.stream() 259 .map(x -> x + "\t(" + ENGLISH.getName(x) + ")") 260 .collect(Collectors.toList()))); 261 262 Map<String, Status> validity = Validity.getInstance().getCodeToStatus(LstrType.language); 263 Multimap<Status, String> statusToLang = 264 Multimaps.invertFrom(Multimaps.forMap(validity), TreeMultimap.create()); 265 Set<String> regular = (Set<String>) statusToLang.get(Status.regular); 266 Set<String> regularPlus = 267 ImmutableSet.<String>builder() 268 .addAll(regular) 269 .add(LocaleNames.UND) 270 .add(LocaleNames.ZXX) 271 .add(LocaleNames.MUL) 272 .build(); 273 Set<String> valid = validity.keySet(); 274 275 Factory factory = CLDRCONFIG.getCldrFactory(); 276 Set<String> mainLocales = new LinkedHashSet<>(); 277 LanguageTagParser ltp = new LanguageTagParser(); 278 for (String locale : factory.getAvailableLanguages()) { 279 String language = ltp.set(locale).getLanguage(); 280 if (language.equals(LocaleNames.ROOT)) { 281 language = LocaleNames.UND; 282 } else if (!StandardCodes.isLocaleAtLeastBasic(language)) { 283 continue; 284 } 285 mainLocales.add(language); 286 } 287 mainLocales = ImmutableSet.copyOf(mainLocales); 288 Set<String> localesForNames = new TreeSet<>(); 289 localesForNames.addAll(mainLocales); 290 localesForNames.addAll(additionsToTranslate); 291 localesForNames.removeAll(removalsForLateBasics); 292 localesForNames = ImmutableSet.copyOf(localesForNames); 293 294 assertContains("regularPlus.containsAll(mainLocales)", regularPlus, localesForNames); 295 296 CoverageLevel2 coverageLeveler = CoverageLevel2.getInstance(LocaleNames.UND); 297 Multimap<Level, String> levelToLanguage = TreeMultimap.create(); 298 for (String locale : valid) { 299 String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, locale); 300 Level level = coverageLeveler.getLevel(path); 301 levelToLanguage.put(level, locale); 302 } 303 304 Set<String> coverageLocales = new TreeSet<>(); 305 for (Level level : Level.values()) { 306 if (level == Level.COMPREHENSIVE) { 307 continue; 308 } 309 // assertContains("mainLocales.containsAll(coverage:" + level + ")", localesForNames, 310 // levelToLanguage.get(level)); 311 coverageLocales.addAll(levelToLanguage.get(level)); 312 } 313 314 // added for CLDR-15888 315 coverageLocales.add("bgc"); 316 coverageLocales.add("raj"); 317 318 // If this fails, it is because of a mismatch between coverage and the getCLDRLanguageCodes. 319 // Usually a problem with coverage. 320 boolean showRegex = 321 !assertContains( 322 "localesForNames.containsAll(coverageLocales)", 323 localesForNames, 324 coverageLocales); 325 showRegex |= 326 !assertContains( 327 "coverageLocales.containsAll(localesForNames) - add to %language80 or lower under coverageLevels.xml?", 328 coverageLocales, localesForNames); 329 if (showRegex || true) { 330 String simplePattern = MinimizeRegex.simplePattern(localesForNames); 331 warnln("Plain Regex for coverage:\n" + simplePattern); 332 warnln( 333 "Compact Regex for coverage:\n" 334 + MinimizeRegex.compressWith(localesForNames, new UnicodeSet("[a-z]"))); 335 } 336 337 coverageLocales.addAll(SDI.getCLDRLanguageCodes()); 338 339 Map<String, Integer> official1M = getOfficial1M(); 340 Set<String> official1MSet = new TreeSet<>(); 341 for (String locale : official1M.keySet()) { 342 if (!localesForNames.contains(locale)) { 343 official1MSet.add(locale); 344 } 345 } 346 warnln("Official with 1M+ speakers, need investigation of literacy: " + official1MSet); 347 348 // assertContains("sdiLocales contains oldModernLocales", sdiLocales, 349 // oldModernLocales); 350 // assertContains("oldModernLocales contains sdiLocales", oldModernLocales, 351 // sdiLocales); 352 353 coverageLocales.removeAll(mainLocales); 354 coverageLocales.removeAll(additionsToTranslate); 355 356 for (String locale : localesForNames) { 357 logln("\n" + locale + "\t" + ENGLISH.getName(locale)); 358 } 359 360 logln("\nmainLocales:" + composeList(mainLocales, "\n\t", new StringBuilder())); 361 logln( 362 "\nadditionsToTranslate:" 363 + composeList(additionsToTranslate, "\n\t", new StringBuilder())); 364 logln("\noldModernLocales:" + composeList(coverageLocales, "\n\t", new StringBuilder())); 365 } 366 getOfficial1M()367 private Map<String, Integer> getOfficial1M() { 368 Counter<String> counter = new Counter<>(); 369 for (String region : SDI.getTerritoriesWithPopulationData()) { 370 for (String language : SDI.getLanguagesForTerritoryWithPopulationData(region)) { 371 PopulationData popData = 372 SDI.getLanguageAndTerritoryPopulationData(language, region); 373 OfficialStatus status = popData.getOfficialStatus(); 374 if (status == OfficialStatus.unknown) { 375 continue; 376 } 377 // we only care about names, so drop scripts 378 int underbar = language.indexOf('_'); 379 if (underbar >= 0) { 380 language = language.substring(0, underbar); 381 } 382 counter.add(language, (int) popData.getLiteratePopulation()); 383 } 384 } 385 Map<String, Integer> result = new TreeMap<>(); 386 for (String language : counter.keySet()) { 387 long litPop = counter.get(language); 388 if (litPop >= 1_000_000) { 389 result.put(language, (int) litPop); 390 } 391 } 392 return ImmutableMap.copyOf(result); 393 } 394 composeList( Iterable<String> source, String separator, StringBuilder result)395 static final StringBuilder composeList( 396 Iterable<String> source, String separator, StringBuilder result) { 397 String prefix = null; 398 for (String item : source) { 399 if (prefix == null || !item.startsWith(prefix)) { 400 result.append(separator); 401 prefix = item.substring(0, 1); // only ascii 402 } else { 403 result.append(' '); 404 } 405 result.append(item); 406 } 407 return result; 408 } 409 assertContains( String title, Collection<String> set, Collection<String> subset)410 private boolean assertContains( 411 String title, Collection<String> set, Collection<String> subset) { 412 set = removeBelowBasic(set); 413 subset = removeBelowBasic(subset); 414 boolean result = set.containsAll(subset); 415 if (!result) { 416 Set<String> temp = new LinkedHashSet<>(subset); 417 temp.removeAll(set); 418 Set<String> temp2 = new TreeSet<>(); 419 for (String locale : temp) { 420 temp2.add(locale + "\t" + ENGLISH.getName(locale)); 421 } 422 errln(title + ": Missing:\t" + temp.size() + "\n\t" + Joiner.on("\n\t").join(temp2)); 423 } 424 return result; 425 } 426 removeBelowBasic(Collection<String> set)427 private Collection<String> removeBelowBasic(Collection<String> set) { 428 Collection<String> set2 = new TreeSet<>(); 429 for (String locale : set) { 430 if (StandardCodes.isLocaleAtLeastBasic(locale)) { 431 set2.add(locale); 432 } 433 } 434 return set2; 435 } 436 437 /** Test whether there are any locales for the organization CLDR */ TestCLDROrganizationPresence()438 public void TestCLDROrganizationPresence() { 439 Set<String> cldrLocales = 440 sc.getLocaleCoverageLocales(Organization.cldr, EnumSet.of(Level.MODERN)); 441 assertNotNull("Expected CLDR modern locales not to be null", cldrLocales); 442 assertTrue( 443 "Expected locales for CLDR, but found none.", 444 cldrLocales != null && !cldrLocales.isEmpty()); 445 } 446 447 /** Tests that cldr+special is a superset of the TC locales, with the right levels */ TestCldrSuperset()448 public void TestCldrSuperset() { 449 final Set<Organization> orgs = Organization.getTCOrgs(); 450 451 Map<Organization, Map<String, Level>> orgToLevels = new TreeMap<>(); 452 orgs.forEach(org -> orgToLevels.put(org, sc.getLocalesToLevelsFor(org))); 453 454 Map<String, Level> special = sc.getLocalesToLevelsFor(Organization.special); 455 456 Map<String, Level> cldr = sc.getLocalesToLevelsFor(Organization.cldr); 457 458 // check that the cldr locales (+ special) have the max level of the TC locales 459 460 for (Entry<String, Level> entry : cldr.entrySet()) { 461 final String locale = entry.getKey(); 462 463 final Map<Organization, Level> orgToLevel = 464 orgToLevels.entrySet().stream() 465 .collect( 466 Collectors.toMap( 467 Entry::getKey, 468 v -> { 469 final Level l = v.getValue().get(locale); 470 if (l == null) return Level.UNDETERMINED; 471 return l; 472 })); 473 474 Level cldrLevel = entry.getValue(); 475 Level specialLevel = special.get(locale); 476 boolean cldrLevelIsModern = cldrLevel.compareTo(Level.MODERN) >= 0; 477 478 // check the vote count 479 480 final int count = 481 (int) 482 orgToLevel.values().stream() 483 .filter(TestCLDRLocaleCoverage::isPresentAndAtLeastModern) 484 .count(); 485 final int countMin = 2; 486 final boolean countAtLeast = count > countMin; 487 int defaultVotes = 488 SupplementalDataInfo.getInstance() 489 .getRequiredVotes(CLDRLocale.getInstance(locale), null); 490 491 if (countAtLeast && cldrLevelIsModern) { 492 assertEquals( 493 "orgCount=" 494 + count 495 + ", and cldrLevel=" 496 + cldrLevel 497 + ", expected LOWER_BAR but it wasn't for " 498 + locale, 499 VoteResolver.LOWER_BAR, 500 defaultVotes); 501 } else { 502 assertNotEquals( 503 "orgCount=" 504 + count 505 + ", and cldrLevel=" 506 + cldrLevel 507 + ", expected " 508 + locale 509 + " to NOT have LOWER_BAR", 510 VoteResolver.LOWER_BAR, 511 defaultVotes); 512 } 513 514 // check the max level 515 Level maxLevel = 516 Level.max(specialLevel, Level.max(orgToLevel.values().toArray(new Level[0]))); 517 assertEquals( 518 "cldr level = max for " + locale + " (" + ENGLISH.getName(locale) + ")", 519 cldrLevel, 520 maxLevel); 521 } 522 523 // check that the cldr locales include all of the other locale's 524 orgToLevels 525 .entrySet() 526 .forEach( 527 e -> { 528 final Organization org = e.getKey(); 529 final Map<String, Level> l = e.getValue(); 530 checkCldrContains("cldr", cldr, org.name(), l); 531 checkCldrContains("cldr", cldr, "special", l); 532 // check that special doesn't overlap with TC, except for locales in 533 // LOCALE_CONTAINMENT_EXCEPTIONS 534 checkDisjoint("special", special, org.name(), l); 535 }); 536 } 537 isPresentAndAtLeastModern(Level orgLevel)538 private static boolean isPresentAndAtLeastModern(Level orgLevel) { 539 return orgLevel == Level.UNDETERMINED 540 ? false 541 : orgLevel.compareTo(Level.MODERN) >= 0 ? true : false; 542 } 543 544 private static final Set<String> ANY_LOCALE_SET = ImmutableSet.of("*"); 545 private static final Set<String> LOCALE_CONTAINMENT_EXCEPTIONS = 546 ImmutableSet.of( 547 "sr_Latn", // auto-generated 548 "hi", 549 "sr", 550 "yue", // these are inserted by Locales.txt processing TODO don't add to special 551 "to", 552 "qu" // optional locales 553 ); 554 checkCldrContains( String firstName, Map<String, Level> first, String otherName, Map<String, Level> other)555 private void checkCldrContains( 556 String firstName, 557 Map<String, Level> first, 558 String otherName, 559 Map<String, Level> other) { 560 assertEquals( 561 firstName + " ⊇ " + otherName, 562 Collections.emptySet(), 563 Sets.difference(Sets.difference(other.keySet(), ANY_LOCALE_SET), first.keySet())); 564 } 565 checkDisjoint( String firstName, Map<String, Level> first, String otherName, Map<String, Level> other)566 private void checkDisjoint( 567 String firstName, 568 Map<String, Level> first, 569 String otherName, 570 Map<String, Level> other) { 571 assertEquals( 572 firstName + " ⩃ " + otherName, 573 Collections.emptySet(), 574 Sets.difference( 575 Sets.intersection(other.keySet(), first.keySet()), 576 LOCALE_CONTAINMENT_EXCEPTIONS)); 577 } 578 TestParentCoverage()579 public void TestParentCoverage() { 580 for (Organization organization : sc.getLocaleCoverageOrganizations()) { 581 if (organization == Organization.special) { 582 continue; 583 } 584 final Map<String, Level> localesToLevels = sc.getLocalesToLevelsFor(organization); 585 for (Entry<String, Level> localeAndLevel : localesToLevels.entrySet()) { 586 String originalLevel = localeAndLevel.getKey(); 587 Level level = localeAndLevel.getValue(); 588 String locale = originalLevel; 589 while (true) { 590 String parent = LocaleIDParser.getParent(locale); 591 if (parent == null || parent.equals(LocaleNames.ROOT)) { 592 break; 593 } 594 if (!parent.equals("en_001")) { // en_001 is generated later from en_GB 595 Level parentLevel = localesToLevels.get(parent); 596 assertTrue( 597 organization 598 + "; locale=" 599 + originalLevel 600 + "; level=" 601 + level 602 + "; parent=" 603 + parent 604 + "; level=" 605 + parentLevel, 606 parentLevel != null && parentLevel.compareTo(level) >= 0); 607 } 608 locale = parent; 609 } 610 } 611 } 612 } 613 } 614