• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.unittest;
2 
3 import com.google.common.base.Joiner;
4 import com.google.common.collect.ImmutableMap;
5 import com.google.common.collect.ImmutableSet;
6 import com.google.common.collect.ImmutableSortedSet;
7 import com.google.common.collect.Multimap;
8 import com.google.common.collect.Multimaps;
9 import com.google.common.collect.Sets;
10 import com.google.common.collect.TreeMultimap;
11 import com.ibm.icu.text.UnicodeSet;
12 import java.util.Collection;
13 import java.util.Collections;
14 import java.util.EnumSet;
15 import java.util.LinkedHashSet;
16 import java.util.Map;
17 import java.util.Map.Entry;
18 import java.util.Set;
19 import java.util.TreeMap;
20 import java.util.TreeSet;
21 import java.util.stream.Collectors;
22 import org.unicode.cldr.test.CoverageLevel2;
23 import org.unicode.cldr.tool.MinimizeRegex;
24 import org.unicode.cldr.util.CLDRConfig;
25 import org.unicode.cldr.util.CLDRFile;
26 import org.unicode.cldr.util.CLDRLocale;
27 import org.unicode.cldr.util.Counter;
28 import org.unicode.cldr.util.Factory;
29 import org.unicode.cldr.util.LanguageTagParser;
30 import org.unicode.cldr.util.Level;
31 import org.unicode.cldr.util.LocaleIDParser;
32 import org.unicode.cldr.util.LocaleNames;
33 import org.unicode.cldr.util.Organization;
34 import org.unicode.cldr.util.StandardCodes;
35 import org.unicode.cldr.util.StandardCodes.LstrType;
36 import org.unicode.cldr.util.SupplementalDataInfo;
37 import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus;
38 import org.unicode.cldr.util.SupplementalDataInfo.PopulationData;
39 import org.unicode.cldr.util.Validity;
40 import org.unicode.cldr.util.Validity.Status;
41 import org.unicode.cldr.util.VoteResolver;
42 
43 public class TestCLDRLocaleCoverage extends TestFmwkPlus {
44     private static StandardCodes sc = StandardCodes.make();
45     private static final CLDRConfig CLDRCONFIG = CLDRConfig.getInstance();
46     private static final SupplementalDataInfo SDI = CLDRCONFIG.getSupplementalDataInfo();
47     private static final CLDRFile ENGLISH = CLDRCONFIG.getEnglish();
48 
main(String[] args)49     public static void main(String[] args) {
50         new TestCLDRLocaleCoverage().run(args);
51     }
52 
TestLanguageNameCoverage()53     public void TestLanguageNameCoverage() {
54         // mainLocales has the locales in common/main, which is basically the set in
55         // attributeValueValidity.xml $language..
56         // We add in additionsToTranslate below the set in attributeValueValidity.xml
57         // $languageExceptions
58         // (both sets are included in SDI.getCLDRLanguageCodes() but we do not use that until
59         // later).
60         Set<String> additionsToTranslate =
61                 ImmutableSortedSet.of(
62                         LocaleNames.ZXX,
63                         LocaleNames.MUL,
64                         "ab",
65                         "ace",
66                         "ada",
67                         "ady",
68                         "ain",
69                         "ale",
70                         "alt",
71                         "an",
72                         "anp",
73                         "arn",
74                         "arp",
75                         "ars",
76                         "atj",
77                         "av",
78                         "awa",
79                         "ay",
80                         "ba",
81                         "ban",
82                         "bho",
83                         "bi",
84                         "bin",
85                         "bla",
86                         "bug",
87                         "byn",
88                         "cay",
89                         "ch",
90                         "chk",
91                         "chm",
92                         "cho",
93                         "chp",
94                         "chy",
95                         "clc",
96                         "co",
97                         "crg",
98                         "crj",
99                         "crk",
100                         "crl",
101                         "crm",
102                         "crr",
103                         "csw",
104                         "cv",
105                         "dak",
106                         "dar",
107                         "dgr",
108                         "dv",
109                         "dzg",
110                         "efi",
111                         "eka",
112                         "fj",
113                         "fon",
114                         "frc",
115                         "gaa",
116                         "gez",
117                         "gil",
118                         "gn",
119                         "gor",
120                         "gwi",
121                         "hai",
122                         "hax",
123                         "hil",
124                         "hmn",
125                         "ht",
126                         "hup",
127                         "hur",
128                         "hz",
129                         "iba",
130                         "ibb",
131                         "ikt",
132                         "ilo",
133                         "inh",
134                         "io",
135                         "iu",
136                         "jbo",
137                         "kac",
138                         "kaj",
139                         "kbd",
140                         "kcg",
141                         "kfo",
142                         "kha",
143                         "kj",
144                         "kmb",
145                         "kpe",
146                         "kr",
147                         "krc",
148                         "krl",
149                         "kru",
150                         "kum",
151                         "kv",
152                         "kwk",
153                         "la",
154                         "lad",
155                         "lez",
156                         "li",
157                         "lil",
158                         "lou",
159                         "loz",
160                         "lsm",
161                         "lua",
162                         "lun",
163                         "lus",
164                         "mad",
165                         "mag",
166                         "mak",
167                         "mdf",
168                         "men",
169                         "mh",
170                         "mic",
171                         "min",
172                         "moe",
173                         "moh",
174                         "mos",
175                         "mus",
176                         "mwl",
177                         "myv",
178                         "na",
179                         "nap",
180                         "new",
181                         "ng",
182                         "nia",
183                         "niu",
184                         "nog",
185                         "nqo",
186                         "nr",
187                         "nso",
188                         "nv",
189                         "ny",
190                         "oc",
191                         "ojb",
192                         "ojc",
193                         "ojs",
194                         "ojw",
195                         "oka",
196                         "pag",
197                         "pam",
198                         "pap",
199                         "pau",
200                         "pqm",
201                         "rap",
202                         "rar",
203                         "rhg",
204                         "rup",
205                         "sad",
206                         "sba",
207                         "scn",
208                         "sco",
209                         "shn",
210                         "slh",
211                         "sm",
212                         "snk",
213                         "srn",
214                         "ss",
215                         "st",
216                         "str",
217                         "suk",
218                         "swb",
219                         "syr",
220                         "tce",
221                         "tem",
222                         "tet",
223                         "tgx",
224                         "tht",
225                         "tig",
226                         "tlh",
227                         "tli",
228                         "tn",
229                         "tpi",
230                         "trv",
231                         "ts",
232                         "ttm",
233                         "tum",
234                         "tvl",
235                         "ty",
236                         "tyv",
237                         "udm",
238                         "umb",
239                         "ve",
240                         "wa",
241                         "wal",
242                         "war",
243                         "wuu",
244                         "xal",
245                         "ybb",
246                         "zun",
247                         "zza");
248 
249         Set<String> removalsForLateBasics =
250                 Set.of(
251                         "blo", "eo", "ie", "kxv", "lij", "lmo", "nds", "prg", "szl", "tok", "vec",
252                         "vmw", "xnr", "za");
253 
254         warnln(
255                 "Locale names added for translation; revisit each release:\n"
256                         + Joiner.on("\n")
257                                 .join(
258                                         additionsToTranslate.stream()
259                                                 .map(x -> x + "\t(" + ENGLISH.getName(x) + ")")
260                                                 .collect(Collectors.toList())));
261 
262         Map<String, Status> validity = Validity.getInstance().getCodeToStatus(LstrType.language);
263         Multimap<Status, String> statusToLang =
264                 Multimaps.invertFrom(Multimaps.forMap(validity), TreeMultimap.create());
265         Set<String> regular = (Set<String>) statusToLang.get(Status.regular);
266         Set<String> regularPlus =
267                 ImmutableSet.<String>builder()
268                         .addAll(regular)
269                         .add(LocaleNames.UND)
270                         .add(LocaleNames.ZXX)
271                         .add(LocaleNames.MUL)
272                         .build();
273         Set<String> valid = validity.keySet();
274 
275         Factory factory = CLDRCONFIG.getCldrFactory();
276         Set<String> mainLocales = new LinkedHashSet<>();
277         LanguageTagParser ltp = new LanguageTagParser();
278         for (String locale : factory.getAvailableLanguages()) {
279             String language = ltp.set(locale).getLanguage();
280             if (language.equals(LocaleNames.ROOT)) {
281                 language = LocaleNames.UND;
282             } else if (!StandardCodes.isLocaleAtLeastBasic(language)) {
283                 continue;
284             }
285             mainLocales.add(language);
286         }
287         mainLocales = ImmutableSet.copyOf(mainLocales);
288         Set<String> localesForNames = new TreeSet<>();
289         localesForNames.addAll(mainLocales);
290         localesForNames.addAll(additionsToTranslate);
291         localesForNames.removeAll(removalsForLateBasics);
292         localesForNames = ImmutableSet.copyOf(localesForNames);
293 
294         assertContains("regularPlus.containsAll(mainLocales)", regularPlus, localesForNames);
295 
296         CoverageLevel2 coverageLeveler = CoverageLevel2.getInstance(LocaleNames.UND);
297         Multimap<Level, String> levelToLanguage = TreeMultimap.create();
298         for (String locale : valid) {
299             String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, locale);
300             Level level = coverageLeveler.getLevel(path);
301             levelToLanguage.put(level, locale);
302         }
303 
304         Set<String> coverageLocales = new TreeSet<>();
305         for (Level level : Level.values()) {
306             if (level == Level.COMPREHENSIVE) {
307                 continue;
308             }
309             // assertContains("mainLocales.containsAll(coverage:" + level + ")", localesForNames,
310             // levelToLanguage.get(level));
311             coverageLocales.addAll(levelToLanguage.get(level));
312         }
313 
314         // added for CLDR-15888
315         coverageLocales.add("bgc");
316         coverageLocales.add("raj");
317 
318         // If this fails, it is because of a mismatch between coverage and the getCLDRLanguageCodes.
319         // Usually a problem with coverage.
320         boolean showRegex =
321                 !assertContains(
322                         "localesForNames.containsAll(coverageLocales)",
323                         localesForNames,
324                         coverageLocales);
325         showRegex |=
326                 !assertContains(
327                         "coverageLocales.containsAll(localesForNames) - add to %language80 or lower under coverageLevels.xml?",
328                         coverageLocales, localesForNames);
329         if (showRegex || true) {
330             String simplePattern = MinimizeRegex.simplePattern(localesForNames);
331             warnln("Plain Regex for coverage:\n" + simplePattern);
332             warnln(
333                     "Compact Regex for coverage:\n"
334                             + MinimizeRegex.compressWith(localesForNames, new UnicodeSet("[a-z]")));
335         }
336 
337         coverageLocales.addAll(SDI.getCLDRLanguageCodes());
338 
339         Map<String, Integer> official1M = getOfficial1M();
340         Set<String> official1MSet = new TreeSet<>();
341         for (String locale : official1M.keySet()) {
342             if (!localesForNames.contains(locale)) {
343                 official1MSet.add(locale);
344             }
345         }
346         warnln("Official with 1M+ speakers, need investigation of literacy: " + official1MSet);
347 
348         //        assertContains("sdiLocales contains oldModernLocales", sdiLocales,
349         // oldModernLocales);
350         //        assertContains("oldModernLocales contains sdiLocales", oldModernLocales,
351         // sdiLocales);
352 
353         coverageLocales.removeAll(mainLocales);
354         coverageLocales.removeAll(additionsToTranslate);
355 
356         for (String locale : localesForNames) {
357             logln("\n" + locale + "\t" + ENGLISH.getName(locale));
358         }
359 
360         logln("\nmainLocales:" + composeList(mainLocales, "\n\t", new StringBuilder()));
361         logln(
362                 "\nadditionsToTranslate:"
363                         + composeList(additionsToTranslate, "\n\t", new StringBuilder()));
364         logln("\noldModernLocales:" + composeList(coverageLocales, "\n\t", new StringBuilder()));
365     }
366 
getOfficial1M()367     private Map<String, Integer> getOfficial1M() {
368         Counter<String> counter = new Counter<>();
369         for (String region : SDI.getTerritoriesWithPopulationData()) {
370             for (String language : SDI.getLanguagesForTerritoryWithPopulationData(region)) {
371                 PopulationData popData =
372                         SDI.getLanguageAndTerritoryPopulationData(language, region);
373                 OfficialStatus status = popData.getOfficialStatus();
374                 if (status == OfficialStatus.unknown) {
375                     continue;
376                 }
377                 // we only care about names, so drop scripts
378                 int underbar = language.indexOf('_');
379                 if (underbar >= 0) {
380                     language = language.substring(0, underbar);
381                 }
382                 counter.add(language, (int) popData.getLiteratePopulation());
383             }
384         }
385         Map<String, Integer> result = new TreeMap<>();
386         for (String language : counter.keySet()) {
387             long litPop = counter.get(language);
388             if (litPop >= 1_000_000) {
389                 result.put(language, (int) litPop);
390             }
391         }
392         return ImmutableMap.copyOf(result);
393     }
394 
composeList( Iterable<String> source, String separator, StringBuilder result)395     static final StringBuilder composeList(
396             Iterable<String> source, String separator, StringBuilder result) {
397         String prefix = null;
398         for (String item : source) {
399             if (prefix == null || !item.startsWith(prefix)) {
400                 result.append(separator);
401                 prefix = item.substring(0, 1); // only ascii
402             } else {
403                 result.append(' ');
404             }
405             result.append(item);
406         }
407         return result;
408     }
409 
assertContains( String title, Collection<String> set, Collection<String> subset)410     private boolean assertContains(
411             String title, Collection<String> set, Collection<String> subset) {
412         set = removeBelowBasic(set);
413         subset = removeBelowBasic(subset);
414         boolean result = set.containsAll(subset);
415         if (!result) {
416             Set<String> temp = new LinkedHashSet<>(subset);
417             temp.removeAll(set);
418             Set<String> temp2 = new TreeSet<>();
419             for (String locale : temp) {
420                 temp2.add(locale + "\t" + ENGLISH.getName(locale));
421             }
422             errln(title + ": Missing:\t" + temp.size() + "\n\t" + Joiner.on("\n\t").join(temp2));
423         }
424         return result;
425     }
426 
removeBelowBasic(Collection<String> set)427     private Collection<String> removeBelowBasic(Collection<String> set) {
428         Collection<String> set2 = new TreeSet<>();
429         for (String locale : set) {
430             if (StandardCodes.isLocaleAtLeastBasic(locale)) {
431                 set2.add(locale);
432             }
433         }
434         return set2;
435     }
436 
437     /** Test whether there are any locales for the organization CLDR */
TestCLDROrganizationPresence()438     public void TestCLDROrganizationPresence() {
439         Set<String> cldrLocales =
440                 sc.getLocaleCoverageLocales(Organization.cldr, EnumSet.of(Level.MODERN));
441         assertNotNull("Expected CLDR modern locales not to be null", cldrLocales);
442         assertTrue(
443                 "Expected locales for CLDR, but found none.",
444                 cldrLocales != null && !cldrLocales.isEmpty());
445     }
446 
447     /** Tests that cldr+special is a superset of the TC locales, with the right levels */
TestCldrSuperset()448     public void TestCldrSuperset() {
449         final Set<Organization> orgs = Organization.getTCOrgs();
450 
451         Map<Organization, Map<String, Level>> orgToLevels = new TreeMap<>();
452         orgs.forEach(org -> orgToLevels.put(org, sc.getLocalesToLevelsFor(org)));
453 
454         Map<String, Level> special = sc.getLocalesToLevelsFor(Organization.special);
455 
456         Map<String, Level> cldr = sc.getLocalesToLevelsFor(Organization.cldr);
457 
458         // check that the cldr locales (+ special) have the max level of the TC locales
459 
460         for (Entry<String, Level> entry : cldr.entrySet()) {
461             final String locale = entry.getKey();
462 
463             final Map<Organization, Level> orgToLevel =
464                     orgToLevels.entrySet().stream()
465                             .collect(
466                                     Collectors.toMap(
467                                             Entry::getKey,
468                                             v -> {
469                                                 final Level l = v.getValue().get(locale);
470                                                 if (l == null) return Level.UNDETERMINED;
471                                                 return l;
472                                             }));
473 
474             Level cldrLevel = entry.getValue();
475             Level specialLevel = special.get(locale);
476             boolean cldrLevelIsModern = cldrLevel.compareTo(Level.MODERN) >= 0;
477 
478             // check the vote count
479 
480             final int count =
481                     (int)
482                             orgToLevel.values().stream()
483                                     .filter(TestCLDRLocaleCoverage::isPresentAndAtLeastModern)
484                                     .count();
485             final int countMin = 2;
486             final boolean countAtLeast = count > countMin;
487             int defaultVotes =
488                     SupplementalDataInfo.getInstance()
489                             .getRequiredVotes(CLDRLocale.getInstance(locale), null);
490 
491             if (countAtLeast && cldrLevelIsModern) {
492                 assertEquals(
493                         "orgCount="
494                                 + count
495                                 + ", and cldrLevel="
496                                 + cldrLevel
497                                 + ", expected LOWER_BAR but it wasn't for "
498                                 + locale,
499                         VoteResolver.LOWER_BAR,
500                         defaultVotes);
501             } else {
502                 assertNotEquals(
503                         "orgCount="
504                                 + count
505                                 + ", and cldrLevel="
506                                 + cldrLevel
507                                 + ", expected "
508                                 + locale
509                                 + " to NOT have LOWER_BAR",
510                         VoteResolver.LOWER_BAR,
511                         defaultVotes);
512             }
513 
514             // check the max level
515             Level maxLevel =
516                     Level.max(specialLevel, Level.max(orgToLevel.values().toArray(new Level[0])));
517             assertEquals(
518                     "cldr level = max for " + locale + " (" + ENGLISH.getName(locale) + ")",
519                     cldrLevel,
520                     maxLevel);
521         }
522 
523         // check that the cldr locales include all of the other locale's
524         orgToLevels
525                 .entrySet()
526                 .forEach(
527                         e -> {
528                             final Organization org = e.getKey();
529                             final Map<String, Level> l = e.getValue();
530                             checkCldrContains("cldr", cldr, org.name(), l);
531                             checkCldrContains("cldr", cldr, "special", l);
532                             // check that special doesn't overlap with TC, except for locales in
533                             // LOCALE_CONTAINMENT_EXCEPTIONS
534                             checkDisjoint("special", special, org.name(), l);
535                         });
536     }
537 
isPresentAndAtLeastModern(Level orgLevel)538     private static boolean isPresentAndAtLeastModern(Level orgLevel) {
539         return orgLevel == Level.UNDETERMINED
540                 ? false
541                 : orgLevel.compareTo(Level.MODERN) >= 0 ? true : false;
542     }
543 
544     private static final Set<String> ANY_LOCALE_SET = ImmutableSet.of("*");
545     private static final Set<String> LOCALE_CONTAINMENT_EXCEPTIONS =
546             ImmutableSet.of(
547                     "sr_Latn", // auto-generated
548                     "hi",
549                     "sr",
550                     "yue", // these are inserted by Locales.txt processing TODO don't add to special
551                     "to",
552                     "qu" // optional locales
553                     );
554 
checkCldrContains( String firstName, Map<String, Level> first, String otherName, Map<String, Level> other)555     private void checkCldrContains(
556             String firstName,
557             Map<String, Level> first,
558             String otherName,
559             Map<String, Level> other) {
560         assertEquals(
561                 firstName + " ⊇ " + otherName,
562                 Collections.emptySet(),
563                 Sets.difference(Sets.difference(other.keySet(), ANY_LOCALE_SET), first.keySet()));
564     }
565 
checkDisjoint( String firstName, Map<String, Level> first, String otherName, Map<String, Level> other)566     private void checkDisjoint(
567             String firstName,
568             Map<String, Level> first,
569             String otherName,
570             Map<String, Level> other) {
571         assertEquals(
572                 firstName + " ⩃ " + otherName,
573                 Collections.emptySet(),
574                 Sets.difference(
575                         Sets.intersection(other.keySet(), first.keySet()),
576                         LOCALE_CONTAINMENT_EXCEPTIONS));
577     }
578 
TestParentCoverage()579     public void TestParentCoverage() {
580         for (Organization organization : sc.getLocaleCoverageOrganizations()) {
581             if (organization == Organization.special) {
582                 continue;
583             }
584             final Map<String, Level> localesToLevels = sc.getLocalesToLevelsFor(organization);
585             for (Entry<String, Level> localeAndLevel : localesToLevels.entrySet()) {
586                 String originalLevel = localeAndLevel.getKey();
587                 Level level = localeAndLevel.getValue();
588                 String locale = originalLevel;
589                 while (true) {
590                     String parent = LocaleIDParser.getParent(locale);
591                     if (parent == null || parent.equals(LocaleNames.ROOT)) {
592                         break;
593                     }
594                     if (!parent.equals("en_001")) { // en_001 is generated later from en_GB
595                         Level parentLevel = localesToLevels.get(parent);
596                         assertTrue(
597                                 organization
598                                         + "; locale="
599                                         + originalLevel
600                                         + "; level="
601                                         + level
602                                         + "; parent="
603                                         + parent
604                                         + "; level="
605                                         + parentLevel,
606                                 parentLevel != null && parentLevel.compareTo(level) >= 0);
607                     }
608                     locale = parent;
609                 }
610             }
611         }
612     }
613 }
614