• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.unittest;
2 
3 import java.util.Collection;
4 import java.util.Collections;
5 import java.util.EnumSet;
6 import java.util.LinkedHashSet;
7 import java.util.Map;
8 import java.util.Map.Entry;
9 import java.util.Set;
10 import java.util.TreeMap;
11 import java.util.TreeSet;
12 import java.util.stream.Collectors;
13 
14 import org.unicode.cldr.test.CoverageLevel2;
15 import org.unicode.cldr.tool.MinimizeRegex;
16 import org.unicode.cldr.util.CLDRConfig;
17 import org.unicode.cldr.util.CLDRFile;
18 import org.unicode.cldr.util.CLDRLocale;
19 import org.unicode.cldr.util.Counter;
20 import org.unicode.cldr.util.Factory;
21 import org.unicode.cldr.util.LanguageTagParser;
22 import org.unicode.cldr.util.Level;
23 import org.unicode.cldr.util.LocaleIDParser;
24 import org.unicode.cldr.util.Organization;
25 import org.unicode.cldr.util.StandardCodes;
26 import org.unicode.cldr.util.StandardCodes.LstrType;
27 import org.unicode.cldr.util.SupplementalDataInfo;
28 import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus;
29 import org.unicode.cldr.util.SupplementalDataInfo.PopulationData;
30 import org.unicode.cldr.util.Validity;
31 import org.unicode.cldr.util.Validity.Status;
32 
33 import com.google.common.base.Joiner;
34 import com.google.common.collect.ImmutableMap;
35 import com.google.common.collect.ImmutableSet;
36 import com.google.common.collect.ImmutableSortedSet;
37 import com.google.common.collect.Multimap;
38 import com.google.common.collect.Multimaps;
39 import com.google.common.collect.Sets;
40 import com.google.common.collect.TreeMultimap;
41 import com.ibm.icu.text.UnicodeSet;
42 
43 public class TestCLDRLocaleCoverage extends TestFmwkPlus {
44     private static StandardCodes sc = StandardCodes.make();
45     private static final CLDRConfig CLDRCONFIG = CLDRConfig.getInstance();
46     private static final SupplementalDataInfo SDI = CLDRCONFIG.getSupplementalDataInfo();
47     private static final CLDRFile ENGLISH = CLDRCONFIG.getEnglish();
48 
49 
main(String[] args)50     public static void main(String[] args) {
51         new TestCLDRLocaleCoverage().run(args);
52     }
53 
TestLanguageNameCoverage()54     public void TestLanguageNameCoverage() {
55         // mainLocales has the locales in common/main, which is basically the set in attributeValueValidity.xml $language..
56         // We add in additionsToTranslate below the set in attributeValueValidity.xml $languageExceptions
57         // (both sets are included in SDI.getCLDRLanguageCodes() but we do not use that until later).
58         Set<String> additionsToTranslate = ImmutableSortedSet.of("zxx", "mul",
59             "ab", "ace", "ada", "ady", "ain", "ale", "alt", "an", "anp", "arn", "arp", "ars", "atj", "av", "awa", "ay",
60             "ba", "ban", "bho", "bi", "bin", "bla", "bug", "byn",
61             "cay", "ch", "chk", "chm", "cho", "chp", "chy", "clc", "co", "crg", "crj", "crk", "crl", "crm", "crr", "csw", "cv",
62             "dak", "dar", "dgr", "dv", "dzg",
63             "efi", "eka",
64             "fj", "fon", "frc",
65             "gaa", "gez", "gil", "gn", "gor", "gwi",
66             "hai", "hax", "hil", "hmn", "ht", "hup", "hur", "hz",
67             "iba", "ibb", "ikt", "ilo", "inh", "io", "iu",
68             "jbo",
69             "kac", "kaj", "kbd", "kcg", "kfo", "kha", "kj", "kmb", "kpe", "kr", "krc", "krl", "kru", "kum", "kv", "kwk",
70             "la", "lad", "lez", "li", "lil", "lou", "loz", "lsm", "lua", "lun", "lus",
71             "mad", "mag", "mak", "mdf", "men", "mh", "mic", "min", "moe", "moh", "mos", "mus", "mwl", "myv",
72             "na", "nap", "new", "ng", "nia", "niu", "nog", "nqo", "nr", "nso", "nv", "ny",
73             "oc", "ojb", "ojc", "ojs", "ojw", "oka",
74             "pag", "pam", "pap", "pau", "pqm",
75             "rap", "rar", "rhg", "rup",
76             "sad", "sba", "scn", "sco", "shn", "slh", "sm", "snk", "srn", "ss", "st", "str", "suk", "swb", "syr",
77             "tce", "tem", "tet", "tgx", "tht", "tig", "tlh", "tli", "tn", "tpi", "trv", "ts", "ttm", "tum", "tvl", "ty", "tyv",
78             "udm", "umb",
79             "ve",
80             "wa", "wal", "war", "wuu",
81             "xal",
82             "ybb",
83             "zun", "zza" );
84 
85         warnln("Locale names added for translation; revisit each release:\n"
86             + Joiner.on("\n")
87             .join(additionsToTranslate.stream().map(x -> x + "\t(" + ENGLISH.getName(x) + ")").collect(Collectors.toList())));
88 
89         Map<String, Status> validity = Validity.getInstance().getCodeToStatus(LstrType.language);
90         Multimap<Status, String> statusToLang = Multimaps.invertFrom(Multimaps.forMap(validity), TreeMultimap.create());
91         Set<String> regular = (Set<String>) statusToLang.get(Status.regular);
92         Set<String> regularPlus = ImmutableSet.<String>builder().addAll(regular).add("und").add("zxx").add("mul").build();
93         Set<String> valid = validity.keySet();
94 
95         Factory factory = CLDRCONFIG.getCldrFactory();
96         Set<String> mainLocales = new LinkedHashSet<>();
97         LanguageTagParser ltp = new LanguageTagParser();
98         for (String locale : factory.getAvailableLanguages()) {
99             String language = ltp.set(locale).getLanguage();
100             if (language.equals("root")) language = "und";
101             mainLocales.add(language);
102         }
103         mainLocales = ImmutableSet.copyOf(mainLocales);
104         Set<String> localesForNames = new TreeSet<>();
105         localesForNames.addAll(mainLocales);
106         localesForNames.addAll(additionsToTranslate);
107         localesForNames = ImmutableSet.copyOf(localesForNames);
108 
109         assertContains("regularPlus.containsAll(mainLocales)", regularPlus, localesForNames);
110 
111         CoverageLevel2 coverageLeveler = CoverageLevel2.getInstance("und");
112         Multimap<Level, String> levelToLanguage = TreeMultimap.create();
113         for (String locale : valid) {
114             String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, locale);
115             Level level = coverageLeveler.getLevel(path);
116             levelToLanguage.put(level, locale);
117         }
118 
119         Set<String> coverageLocales = new TreeSet<>();
120         for (Level level : Level.values()) {
121             if (level == Level.COMPREHENSIVE) {
122                 continue;
123             }
124             //assertContains("mainLocales.containsAll(coverage:" + level + ")", localesForNames, levelToLanguage.get(level));
125             coverageLocales.addAll(levelToLanguage.get(level));
126         }
127         if (logKnownIssue("CLDR-15888", "modern coverage not yet updated for bgc, raj")) {
128             coverageLocales.add("bgc");
129             coverageLocales.add("raj");
130         }
131 
132         // If this fails, it is because of a mismatch between coverage and the getCLDRLanguageCodes.
133         // Usually a problem with coverage.
134         boolean showRegex = !assertContains("localesForNames.containsAll(coverageLocales)", localesForNames, coverageLocales);
135         showRegex |= !assertContains("coverageLocales.containsAll(localesForNames) - add to %language80 or lower under coverageLevels.xml?", coverageLocales, localesForNames);
136         if (showRegex || true) {
137             String simplePattern = MinimizeRegex.simplePattern(localesForNames);
138             warnln("Plain Regex for coverage:\n" + simplePattern);
139             warnln("Compact Regex for coverage:\n" + MinimizeRegex.compressWith(localesForNames, new UnicodeSet("[a-z]")));
140         }
141 
142         coverageLocales.addAll(SDI.getCLDRLanguageCodes());
143 
144         Map<String,Integer> official1M = getOfficial1M();
145         Set<String> official1MSet = new TreeSet<>();
146         for (String locale : official1M.keySet()) {
147             if (!localesForNames.contains(locale)) {
148                 official1MSet.add(locale);
149             }
150         }
151         warnln("Official with 1M+ speakers, need investigation of literacy: " + official1MSet);
152 
153 
154 //        assertContains("sdiLocales contains oldModernLocales", sdiLocales, oldModernLocales);
155 //        assertContains("oldModernLocales contains sdiLocales", oldModernLocales, sdiLocales);
156 
157         coverageLocales.removeAll(mainLocales);
158         coverageLocales.removeAll(additionsToTranslate);
159 
160         for (String locale : localesForNames) {
161             logln("\n" + locale + "\t" + ENGLISH.getName(locale));
162         }
163 
164         logln("\nmainLocales:" + composeList(mainLocales, "\n\t", new StringBuilder()));
165         logln("\nadditionsToTranslate:" + composeList(additionsToTranslate, "\n\t", new StringBuilder()));
166         logln("\noldModernLocales:" + composeList(coverageLocales, "\n\t", new StringBuilder()));
167     }
168 
getOfficial1M()169     private Map<String,Integer> getOfficial1M() {
170         Counter<String> counter = new Counter<>();
171         for (String region : SDI.getTerritoriesWithPopulationData()) {
172             for (String language : SDI.getLanguagesForTerritoryWithPopulationData(region)) {
173                 PopulationData popData = SDI.getLanguageAndTerritoryPopulationData(language, region);
174                 OfficialStatus status = popData.getOfficialStatus();
175                 if (status == OfficialStatus.unknown) {
176                     continue;
177                 }
178                 // we only care about names, so drop scripts
179                 int underbar = language.indexOf('_');
180                 if (underbar >= 0) {
181                     language = language.substring(0, underbar);
182                 }
183                 counter.add(language, (int) popData.getLiteratePopulation());
184             }
185         }
186         Map<String,Integer> result = new TreeMap<>();
187         for (String language : counter.keySet()) {
188             long litPop = counter.get(language);
189             if (litPop >= 1_000_000) {
190                 result.put(language, (int)litPop);
191             }
192 
193         }
194         return ImmutableMap.copyOf(result);
195     }
196 
composeList(Iterable<String> source, String separator, StringBuilder result)197     static final StringBuilder composeList(Iterable<String> source, String separator, StringBuilder result) {
198         String prefix = null;
199         for (String item : source) {
200             if (prefix == null || !item.startsWith(prefix)) {
201                 result.append(separator);
202                 prefix = item.substring(0,1); // only ascii
203             } else {
204                 result.append(' ');
205             }
206             result.append(item);
207         }
208         return result;
209     }
210 
assertContains(String title, Collection<String> set, Collection<String> subset)211     private boolean assertContains(String title, Collection<String> set, Collection<String> subset) {
212         boolean result = set.containsAll(subset);
213         if (!result) {
214             Set<String> temp = new LinkedHashSet<>(subset);
215             temp.removeAll(set);
216             Set<String> temp2 = new TreeSet<>();
217             for (String locale : temp) {
218                 temp2.add(locale + "\t" + ENGLISH.getName(locale));
219             }
220             errln(title + ": Missing:\t" + temp.size() + "\n\t" + Joiner.on("\n\t").join(temp2));
221         }
222         return result;
223     }
224 
225     /**
226      * Test whether there are any locales for the organization CLDR
227      */
TestCLDROrganizationPresence()228     public void TestCLDROrganizationPresence() {
229         Set<String> cldrLocales = sc.getLocaleCoverageLocales(
230             Organization.cldr, EnumSet.of(Level.MODERN));
231         assertNotNull("Expected CLDR modern locales not to be null",
232             cldrLocales);
233         assertTrue("Expected locales for CLDR, but found none.",
234             cldrLocales != null && !cldrLocales.isEmpty());
235     }
236 
237     /**
238      * Tests that cldr+special is a superset of the TC locales, with the right levels
239      */
TestCldrSuperset()240     public void TestCldrSuperset() {
241         Map<String, Level> apple = sc.getLocalesToLevelsFor(Organization.apple);
242         Map<String, Level> google = sc.getLocalesToLevelsFor(Organization.google);
243         Map<String, Level> microsoft = sc.getLocalesToLevelsFor(Organization.microsoft);
244         Map<String, Level> special = sc.getLocalesToLevelsFor(Organization.special);
245 
246         Map<String, Level> cldr = sc.getLocalesToLevelsFor(Organization.cldr);
247 
248         // check that the cldr locales (+ special) have the max level of the TC locales
249 
250         for (Entry<String, Level> entry : cldr.entrySet()) {
251             String locale = entry.getKey();
252             Level cldrLevel = entry.getValue();
253             Level appleLevel = apple.get(locale);
254             Level googleLevel = google.get(locale);
255             Level microsoftLevel = microsoft.get(locale);
256             Level specialLevel = special.get(locale);
257 
258             // check the 8 vote count
259 
260             int count = getLevelCount(appleLevel)
261                 + getLevelCount(googleLevel)
262                 + getLevelCount(microsoftLevel)
263                 ;
264             int defaultVotes = SupplementalDataInfo.getInstance().getRequiredVotes(CLDRLocale.getInstance(locale), null);
265             assertEquals("8 votes for " + locale + " at " + cldrLevel, count > 2 && cldrLevel.compareTo(Level.MODERN) >= 0, defaultVotes == 8);
266 
267             // check the max level
268 
269             Level maxLevel = Level.max(appleLevel, googleLevel, microsoftLevel, specialLevel);
270             assertEquals("cldr level = max for " + locale + " (" + ENGLISH.getName(locale) + ")", cldrLevel, maxLevel);
271         }
272 
273         // check that the cldr locales include all of the other locale's
274 
275         checkCldrContains("cldr", cldr, "apple", apple);
276         checkCldrContains("cldr", cldr, "google", google);
277         checkCldrContains("cldr", cldr, "microsoft", microsoft);
278         checkCldrContains("cldr", cldr, "special", apple);
279 
280         // check that special doesn't overlap with TC, except for generated locales
281 
282         checkDisjoint("special", special, "apple", apple);
283         checkDisjoint("special", special, "google", google);
284         checkDisjoint("special", special, "microsoft", microsoft);
285     }
286 
getLevelCount(Level appleLevel)287     private int getLevelCount(Level appleLevel) {
288         return appleLevel == null ? 0
289             : appleLevel.compareTo(Level.MODERN) >= 0 ? 1 : 0;
290     }
291 
292     private static final Set<String> ANY_LOCALE_SET = ImmutableSet.of("*");
293     private static final Set<String> LOCALE_CONTAINMENT_EXCEPTIONS = ImmutableSet.of(
294         "sr_Latn", // auto-generated
295         "hi", "sr", "yue" // these are inserted by Locales.txt processing TODO don't add to special
296         );
297 
checkCldrContains(String firstName, Map<String, Level> first, String otherName, Map<String, Level> other)298     private void checkCldrContains(String firstName, Map<String, Level> first, String otherName, Map<String, Level> other) {
299         assertEquals(firstName + " ⊇ " + otherName, Collections.emptySet(), Sets.difference(Sets.difference(other.keySet(), ANY_LOCALE_SET), first.keySet()));
300     }
301 
checkDisjoint(String firstName, Map<String, Level> first, String otherName, Map<String, Level> other)302     private void checkDisjoint(String firstName, Map<String, Level> first, String otherName, Map<String, Level> other) {
303         assertEquals(firstName + " ⩃ " + otherName, Collections.emptySet(), Sets.difference(Sets.intersection(other.keySet(), first.keySet()), LOCALE_CONTAINMENT_EXCEPTIONS));
304     }
305 
TestParentCoverage()306     public void TestParentCoverage() {
307         for (Organization organization : sc.getLocaleCoverageOrganizations()) {
308             if (organization == Organization.special) {
309                 continue;
310             }
311             final Map<String, Level> localesToLevels = sc.getLocalesToLevelsFor(organization);
312             for (Entry<String, Level> localeAndLevel : localesToLevels.entrySet()) {
313                 String originalLevel = localeAndLevel.getKey();
314                 Level level = localeAndLevel.getValue();
315                 String locale = originalLevel;
316                 while (true) {
317                     String parent = LocaleIDParser.getParent(locale);
318                     if (parent == null || parent.equals("root")) {
319                         break;
320                     }
321                     if (!parent.equals("en_001")) { // en_001 is generated later from en_GB
322                         Level parentLevel = localesToLevels.get(parent);
323                         assertTrue(organization
324                             + "; locale=" + originalLevel
325                             + "; level=" + level
326                             + "; parent=" + parent
327                             + "; level=" + parentLevel,
328                             parentLevel != null && parentLevel.compareTo(level) >= 0);
329                     }
330                     locale = parent;
331                 }
332             }
333         }
334     }
335 }
336