• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.unittest;
2 
3 import java.util.Arrays;
4 import java.util.Collection;
5 import java.util.Collections;
6 import java.util.EnumSet;
7 import java.util.HashSet;
8 import java.util.LinkedHashSet;
9 import java.util.Map;
10 import java.util.Set;
11 import java.util.TreeMap;
12 import java.util.TreeSet;
13 
14 import org.junit.jupiter.api.DisplayNameGenerator.Standard;
15 import org.unicode.cldr.test.CoverageLevel2;
16 import org.unicode.cldr.tool.MinimizeRegex;
17 import org.unicode.cldr.util.CLDRConfig;
18 import org.unicode.cldr.util.CLDRFile;
19 import org.unicode.cldr.util.Counter;
20 import org.unicode.cldr.util.Factory;
21 import org.unicode.cldr.util.LanguageTagParser;
22 import org.unicode.cldr.util.Level;
23 import org.unicode.cldr.util.Organization;
24 import org.unicode.cldr.util.StandardCodes;
25 import org.unicode.cldr.util.StandardCodes.LstrType;
26 import org.unicode.cldr.util.SupplementalDataInfo;
27 import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus;
28 import org.unicode.cldr.util.SupplementalDataInfo.PopulationData;
29 import org.unicode.cldr.util.Validity;
30 import org.unicode.cldr.util.Validity.Status;
31 
32 import com.google.common.base.Joiner;
33 import com.google.common.collect.ImmutableMap;
34 import com.google.common.collect.ImmutableSet;
35 import com.google.common.collect.Multimap;
36 import com.google.common.collect.Multimaps;
37 import com.google.common.collect.Sets;
38 import com.google.common.collect.TreeMultimap;
39 import com.ibm.icu.text.UnicodeSet;
40 
41 public class TestCLDRLocaleCoverage extends TestFmwkPlus {
42     private static StandardCodes sc = StandardCodes.make();
43     private static final CLDRConfig CLDRCONFIG = CLDRConfig.getInstance();
44     private static final SupplementalDataInfo SDI = CLDRCONFIG.getSupplementalDataInfo();
45     private static final CLDRFile ENGLISH = CLDRCONFIG.getEnglish();
46 
47 
main(String[] args)48     public static void main(String[] args) {
49         new TestCLDRLocaleCoverage().run(args);
50     }
51 
TestLanguageNameCoverage()52     public void TestLanguageNameCoverage() {
53         // not sure why we need rhg here, since it is in seed it should be in mainLocales
54         Set<String> additionsToTranslate = new TreeSet<>(Arrays.asList("zxx", "ceb", "ny", "co", "ht", "hmn", "la", "sm", "st", "sa", "mul", "rhg"));
55 
56         Map<String, Status> validity = Validity.getInstance().getCodeToStatus(LstrType.language);
57         Multimap<Status, String> statusToLang = Multimaps.invertFrom(Multimaps.forMap(validity), TreeMultimap.create());
58         Set<String> regular = (Set<String>) statusToLang.get(Status.regular);
59         Set<String> regularPlus = ImmutableSet.<String>builder().addAll(regular).add("und").add("zxx").add("mul").build();
60         Set<String> valid = validity.keySet();
61 
62         Factory factory = CLDRCONFIG.getCldrFactory();
63         Set<String> mainLocales = new LinkedHashSet<>();
64         LanguageTagParser ltp = new LanguageTagParser();
65         for (String locale : factory.getAvailableLanguages()) {
66             String language = ltp.set(locale).getLanguage();
67             if (language.equals("root")) language = "und";
68             mainLocales.add(language);
69         }
70         mainLocales = ImmutableSet.copyOf(mainLocales);
71         Set<String> localesForNames = new TreeSet<>();
72         localesForNames.addAll(mainLocales);
73         localesForNames.addAll(additionsToTranslate);
74         localesForNames = ImmutableSet.copyOf(localesForNames);
75 
76         assertContains("regularPlus.containsAll(mainLocales)", regularPlus, localesForNames);
77 
78         CoverageLevel2 coverageLeveler = CoverageLevel2.getInstance("und");
79         Multimap<Level, String> levelToLanguage = TreeMultimap.create();
80         for (String locale : valid) {
81             String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, locale);
82             Level level = coverageLeveler.getLevel(path);
83             levelToLanguage.put(level, locale);
84         }
85 
86         Set<String> coverageLocales = new TreeSet<>();
87         for (Level level : Level.values()) {
88             if (level == Level.COMPREHENSIVE) {
89                 continue;
90             }
91             //assertContains("mainLocales.containsAll(coverage:" + level + ")", localesForNames, levelToLanguage.get(level));
92             coverageLocales.addAll(levelToLanguage.get(level));
93         }
94 
95         // If this fails, it is because of a mismatch between coverage and the getCLDRLanguageCodes.
96         // Usually a problem with coverage.
97         boolean showRegex = !assertContains("localesForNames.containsAll(coverageLocales)", localesForNames, coverageLocales);
98         showRegex |= !assertContains("coverageLocales.containsAll(localesForNames) - add to %language80 or lower under coverageLevels.xml?", coverageLocales, localesForNames);
99         if (showRegex || true) {
100             String simplePattern = MinimizeRegex.simplePattern(localesForNames);
101             warnln("Plain Regex for coverage:\n" + simplePattern);
102             warnln("Compact Regex for coverage:\n" + MinimizeRegex.compressWith(localesForNames, new UnicodeSet("[a-z]")));
103         }
104 
105         coverageLocales.addAll(SDI.getCLDRLanguageCodes());
106 
107         Map<String,Integer> official1M = getOfficial1M();
108         Set<String> official1MSet = new TreeSet<>();
109         for (String locale : official1M.keySet()) {
110             if (!localesForNames.contains(locale)) {
111                 official1MSet.add(locale);
112             }
113         }
114         warnln("Official with 1M+ speakers, need investigation of literacy: " + official1MSet);
115 
116 
117 //        assertContains("sdiLocales contains oldModernLocales", sdiLocales, oldModernLocales);
118 //        assertContains("oldModernLocales contains sdiLocales", oldModernLocales, sdiLocales);
119 
120         coverageLocales.removeAll(mainLocales);
121         coverageLocales.removeAll(additionsToTranslate);
122 
123         for (String locale : localesForNames) {
124             logln("\n" + locale + "\t" + ENGLISH.getName(locale));
125         }
126 
127         logln("\nmainLocales:" + composeList(mainLocales, "\n\t", new StringBuilder()));
128         logln("\nadditionsToTranslate:" + composeList(additionsToTranslate, "\n\t", new StringBuilder()));
129         logln("\noldModernLocales:" + composeList(coverageLocales, "\n\t", new StringBuilder()));
130     }
131 
getOfficial1M()132     private Map<String,Integer> getOfficial1M() {
133         Counter<String> counter = new Counter<>();
134         for (String region : SDI.getTerritoriesWithPopulationData()) {
135             for (String language : SDI.getLanguagesForTerritoryWithPopulationData(region)) {
136                 PopulationData popData = SDI.getLanguageAndTerritoryPopulationData(language, region);
137                 OfficialStatus status = popData.getOfficialStatus();
138                 if (status == OfficialStatus.unknown) {
139                     continue;
140                 }
141                 // we only care about names, so drop scripts
142                 int underbar = language.indexOf('_');
143                 if (underbar >= 0) {
144                     language = language.substring(0, underbar);
145                 }
146                 counter.add(language, (int) popData.getLiteratePopulation());
147             }
148         }
149         Map<String,Integer> result = new TreeMap<>();
150         for (String language : counter.keySet()) {
151             long litPop = counter.get(language);
152             if (litPop >= 1_000_000) {
153                 result.put(language, (int)litPop);
154             }
155 
156         }
157         return ImmutableMap.copyOf(result);
158     }
159 
composeList(Iterable<String> source, String separator, StringBuilder result)160     static final StringBuilder composeList(Iterable<String> source, String separator, StringBuilder result) {
161         String prefix = null;
162         for (String item : source) {
163             if (prefix == null || !item.startsWith(prefix)) {
164                 result.append(separator);
165                 prefix = item.substring(0,1); // only ascii
166             } else {
167                 result.append(' ');
168             }
169             result.append(item);
170         }
171         return result;
172     }
173 
assertContains(String title, Collection<String> set, Collection<String> subset)174     private boolean assertContains(String title, Collection<String> set, Collection<String> subset) {
175         boolean result = set.containsAll(subset);
176         if (!result) {
177             Set<String> temp = new LinkedHashSet<>(subset);
178             temp.removeAll(set);
179             Set<String> temp2 = new TreeSet<>();
180             for (String locale : temp) {
181                 temp2.add(locale + "\t" + ENGLISH.getName(locale));
182             }
183             errln(title + ": Missing:\t" + temp.size() + "\n\t" + Joiner.on("\n\t").join(temp2));
184         }
185         return result;
186     }
187 
188     /**
189      * Test whether there are any locales for the organization CLDR
190      */
TestCLDROrganizationPresence()191     public void TestCLDROrganizationPresence() {
192         Set<String> cldrLocales = sc.getLocaleCoverageLocales(
193             Organization.cldr, EnumSet.of(Level.MODERN));
194         assertNotNull("Expected CLDR modern locales not to be null",
195             cldrLocales);
196         assertTrue("Expected locales for CLDR, but found none.",
197             cldrLocales != null && !cldrLocales.isEmpty());
198     }
199 
200     /**
201      * Tests that cldr is a superset.
202      */
TestCldrSuperset()203     public void TestCldrSuperset() {
204         checkCldrLocales(Organization.apple, ERR);
205         checkCldrLocales(Organization.google, ERR);
206         checkCldrLocales(Organization.microsoft, WARN);
207     }
208 
209     /**
210      * Set of locales which are _excluded_ from Cldr-is-a-superset tests
211      */
212     static Set<String> SKIP_SUPERSET = ImmutableSet.of(
213         // Microsoft locales?
214         "to", "fo",
215         // Apple locales
216         "aa", "bo", "cad", "kl", "kok", "lb", "pcm", "tg", "tt",
217         // Google locales
218         "aa", "br", "gd", "ia", "kea", "nqo", "oc", "sc",
219         // Skip "*"
220         StandardCodes.ALL_LOCALES
221     );
222 
checkCldrLocales(Organization organization, int warningLevel)223     private void checkCldrLocales(Organization organization, int warningLevel) {
224         // use a union, so that items can be higher
225         EnumSet<Level> modernModerate = EnumSet.of(Level.MODERATE, Level.MODERN);
226 
227         Set<String> orgLocalesModerate = sc.getLocaleCoverageLocales(organization, modernModerate);
228         Set<String> cldrLocalesModerate = sc.getLocaleCoverageLocales(Organization.cldr, modernModerate);
229         Set<String> failures = checkCldrLocalesSuperset(modernModerate, cldrLocalesModerate, organization, orgLocalesModerate, warningLevel,
230             SKIP_SUPERSET);
231 
232         EnumSet<Level> modernSet = EnumSet.of(Level.MODERN);
233         Set<String> orgLocalesModern = sc.getLocaleCoverageLocales(organization, modernSet);
234         Set<String> cldrLocalesModern = sc.getLocaleCoverageLocales(Organization.cldr, modernSet);
235         failures = new HashSet<>(failures);
236         failures.addAll(SKIP_SUPERSET);
237         checkCldrLocalesSuperset(modernSet, cldrLocalesModern, organization, orgLocalesModern, warningLevel, failures);
238     }
239 
checkCldrLocalesSuperset(Set<Level> level, Set<String> cldrLocales, Organization organization, Set<String> orgLocales, int warningLevel, Set<String> skip)240     private Set<String> checkCldrLocalesSuperset(Set<Level> level, Set<String> cldrLocales, Organization organization, Set<String> orgLocales, int warningLevel,
241         Set<String> skip) {
242         if (!cldrLocales.containsAll(orgLocales)) {
243             Set<String> diff2 = new LinkedHashSet<>(Sets.difference(orgLocales, cldrLocales));
244             diff2.removeAll(skip);
245             if (!diff2.isEmpty()) {
246                 String diffString = diff2.toString();
247                 String levelString = Joiner.on("+").join(level);
248                 for (String localeId : diff2) {
249                     diffString += "\n\t" + localeId + "\t" + CLDRConfig.getInstance().getEnglish().getName(localeId);
250                 }
251                 msg("The following " + organization.displayName + " " + levelString + " locales were absent from the "
252                     + Organization.cldr.displayName + " " + levelString + " locales:" + diffString,
253                     warningLevel, true, true);
254             }
255             return diff2;
256         }
257         return Collections.emptySet();
258     }
259 }
260