• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.unittest;
2 
3 import java.util.Arrays;
4 import java.util.Collection;
5 import java.util.Collections;
6 import java.util.EnumSet;
7 import java.util.HashSet;
8 import java.util.LinkedHashSet;
9 import java.util.Map;
10 import java.util.Set;
11 import java.util.TreeMap;
12 import java.util.TreeSet;
13 
14 import org.unicode.cldr.test.CoverageLevel2;
15 import org.unicode.cldr.tool.MinimizeRegex;
16 import org.unicode.cldr.util.CLDRConfig;
17 import org.unicode.cldr.util.CLDRFile;
18 import org.unicode.cldr.util.Counter;
19 import org.unicode.cldr.util.Factory;
20 import org.unicode.cldr.util.LanguageTagParser;
21 import org.unicode.cldr.util.Level;
22 import org.unicode.cldr.util.Organization;
23 import org.unicode.cldr.util.StandardCodes;
24 import org.unicode.cldr.util.StandardCodes.LstrType;
25 import org.unicode.cldr.util.SupplementalDataInfo;
26 import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus;
27 import org.unicode.cldr.util.SupplementalDataInfo.PopulationData;
28 import org.unicode.cldr.util.Validity;
29 import org.unicode.cldr.util.Validity.Status;
30 
31 import com.google.common.base.Joiner;
32 import com.google.common.collect.ImmutableMap;
33 import com.google.common.collect.ImmutableSet;
34 import com.google.common.collect.Multimap;
35 import com.google.common.collect.Multimaps;
36 import com.google.common.collect.Sets;
37 import com.google.common.collect.TreeMultimap;
38 import com.ibm.icu.text.UnicodeSet;
39 
40 public class TestCLDRLocaleCoverage extends TestFmwkPlus {
41     private static StandardCodes sc = StandardCodes.make();
42     private static final CLDRConfig CLDRCONFIG = CLDRConfig.getInstance();
43     private static final SupplementalDataInfo SDI = CLDRCONFIG.getSupplementalDataInfo();
44     private static final CLDRFile ENGLISH = CLDRCONFIG.getEnglish();
45 
46 
main(String[] args)47     public static void main(String[] args) {
48         new TestCLDRLocaleCoverage().run(args);
49     }
50 
TestLanguageNameCoverage()51     public void TestLanguageNameCoverage() {
52 
53         Set<String> additionsToTranslate = new TreeSet<>(Arrays.asList("zxx", "ceb", "ny", "co", "ht", "hmn", "la", "sm", "st", "sa", "mul"));
54 
55         Map<String, Status> validity = Validity.getInstance().getCodeToStatus(LstrType.language);
56         Multimap<Status, String> statusToLang = Multimaps.invertFrom(Multimaps.forMap(validity), TreeMultimap.create());
57         Set<String> regular = (Set<String>) statusToLang.get(Status.regular);
58         Set<String> regularPlus = ImmutableSet.<String>builder().addAll(regular).add("und").add("zxx").add("mul").build();
59         Set<String> valid = validity.keySet();
60 
61         Factory factory = CLDRCONFIG.getCldrFactory();
62         Set<String> mainLocales = new LinkedHashSet<>();
63         LanguageTagParser ltp = new LanguageTagParser();
64         for (String locale : factory.getAvailableLanguages()) {
65             String language = ltp.set(locale).getLanguage();
66             if (language.equals("root")) language = "und";
67             mainLocales.add(language);
68         }
69         mainLocales = ImmutableSet.copyOf(mainLocales);
70         Set<String> localesForNames = new TreeSet<>();
71         localesForNames.addAll(mainLocales);
72         localesForNames.addAll(additionsToTranslate);
73         localesForNames = ImmutableSet.copyOf(localesForNames);
74 
75         assertContains("regularPlus.containsAll(mainLocales)", regularPlus, localesForNames);
76 
77         CoverageLevel2 coverageLeveler = CoverageLevel2.getInstance("und");
78         Multimap<Level, String> levelToLanguage = TreeMultimap.create();
79         for (String locale : valid) {
80             String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, locale);
81             Level level = coverageLeveler.getLevel(path);
82             levelToLanguage.put(level, locale);
83         }
84 
85         Set<String> coverageLocales = new TreeSet<>();
86         for (Level level : Level.values()) {
87             if (level == Level.COMPREHENSIVE) {
88                 continue;
89             }
90             //assertContains("mainLocales.containsAll(coverage:" + level + ")", localesForNames, levelToLanguage.get(level));
91             coverageLocales.addAll(levelToLanguage.get(level));
92         }
93 
94         // If this fails, it is because of a mismatch between coverage and the getCLDRLanguageCodes.
95         // Usually a problem with coverage.
96         boolean showRegex = !assertContains("localesForNames.containsAll(coverageLocales)", localesForNames, coverageLocales);
97         showRegex |= !assertContains("coverageLocales.containsAll(localesForNames)", coverageLocales, localesForNames);
98         if (showRegex || true) {
99             String simplePattern = MinimizeRegex.simplePattern(localesForNames);
100             warnln("Plain Regex for coverage:\n" + simplePattern);
101             warnln("Compact Regex for coverage:\n" + MinimizeRegex.compressWith(localesForNames, new UnicodeSet("[a-z]")));
102         }
103 
104         coverageLocales.addAll(SDI.getCLDRLanguageCodes());
105 
106         Map<String,Integer> official1M = getOfficial1M();
107         Set<String> official1MSet = new TreeSet<>();
108         for (String locale : official1M.keySet()) {
109             if (!localesForNames.contains(locale)) {
110                 official1MSet.add(locale);
111             }
112         }
113         warnln("Official with 1M+ speakers, need investigation of literacy: " + official1MSet);
114 
115 
116 //        assertContains("sdiLocales contains oldModernLocales", sdiLocales, oldModernLocales);
117 //        assertContains("oldModernLocales contains sdiLocales", oldModernLocales, sdiLocales);
118 
119         coverageLocales.removeAll(mainLocales);
120         coverageLocales.removeAll(additionsToTranslate);
121 
122         for (String locale : localesForNames) {
123             logln("\n" + locale + "\t" + ENGLISH.getName(locale));
124         }
125 
126         logln("\nmainLocales:" + composeList(mainLocales, "\n\t", new StringBuilder()));
127         logln("\nadditionsToTranslate:" + composeList(additionsToTranslate, "\n\t", new StringBuilder()));
128         logln("\noldModernLocales:" + composeList(coverageLocales, "\n\t", new StringBuilder()));
129     }
130 
getOfficial1M()131     private Map<String,Integer> getOfficial1M() {
132         Counter<String> counter = new Counter<>();
133         for (String region : SDI.getTerritoriesWithPopulationData()) {
134             for (String language : SDI.getLanguagesForTerritoryWithPopulationData(region)) {
135                 PopulationData popData = SDI.getLanguageAndTerritoryPopulationData(language, region);
136                 OfficialStatus status = popData.getOfficialStatus();
137                 if (status == OfficialStatus.unknown) {
138                     continue;
139                 }
140                 // we only care about names, so drop scripts
141                 int underbar = language.indexOf('_');
142                 if (underbar >= 0) {
143                     language = language.substring(0, underbar);
144                 }
145                 counter.add(language, (int) popData.getLiteratePopulation());
146             }
147         }
148         Map<String,Integer> result = new TreeMap<>();
149         for (String language : counter.keySet()) {
150             long litPop = counter.get(language);
151             if (litPop >= 1_000_000) {
152                 result.put(language, (int)litPop);
153             }
154 
155         }
156         return ImmutableMap.copyOf(result);
157     }
158 
composeList(Iterable<String> source, String separator, StringBuilder result)159     static final StringBuilder composeList(Iterable<String> source, String separator, StringBuilder result) {
160         String prefix = null;
161         for (String item : source) {
162             if (prefix == null || !item.startsWith(prefix)) {
163                 result.append(separator);
164                 prefix = item.substring(0,1); // only ascii
165             } else {
166                 result.append(' ');
167             }
168             result.append(item);
169         }
170         return result;
171     }
172 
assertContains(String title, Collection<String> set, Collection<String> subset)173     private boolean assertContains(String title, Collection<String> set, Collection<String> subset) {
174         boolean result = set.containsAll(subset);
175         if (!result) {
176             Set<String> temp = new LinkedHashSet<>(subset);
177             temp.removeAll(set);
178             Set<String> temp2 = new TreeSet<>();
179             for (String locale : temp) {
180                 temp2.add(locale + "\t" + ENGLISH.getName(locale));
181             }
182             warnln("Missing:\t" + temp.size() + "\n\t" + Joiner.on("\n\t").join(temp2));
183         }
184         assertTrue(title, result);
185         return result;
186     }
187 
188     /**
189      * Test whether there are any locales for the organization CLDR
190      */
TestCLDROrganizationPresence()191     public void TestCLDROrganizationPresence() {
192         Set<String> cldrLocales = sc.getLocaleCoverageLocales(
193             Organization.cldr, EnumSet.of(Level.MODERN));
194         assertNotNull("Expected CLDR modern locales not to be null",
195             cldrLocales);
196         assertTrue("Expected locales for CLDR, but found none.",
197             cldrLocales != null && !cldrLocales.isEmpty());
198     }
199 
200     /**
201      * Tests that cldr is a superset.
202      */
TestCldrSuperset()203     public void TestCldrSuperset() {
204         checkCldrLocales(Organization.apple, ERR);
205         checkCldrLocales(Organization.google, ERR);
206         checkCldrLocales(Organization.microsoft, WARN);
207     }
208 
209     static Set<String> SKIP_SUPERSET = ImmutableSet.of("to", "fo");
210 
checkCldrLocales(Organization organization, int warningLevel)211     private void checkCldrLocales(Organization organization, int warningLevel) {
212         // use a union, so that items can be higher
213         EnumSet<Level> modernModerate = EnumSet.of(Level.MODERATE, Level.MODERN);
214 
215         Set<String> orgLocalesModerate = sc.getLocaleCoverageLocales(organization, modernModerate);
216         Set<String> cldrLocalesModerate = sc.getLocaleCoverageLocales(Organization.cldr, modernModerate);
217         Set<String> failures = checkCldrLocalesSuperset(modernModerate, cldrLocalesModerate, organization, orgLocalesModerate, warningLevel,
218             SKIP_SUPERSET);
219 
220         EnumSet<Level> modernSet = EnumSet.of(Level.MODERN);
221         Set<String> orgLocalesModern = sc.getLocaleCoverageLocales(organization, modernSet);
222         Set<String> cldrLocalesModern = sc.getLocaleCoverageLocales(Organization.cldr, modernSet);
223         failures = new HashSet<>(failures);
224         failures.addAll(SKIP_SUPERSET);
225         checkCldrLocalesSuperset(modernSet, cldrLocalesModern, organization, orgLocalesModern, warningLevel, failures);
226     }
227 
checkCldrLocalesSuperset(Set<Level> level, Set<String> cldrLocales, Organization organization, Set<String> orgLocales, int warningLevel, Set<String> skip)228     private Set<String> checkCldrLocalesSuperset(Set<Level> level, Set<String> cldrLocales, Organization organization, Set<String> orgLocales, int warningLevel,
229         Set<String> skip) {
230         if (!cldrLocales.containsAll(orgLocales)) {
231             Set<String> diff2 = new LinkedHashSet<>(Sets.difference(orgLocales, cldrLocales));
232             diff2.removeAll(skip);
233             if (!diff2.isEmpty()) {
234                 String diffString = diff2.toString();
235                 String levelString = Joiner.on("+").join(level);
236                 for (String localeId : diff2) {
237                     diffString += "\n\t" + localeId + "\t" + CLDRConfig.getInstance().getEnglish().getName(localeId);
238                 }
239                 msg("The following " + organization.displayName + " " + levelString + " locales were absent from the "
240                     + Organization.cldr.displayName + " " + levelString + " locales:" + diffString,
241                     warningLevel, true, true);
242             }
243             return diff2;
244         }
245         return Collections.EMPTY_SET;
246     }
247 }
248