package org.unicode.cldr.unittest;

import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import org.junit.jupiter.api.DisplayNameGenerator.Standard;
import org.unicode.cldr.test.CoverageLevel2;
import org.unicode.cldr.tool.MinimizeRegex;
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.Counter;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.LanguageTagParser;
import org.unicode.cldr.util.Level;
import org.unicode.cldr.util.Organization;
import org.unicode.cldr.util.StandardCodes;
import org.unicode.cldr.util.StandardCodes.LstrType;
import org.unicode.cldr.util.SupplementalDataInfo;
import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus;
import org.unicode.cldr.util.SupplementalDataInfo.PopulationData;
import org.unicode.cldr.util.Validity;
import org.unicode.cldr.util.Validity.Status;

import com.google.common.base.Joiner;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
import com.google.common.collect.Sets;
import com.google.common.collect.TreeMultimap;
import com.ibm.icu.text.UnicodeSet;

/**
 * Tests that compare sets of language/locale codes against each other: the languages of the
 * available CLDR locales versus the languages that have a coverage level below
 * {@link Level#COMPREHENSIVE}, and the locales listed for individual organizations versus the
 * locales listed for {@link Organization#cldr}.
 */
public class TestCLDRLocaleCoverage extends TestFmwkPlus {
    private static final StandardCodes sc = StandardCodes.make();
    private static final CLDRConfig CLDRCONFIG = CLDRConfig.getInstance();
    private static final SupplementalDataInfo SDI = CLDRCONFIG.getSupplementalDataInfo();
    private static final CLDRFile ENGLISH = CLDRCONFIG.getEnglish();

    public static void main(String[] args) {
        new TestCLDRLocaleCoverage().run(args);
    }

    /**
     * Checks that the languages needing names (languages of the available locales plus a fixed
     * list of additions) are all regular language codes (or und/zxx/mul), and that this set is
     * exactly the set of language codes whose language-name path has a coverage level below
     * {@link Level#COMPREHENSIVE} for the locale "und".
     */
    public void TestLanguageNameCoverage() {
        // not sure why we need rhg here, since it is in seed it should be in mainLocales
        Set<String> additionsToTranslate = new TreeSet<>(
            Arrays.asList("zxx", "ceb", "ny", "co", "ht", "hmn", "la", "sm", "st", "sa", "mul", "rhg"));

        Map<String, Status> validity = Validity.getInstance().getCodeToStatus(LstrType.language);
        Multimap<Status, String> statusToLang = Multimaps.invertFrom(Multimaps.forMap(validity), TreeMultimap.create());
        Set<String> regular = (Set<String>) statusToLang.get(Status.regular);
        Set<String> regularPlus = ImmutableSet.<String>builder()
            .addAll(regular)
            .add("und")
            .add("zxx")
            .add("mul")
            .build();
        Set<String> valid = validity.keySet();

        // Collect the language subtag of every available locale; "root" is represented as "und".
        Factory factory = CLDRCONFIG.getCldrFactory();
        Set<String> mainLocales = new LinkedHashSet<>();
        LanguageTagParser ltp = new LanguageTagParser();
        for (String locale : factory.getAvailableLanguages()) {
            String language = ltp.set(locale).getLanguage();
            if (language.equals("root")) {
                language = "und";
            }
            mainLocales.add(language);
        }
        mainLocales = ImmutableSet.copyOf(mainLocales);

        Set<String> localesForNames = new TreeSet<>();
        localesForNames.addAll(mainLocales);
        localesForNames.addAll(additionsToTranslate);
        localesForNames = ImmutableSet.copyOf(localesForNames);

        // The title names the sets that are actually compared (it previously said mainLocales).
        assertContains("regularPlus.containsAll(localesForNames)", regularPlus, localesForNames);

        // Bucket every valid language code by the coverage level of its language-name path.
        CoverageLevel2 coverageLeveler = CoverageLevel2.getInstance("und");
        Multimap<Level, String> levelToLanguage = TreeMultimap.create();
        for (String locale : valid) {
            String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, locale);
            Level level = coverageLeveler.getLevel(path);
            levelToLanguage.put(level, locale);
        }

        Set<String> coverageLocales = new TreeSet<>();
        for (Level level : Level.values()) {
            if (level == Level.COMPREHENSIVE) {
                continue;
            }
            //assertContains("mainLocales.containsAll(coverage:" + level + ")", localesForNames, levelToLanguage.get(level));
            coverageLocales.addAll(levelToLanguage.get(level));
        }

        // If this fails, it is because of a mismatch between coverage and the getCLDRLanguageCodes.
        // Usually a problem with coverage.
        boolean showRegex = !assertContains("localesForNames.containsAll(coverageLocales)", localesForNames, coverageLocales);
        showRegex |= !assertContains(
            "coverageLocales.containsAll(localesForNames) - add to %language80 or lower under coverageLevels.xml?",
            coverageLocales, localesForNames);
        // NOTE(review): "|| true" makes the regex output unconditional, so showRegex has no effect;
        // presumably a debugging aid -- confirm before removing.
        if (showRegex || true) {
            String simplePattern = MinimizeRegex.simplePattern(localesForNames);
            warnln("Plain Regex for coverage:\n" + simplePattern);
            warnln("Compact Regex for coverage:\n" + MinimizeRegex.compressWith(localesForNames, new UnicodeSet("[a-z]")));
        }

        coverageLocales.addAll(SDI.getCLDRLanguageCodes());

        // Languages returned by getOfficial1M() that are not yet in localesForNames.
        Set<String> official1MSet = new TreeSet<>(getOfficial1M().keySet());
        official1MSet.removeAll(localesForNames);
        warnln("Official with 1M+ speakers, need investigation of literacy: " + official1MSet);

        // assertContains("sdiLocales contains oldModernLocales", sdiLocales, oldModernLocales);
        // assertContains("oldModernLocales contains sdiLocales", oldModernLocales, sdiLocales);

        coverageLocales.removeAll(mainLocales);
        coverageLocales.removeAll(additionsToTranslate);

        for (String locale : localesForNames) {
            logln("\n" + locale + "\t" + ENGLISH.getName(locale));
        }

        logln("\nmainLocales:" + composeList(mainLocales, "\n\t", new StringBuilder()));
        logln("\nadditionsToTranslate:" + composeList(additionsToTranslate, "\n\t", new StringBuilder()));
        logln("\noldModernLocales:" + composeList(coverageLocales, "\n\t", new StringBuilder()));
    }

    /**
     * Sums the literate population per language (script suffix after '_' dropped) over all
     * territories with population data, skipping entries whose official status is
     * {@link OfficialStatus#unknown}, and keeps the languages whose total is at least one million.
     *
     * @return immutable map, in sorted key order, from language code to total literate population;
     *         totals above {@link Integer#MAX_VALUE} are reported as {@link Integer#MAX_VALUE}
     */
    private Map<String,Integer> getOfficial1M() {
        Counter<String> counter = new Counter<>();
        for (String region : SDI.getTerritoriesWithPopulationData()) {
            for (String language : SDI.getLanguagesForTerritoryWithPopulationData(region)) {
                PopulationData popData = SDI.getLanguageAndTerritoryPopulationData(language, region);
                OfficialStatus status = popData.getOfficialStatus();
                if (status == OfficialStatus.unknown) {
                    continue;
                }
                // we only care about names, so drop scripts
                int underbar = language.indexOf('_');
                if (underbar >= 0) {
                    language = language.substring(0, underbar);
                }
                // Accumulate as long; narrowing to int here would only lose range.
                counter.add(language, (long) popData.getLiteratePopulation());
            }
        }
        Map<String,Integer> result = new TreeMap<>();
        for (String language : counter.keySet()) {
            long litPop = counter.get(language);
            if (litPop >= 1_000_000) {
                // Clamp instead of a plain (int) cast, which would wrap for totals above 2^31-1.
                result.put(language, (int) Math.min(litPop, Integer.MAX_VALUE));
            }
        }
        return ImmutableMap.copyOf(result);
    }

    /**
     * Appends the items to {@code result}, grouping consecutive items that start with the same
     * first character: the first item of each group is preceded by {@code separator}, the other
     * items of the group by a single space.
     *
     * @param source items to append; each item must be non-empty, since its first character is
     *        taken with {@code substring(0, 1)}
     * @param separator text written before the first item of each group
     * @param result builder to append to
     * @return {@code result}, for chaining
     */
    static final StringBuilder composeList(Iterable<String> source, String separator, StringBuilder result) {
        String prefix = null;
        for (String item : source) {
            if (prefix == null || !item.startsWith(prefix)) {
                result.append(separator);
                prefix = item.substring(0, 1); // only ascii
            } else {
                result.append(' ');
            }
            result.append(item);
        }
        return result;
    }

    /**
     * Reports an error via {@code errln} if {@code set} does not contain every element of
     * {@code subset}. The message lists the count of missing items and each missing item together
     * with its English name, sorted.
     *
     * @param title prefix for the error message
     * @param set collection expected to contain everything in {@code subset}
     * @param subset items that should all be present in {@code set}
     * @return {@code true} if nothing is missing
     */
    private boolean assertContains(String title, Collection<String> set, Collection<String> subset) {
        boolean result = set.containsAll(subset);
        if (!result) {
            Set<String> missing = new LinkedHashSet<>(subset);
            missing.removeAll(set);
            Set<String> missingWithNames = new TreeSet<>();
            for (String locale : missing) {
                missingWithNames.add(locale + "\t" + ENGLISH.getName(locale));
            }
            errln(title + ": Missing:\t" + missing.size() + "\n\t" + Joiner.on("\n\t").join(missingWithNames));
        }
        return result;
    }

    /**
     * Test whether there are any locales for the organization CLDR
     */
    public void TestCLDROrganizationPresence() {
        Set<String> cldrLocales = sc.getLocaleCoverageLocales(
            Organization.cldr, EnumSet.of(Level.MODERN));
        assertNotNull("Expected CLDR modern locales not to be null",
            cldrLocales);
        assertTrue("Expected locales for CLDR, but found none.",
            cldrLocales != null && !cldrLocales.isEmpty());
    }

    /**
     * Tests that cldr is a superset.
     */
    public void TestCldrSuperset() {
        checkCldrLocales(Organization.apple, ERR);
        checkCldrLocales(Organization.google, ERR);
        checkCldrLocales(Organization.microsoft, WARN);
    }

    /**
     * Set of locales which are _excluded_ from Cldr-is-a-superset tests
     */
    static final Set<String> SKIP_SUPERSET = ImmutableSet.of(
        // Microsoft locales?
        "to", "fo",
        // Apple locales
        "aa", "bo", "cad", "kl", "kok", "lb", "pcm", "tg", "tt",
        // Google locales
        "aa", "br", "gd", "ia", "kea", "nqo", "oc", "sc",
        // Skip "*"
        StandardCodes.ALL_LOCALES
    );

    /**
     * Checks the organization's locales against the CLDR organization's locales twice: first for
     * the union of MODERATE and MODERN, then for MODERN alone. Locales already reported by the
     * first check are added to the skip set of the second, so they are not reported again.
     *
     * @param organization organization whose locales are checked
     * @param warningLevel level passed through to {@code msg} when locales are missing
     */
    private void checkCldrLocales(Organization organization, int warningLevel) {
        // use a union, so that items can be higher
        EnumSet<Level> modernModerate = EnumSet.of(Level.MODERATE, Level.MODERN);

        Set<String> orgLocalesModerate = sc.getLocaleCoverageLocales(organization, modernModerate);
        Set<String> cldrLocalesModerate = sc.getLocaleCoverageLocales(Organization.cldr, modernModerate);
        Set<String> failures = checkCldrLocalesSuperset(modernModerate, cldrLocalesModerate, organization,
            orgLocalesModerate, warningLevel, SKIP_SUPERSET);

        EnumSet<Level> modernSet = EnumSet.of(Level.MODERN);
        Set<String> orgLocalesModern = sc.getLocaleCoverageLocales(organization, modernSet);
        Set<String> cldrLocalesModern = sc.getLocaleCoverageLocales(Organization.cldr, modernSet);
        // Copy before adding: the returned set may be the immutable empty set.
        Set<String> skipForModern = new HashSet<>(failures);
        skipForModern.addAll(SKIP_SUPERSET);
        checkCldrLocalesSuperset(modernSet, cldrLocalesModern, organization, orgLocalesModern, warningLevel,
            skipForModern);
    }

    /**
     * Reports, via {@code msg} at {@code warningLevel}, the organization locales that are absent
     * from {@code cldrLocales} and not listed in {@code skip}.
     *
     * @param level coverage levels being compared; used only to build the message
     * @param cldrLocales locales of the CLDR organization at those levels
     * @param organization organization being checked; used only to build the message
     * @param orgLocales locales of {@code organization} at those levels
     * @param warningLevel level passed to {@code msg}
     * @param skip locales to leave out of the report
     * @return the missing locales after removing {@code skip} (possibly empty), or an empty set
     *         if {@code cldrLocales} contains all of {@code orgLocales}
     */
    private Set<String> checkCldrLocalesSuperset(Set<Level> level, Set<String> cldrLocales, Organization organization,
        Set<String> orgLocales, int warningLevel, Set<String> skip) {
        if (cldrLocales.containsAll(orgLocales)) {
            return Collections.emptySet();
        }
        Set<String> missing = new LinkedHashSet<>(Sets.difference(orgLocales, cldrLocales));
        missing.removeAll(skip);
        if (!missing.isEmpty()) {
            String levelString = Joiner.on("+").join(level);
            // Builder instead of repeated String concatenation in the loop.
            StringBuilder diffString = new StringBuilder(missing.toString());
            for (String localeId : missing) {
                diffString.append("\n\t").append(localeId).append("\t").append(ENGLISH.getName(localeId));
            }
            msg("The following " + organization.displayName + " " + levelString + " locales were absent from the "
                + Organization.cldr.displayName + " " + levelString + " locales:" + diffString,
                warningLevel, true, true);
        }
        return missing;
    }
}