1 package org.unicode.cldr.unittest; 2 3 import java.util.Arrays; 4 import java.util.Collection; 5 import java.util.Collections; 6 import java.util.EnumSet; 7 import java.util.HashSet; 8 import java.util.LinkedHashSet; 9 import java.util.Map; 10 import java.util.Set; 11 import java.util.TreeMap; 12 import java.util.TreeSet; 13 14 import org.unicode.cldr.test.CoverageLevel2; 15 import org.unicode.cldr.tool.MinimizeRegex; 16 import org.unicode.cldr.util.CLDRConfig; 17 import org.unicode.cldr.util.CLDRFile; 18 import org.unicode.cldr.util.Counter; 19 import org.unicode.cldr.util.Factory; 20 import org.unicode.cldr.util.LanguageTagParser; 21 import org.unicode.cldr.util.Level; 22 import org.unicode.cldr.util.Organization; 23 import org.unicode.cldr.util.StandardCodes; 24 import org.unicode.cldr.util.StandardCodes.LstrType; 25 import org.unicode.cldr.util.SupplementalDataInfo; 26 import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus; 27 import org.unicode.cldr.util.SupplementalDataInfo.PopulationData; 28 import org.unicode.cldr.util.Validity; 29 import org.unicode.cldr.util.Validity.Status; 30 31 import com.google.common.base.Joiner; 32 import com.google.common.collect.ImmutableMap; 33 import com.google.common.collect.ImmutableSet; 34 import com.google.common.collect.Multimap; 35 import com.google.common.collect.Multimaps; 36 import com.google.common.collect.Sets; 37 import com.google.common.collect.TreeMultimap; 38 import com.ibm.icu.text.UnicodeSet; 39 40 public class TestCLDRLocaleCoverage extends TestFmwkPlus { 41 private static StandardCodes sc = StandardCodes.make(); 42 private static final CLDRConfig CLDRCONFIG = CLDRConfig.getInstance(); 43 private static final SupplementalDataInfo SDI = CLDRCONFIG.getSupplementalDataInfo(); 44 private static final CLDRFile ENGLISH = CLDRCONFIG.getEnglish(); 45 46 main(String[] args)47 public static void main(String[] args) { 48 new TestCLDRLocaleCoverage().run(args); 49 } 50 TestLanguageNameCoverage()51 public void TestLanguageNameCoverage() { 52 53 Set<String> additionsToTranslate = new TreeSet<>(Arrays.asList("zxx", "ceb", "ny", "co", "ht", "hmn", "la", "sm", "st", "sa", "mul")); 54 55 Map<String, Status> validity = Validity.getInstance().getCodeToStatus(LstrType.language); 56 Multimap<Status, String> statusToLang = Multimaps.invertFrom(Multimaps.forMap(validity), TreeMultimap.create()); 57 Set<String> regular = (Set<String>) statusToLang.get(Status.regular); 58 Set<String> regularPlus = ImmutableSet.<String>builder().addAll(regular).add("und").add("zxx").add("mul").build(); 59 Set<String> valid = validity.keySet(); 60 61 Factory factory = CLDRCONFIG.getCldrFactory(); 62 Set<String> mainLocales = new LinkedHashSet<>(); 63 LanguageTagParser ltp = new LanguageTagParser(); 64 for (String locale : factory.getAvailableLanguages()) { 65 String language = ltp.set(locale).getLanguage(); 66 if (language.equals("root")) language = "und"; 67 mainLocales.add(language); 68 } 69 mainLocales = ImmutableSet.copyOf(mainLocales); 70 Set<String> localesForNames = new TreeSet<>(); 71 localesForNames.addAll(mainLocales); 72 localesForNames.addAll(additionsToTranslate); 73 localesForNames = ImmutableSet.copyOf(localesForNames); 74 75 assertContains("regularPlus.containsAll(mainLocales)", regularPlus, localesForNames); 76 77 CoverageLevel2 coverageLeveler = CoverageLevel2.getInstance("und"); 78 Multimap<Level, String> levelToLanguage = TreeMultimap.create(); 79 for (String locale : valid) { 80 String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, locale); 81 Level level = coverageLeveler.getLevel(path); 82 levelToLanguage.put(level, locale); 83 } 84 85 Set<String> coverageLocales = new TreeSet<>(); 86 for (Level level : Level.values()) { 87 if (level == Level.COMPREHENSIVE) { 88 continue; 89 } 90 //assertContains("mainLocales.containsAll(coverage:" + level + ")", localesForNames, levelToLanguage.get(level)); 91 coverageLocales.addAll(levelToLanguage.get(level)); 92 } 93 94 // If this fails, it is because of a mismatch between coverage and the getCLDRLanguageCodes. 95 // Usually a problem with coverage. 96 boolean showRegex = !assertContains("localesForNames.containsAll(coverageLocales)", localesForNames, coverageLocales); 97 showRegex |= !assertContains("coverageLocales.containsAll(localesForNames)", coverageLocales, localesForNames); 98 if (showRegex || true) { 99 String simplePattern = MinimizeRegex.simplePattern(localesForNames); 100 warnln("Plain Regex for coverage:\n" + simplePattern); 101 warnln("Compact Regex for coverage:\n" + MinimizeRegex.compressWith(localesForNames, new UnicodeSet("[a-z]"))); 102 } 103 104 coverageLocales.addAll(SDI.getCLDRLanguageCodes()); 105 106 Map<String,Integer> official1M = getOfficial1M(); 107 Set<String> official1MSet = new TreeSet<>(); 108 for (String locale : official1M.keySet()) { 109 if (!localesForNames.contains(locale)) { 110 official1MSet.add(locale); 111 } 112 } 113 warnln("Official with 1M+ speakers, need investigation of literacy: " + official1MSet); 114 115 116 // assertContains("sdiLocales contains oldModernLocales", sdiLocales, oldModernLocales); 117 // assertContains("oldModernLocales contains sdiLocales", oldModernLocales, sdiLocales); 118 119 coverageLocales.removeAll(mainLocales); 120 coverageLocales.removeAll(additionsToTranslate); 121 122 for (String locale : localesForNames) { 123 logln("\n" + locale + "\t" + ENGLISH.getName(locale)); 124 } 125 126 logln("\nmainLocales:" + composeList(mainLocales, "\n\t", new StringBuilder())); 127 logln("\nadditionsToTranslate:" + composeList(additionsToTranslate, "\n\t", new StringBuilder())); 128 logln("\noldModernLocales:" + composeList(coverageLocales, "\n\t", new StringBuilder())); 129 } 130 getOfficial1M()131 private Map<String,Integer> getOfficial1M() { 132 Counter<String> counter = new Counter<>(); 133 for (String region : SDI.getTerritoriesWithPopulationData()) { 134 for (String language : SDI.getLanguagesForTerritoryWithPopulationData(region)) { 135 PopulationData popData = SDI.getLanguageAndTerritoryPopulationData(language, region); 136 OfficialStatus status = popData.getOfficialStatus(); 137 if (status == OfficialStatus.unknown) { 138 continue; 139 } 140 // we only care about names, so drop scripts 141 int underbar = language.indexOf('_'); 142 if (underbar >= 0) { 143 language = language.substring(0, underbar); 144 } 145 counter.add(language, (int) popData.getLiteratePopulation()); 146 } 147 } 148 Map<String,Integer> result = new TreeMap<>(); 149 for (String language : counter.keySet()) { 150 long litPop = counter.get(language); 151 if (litPop >= 1_000_000) { 152 result.put(language, (int)litPop); 153 } 154 155 } 156 return ImmutableMap.copyOf(result); 157 } 158 composeList(Iterable<String> source, String separator, StringBuilder result)159 static final StringBuilder composeList(Iterable<String> source, String separator, StringBuilder result) { 160 String prefix = null; 161 for (String item : source) { 162 if (prefix == null || !item.startsWith(prefix)) { 163 result.append(separator); 164 prefix = item.substring(0,1); // only ascii 165 } else { 166 result.append(' '); 167 } 168 result.append(item); 169 } 170 return result; 171 } 172 assertContains(String title, Collection<String> set, Collection<String> subset)173 private boolean assertContains(String title, Collection<String> set, Collection<String> subset) { 174 boolean result = set.containsAll(subset); 175 if (!result) { 176 Set<String> temp = new LinkedHashSet<>(subset); 177 temp.removeAll(set); 178 Set<String> temp2 = new TreeSet<>(); 179 for (String locale : temp) { 180 temp2.add(locale + "\t" + ENGLISH.getName(locale)); 181 } 182 warnln("Missing:\t" + temp.size() + "\n\t" + Joiner.on("\n\t").join(temp2)); 183 } 184 assertTrue(title, result); 185 return result; 186 } 187 188 /** 189 * Test whether there are any locales for the organization CLDR 190 */ TestCLDROrganizationPresence()191 public void TestCLDROrganizationPresence() { 192 Set<String> cldrLocales = sc.getLocaleCoverageLocales( 193 Organization.cldr, EnumSet.of(Level.MODERN)); 194 assertNotNull("Expected CLDR modern locales not to be null", 195 cldrLocales); 196 assertTrue("Expected locales for CLDR, but found none.", 197 cldrLocales != null && !cldrLocales.isEmpty()); 198 } 199 200 /** 201 * Tests that cldr is a superset. 202 */ TestCldrSuperset()203 public void TestCldrSuperset() { 204 checkCldrLocales(Organization.apple, ERR); 205 checkCldrLocales(Organization.google, ERR); 206 checkCldrLocales(Organization.microsoft, WARN); 207 } 208 209 static Set<String> SKIP_SUPERSET = ImmutableSet.of("to", "fo"); 210 checkCldrLocales(Organization organization, int warningLevel)211 private void checkCldrLocales(Organization organization, int warningLevel) { 212 // use a union, so that items can be higher 213 EnumSet<Level> modernModerate = EnumSet.of(Level.MODERATE, Level.MODERN); 214 215 Set<String> orgLocalesModerate = sc.getLocaleCoverageLocales(organization, modernModerate); 216 Set<String> cldrLocalesModerate = sc.getLocaleCoverageLocales(Organization.cldr, modernModerate); 217 Set<String> failures = checkCldrLocalesSuperset(modernModerate, cldrLocalesModerate, organization, orgLocalesModerate, warningLevel, 218 SKIP_SUPERSET); 219 220 EnumSet<Level> modernSet = EnumSet.of(Level.MODERN); 221 Set<String> orgLocalesModern = sc.getLocaleCoverageLocales(organization, modernSet); 222 Set<String> cldrLocalesModern = sc.getLocaleCoverageLocales(Organization.cldr, modernSet); 223 failures = new HashSet<>(failures); 224 failures.addAll(SKIP_SUPERSET); 225 checkCldrLocalesSuperset(modernSet, cldrLocalesModern, organization, orgLocalesModern, warningLevel, failures); 226 } 227 checkCldrLocalesSuperset(Set<Level> level, Set<String> cldrLocales, Organization organization, Set<String> orgLocales, int warningLevel, Set<String> skip)228 private Set<String> checkCldrLocalesSuperset(Set<Level> level, Set<String> cldrLocales, Organization organization, Set<String> orgLocales, int warningLevel, 229 Set<String> skip) { 230 if (!cldrLocales.containsAll(orgLocales)) { 231 Set<String> diff2 = new LinkedHashSet<>(Sets.difference(orgLocales, cldrLocales)); 232 diff2.removeAll(skip); 233 if (!diff2.isEmpty()) { 234 String diffString = diff2.toString(); 235 String levelString = Joiner.on("+").join(level); 236 for (String localeId : diff2) { 237 diffString += "\n\t" + localeId + "\t" + CLDRConfig.getInstance().getEnglish().getName(localeId); 238 } 239 msg("The following " + organization.displayName + " " + levelString + " locales were absent from the " 240 + Organization.cldr.displayName + " " + levelString + " locales:" + diffString, 241 warningLevel, true, true); 242 } 243 return diff2; 244 } 245 return Collections.EMPTY_SET; 246 } 247 } 248