1 package org.unicode.cldr.unittest; 2 3 import java.util.Collection; 4 import java.util.Collections; 5 import java.util.EnumSet; 6 import java.util.LinkedHashSet; 7 import java.util.Map; 8 import java.util.Map.Entry; 9 import java.util.Set; 10 import java.util.TreeMap; 11 import java.util.TreeSet; 12 import java.util.stream.Collectors; 13 14 import org.unicode.cldr.test.CoverageLevel2; 15 import org.unicode.cldr.tool.MinimizeRegex; 16 import org.unicode.cldr.util.CLDRConfig; 17 import org.unicode.cldr.util.CLDRFile; 18 import org.unicode.cldr.util.CLDRLocale; 19 import org.unicode.cldr.util.Counter; 20 import org.unicode.cldr.util.Factory; 21 import org.unicode.cldr.util.LanguageTagParser; 22 import org.unicode.cldr.util.Level; 23 import org.unicode.cldr.util.LocaleIDParser; 24 import org.unicode.cldr.util.Organization; 25 import org.unicode.cldr.util.StandardCodes; 26 import org.unicode.cldr.util.StandardCodes.LstrType; 27 import org.unicode.cldr.util.SupplementalDataInfo; 28 import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus; 29 import org.unicode.cldr.util.SupplementalDataInfo.PopulationData; 30 import org.unicode.cldr.util.Validity; 31 import org.unicode.cldr.util.Validity.Status; 32 33 import com.google.common.base.Joiner; 34 import com.google.common.collect.ImmutableMap; 35 import com.google.common.collect.ImmutableSet; 36 import com.google.common.collect.ImmutableSortedSet; 37 import com.google.common.collect.Multimap; 38 import com.google.common.collect.Multimaps; 39 import com.google.common.collect.Sets; 40 import com.google.common.collect.TreeMultimap; 41 import com.ibm.icu.text.UnicodeSet; 42 43 public class TestCLDRLocaleCoverage extends TestFmwkPlus { 44 private static StandardCodes sc = StandardCodes.make(); 45 private static final CLDRConfig CLDRCONFIG = CLDRConfig.getInstance(); 46 private static final SupplementalDataInfo SDI = CLDRCONFIG.getSupplementalDataInfo(); 47 private static final CLDRFile ENGLISH = CLDRCONFIG.getEnglish(); 48 49 main(String[] args)50 public static void main(String[] args) { 51 new TestCLDRLocaleCoverage().run(args); 52 } 53 TestLanguageNameCoverage()54 public void TestLanguageNameCoverage() { 55 // mainLocales has the locales in common/main, which is basically the set in attributeValueValidity.xml $language.. 56 // We add in additionsToTranslate below the set in attributeValueValidity.xml $languageExceptions 57 // (both sets are included in SDI.getCLDRLanguageCodes() but we do not use that until later). 58 Set<String> additionsToTranslate = ImmutableSortedSet.of("zxx", "mul", 59 "ab", "ace", "ada", "ady", "ain", "ale", "alt", "an", "anp", "arn", "arp", "ars", "atj", "av", "awa", "ay", 60 "ba", "ban", "bho", "bi", "bin", "bla", "bug", "byn", 61 "cay", "ch", "chk", "chm", "cho", "chp", "chy", "clc", "co", "crg", "crj", "crk", "crl", "crm", "crr", "csw", "cv", 62 "dak", "dar", "dgr", "dv", "dzg", 63 "efi", "eka", 64 "fj", "fon", "frc", 65 "gaa", "gez", "gil", "gn", "gor", "gwi", 66 "hai", "hax", "hil", "hmn", "ht", "hup", "hur", "hz", 67 "iba", "ibb", "ikt", "ilo", "inh", "io", "iu", 68 "jbo", 69 "kac", "kaj", "kbd", "kcg", "kfo", "kha", "kj", "kmb", "kpe", "kr", "krc", "krl", "kru", "kum", "kv", "kwk", 70 "la", "lad", "lez", "li", "lil", "lou", "loz", "lsm", "lua", "lun", "lus", 71 "mad", "mag", "mak", "mdf", "men", "mh", "mic", "min", "moe", "moh", "mos", "mus", "mwl", "myv", 72 "na", "nap", "new", "ng", "nia", "niu", "nog", "nqo", "nr", "nso", "nv", "ny", 73 "oc", "ojb", "ojc", "ojs", "ojw", "oka", 74 "pag", "pam", "pap", "pau", "pqm", 75 "rap", "rar", "rhg", "rup", 76 "sad", "sba", "scn", "sco", "shn", "slh", "sm", "snk", "srn", "ss", "st", "str", "suk", "swb", "syr", 77 "tce", "tem", "tet", "tgx", "tht", "tig", "tlh", "tli", "tn", "tpi", "trv", "ts", "ttm", "tum", "tvl", "ty", "tyv", 78 "udm", "umb", 79 "ve", 80 "wa", "wal", "war", "wuu", 81 "xal", 82 "ybb", 83 "zun", "zza" ); 84 85 warnln("Locale names added for translation; revisit each release:\n" 86 + Joiner.on("\n") 87 .join(additionsToTranslate.stream().map(x -> x + "\t(" + ENGLISH.getName(x) + ")").collect(Collectors.toList()))); 88 89 Map<String, Status> validity = Validity.getInstance().getCodeToStatus(LstrType.language); 90 Multimap<Status, String> statusToLang = Multimaps.invertFrom(Multimaps.forMap(validity), TreeMultimap.create()); 91 Set<String> regular = (Set<String>) statusToLang.get(Status.regular); 92 Set<String> regularPlus = ImmutableSet.<String>builder().addAll(regular).add("und").add("zxx").add("mul").build(); 93 Set<String> valid = validity.keySet(); 94 95 Factory factory = CLDRCONFIG.getCldrFactory(); 96 Set<String> mainLocales = new LinkedHashSet<>(); 97 LanguageTagParser ltp = new LanguageTagParser(); 98 for (String locale : factory.getAvailableLanguages()) { 99 String language = ltp.set(locale).getLanguage(); 100 if (language.equals("root")) language = "und"; 101 mainLocales.add(language); 102 } 103 mainLocales = ImmutableSet.copyOf(mainLocales); 104 Set<String> localesForNames = new TreeSet<>(); 105 localesForNames.addAll(mainLocales); 106 localesForNames.addAll(additionsToTranslate); 107 localesForNames = ImmutableSet.copyOf(localesForNames); 108 109 assertContains("regularPlus.containsAll(mainLocales)", regularPlus, localesForNames); 110 111 CoverageLevel2 coverageLeveler = CoverageLevel2.getInstance("und"); 112 Multimap<Level, String> levelToLanguage = TreeMultimap.create(); 113 for (String locale : valid) { 114 String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, locale); 115 Level level = coverageLeveler.getLevel(path); 116 levelToLanguage.put(level, locale); 117 } 118 119 Set<String> coverageLocales = new TreeSet<>(); 120 for (Level level : Level.values()) { 121 if (level == Level.COMPREHENSIVE) { 122 continue; 123 } 124 //assertContains("mainLocales.containsAll(coverage:" + level + ")", localesForNames, levelToLanguage.get(level)); 125 coverageLocales.addAll(levelToLanguage.get(level)); 126 } 127 if (logKnownIssue("CLDR-15888", "modern coverage not yet updated for bgc, raj")) { 128 coverageLocales.add("bgc"); 129 coverageLocales.add("raj"); 130 } 131 132 // If this fails, it is because of a mismatch between coverage and the getCLDRLanguageCodes. 133 // Usually a problem with coverage. 134 boolean showRegex = !assertContains("localesForNames.containsAll(coverageLocales)", localesForNames, coverageLocales); 135 showRegex |= !assertContains("coverageLocales.containsAll(localesForNames) - add to %language80 or lower under coverageLevels.xml?", coverageLocales, localesForNames); 136 if (showRegex || true) { 137 String simplePattern = MinimizeRegex.simplePattern(localesForNames); 138 warnln("Plain Regex for coverage:\n" + simplePattern); 139 warnln("Compact Regex for coverage:\n" + MinimizeRegex.compressWith(localesForNames, new UnicodeSet("[a-z]"))); 140 } 141 142 coverageLocales.addAll(SDI.getCLDRLanguageCodes()); 143 144 Map<String,Integer> official1M = getOfficial1M(); 145 Set<String> official1MSet = new TreeSet<>(); 146 for (String locale : official1M.keySet()) { 147 if (!localesForNames.contains(locale)) { 148 official1MSet.add(locale); 149 } 150 } 151 warnln("Official with 1M+ speakers, need investigation of literacy: " + official1MSet); 152 153 154 // assertContains("sdiLocales contains oldModernLocales", sdiLocales, oldModernLocales); 155 // assertContains("oldModernLocales contains sdiLocales", oldModernLocales, sdiLocales); 156 157 coverageLocales.removeAll(mainLocales); 158 coverageLocales.removeAll(additionsToTranslate); 159 160 for (String locale : localesForNames) { 161 logln("\n" + locale + "\t" + ENGLISH.getName(locale)); 162 } 163 164 logln("\nmainLocales:" + composeList(mainLocales, "\n\t", new StringBuilder())); 165 logln("\nadditionsToTranslate:" + composeList(additionsToTranslate, "\n\t", new StringBuilder())); 166 logln("\noldModernLocales:" + composeList(coverageLocales, "\n\t", new StringBuilder())); 167 } 168 getOfficial1M()169 private Map<String,Integer> getOfficial1M() { 170 Counter<String> counter = new Counter<>(); 171 for (String region : SDI.getTerritoriesWithPopulationData()) { 172 for (String language : SDI.getLanguagesForTerritoryWithPopulationData(region)) { 173 PopulationData popData = SDI.getLanguageAndTerritoryPopulationData(language, region); 174 OfficialStatus status = popData.getOfficialStatus(); 175 if (status == OfficialStatus.unknown) { 176 continue; 177 } 178 // we only care about names, so drop scripts 179 int underbar = language.indexOf('_'); 180 if (underbar >= 0) { 181 language = language.substring(0, underbar); 182 } 183 counter.add(language, (int) popData.getLiteratePopulation()); 184 } 185 } 186 Map<String,Integer> result = new TreeMap<>(); 187 for (String language : counter.keySet()) { 188 long litPop = counter.get(language); 189 if (litPop >= 1_000_000) { 190 result.put(language, (int)litPop); 191 } 192 193 } 194 return ImmutableMap.copyOf(result); 195 } 196 composeList(Iterable<String> source, String separator, StringBuilder result)197 static final StringBuilder composeList(Iterable<String> source, String separator, StringBuilder result) { 198 String prefix = null; 199 for (String item : source) { 200 if (prefix == null || !item.startsWith(prefix)) { 201 result.append(separator); 202 prefix = item.substring(0,1); // only ascii 203 } else { 204 result.append(' '); 205 } 206 result.append(item); 207 } 208 return result; 209 } 210 assertContains(String title, Collection<String> set, Collection<String> subset)211 private boolean assertContains(String title, Collection<String> set, Collection<String> subset) { 212 boolean result = set.containsAll(subset); 213 if (!result) { 214 Set<String> temp = new LinkedHashSet<>(subset); 215 temp.removeAll(set); 216 Set<String> temp2 = new TreeSet<>(); 217 for (String locale : temp) { 218 temp2.add(locale + "\t" + ENGLISH.getName(locale)); 219 } 220 errln(title + ": Missing:\t" + temp.size() + "\n\t" + Joiner.on("\n\t").join(temp2)); 221 } 222 return result; 223 } 224 225 /** 226 * Test whether there are any locales for the organization CLDR 227 */ TestCLDROrganizationPresence()228 public void TestCLDROrganizationPresence() { 229 Set<String> cldrLocales = sc.getLocaleCoverageLocales( 230 Organization.cldr, EnumSet.of(Level.MODERN)); 231 assertNotNull("Expected CLDR modern locales not to be null", 232 cldrLocales); 233 assertTrue("Expected locales for CLDR, but found none.", 234 cldrLocales != null && !cldrLocales.isEmpty()); 235 } 236 237 /** 238 * Tests that cldr+special is a superset of the TC locales, with the right levels 239 */ TestCldrSuperset()240 public void TestCldrSuperset() { 241 Map<String, Level> apple = sc.getLocalesToLevelsFor(Organization.apple); 242 Map<String, Level> google = sc.getLocalesToLevelsFor(Organization.google); 243 Map<String, Level> microsoft = sc.getLocalesToLevelsFor(Organization.microsoft); 244 Map<String, Level> special = sc.getLocalesToLevelsFor(Organization.special); 245 246 Map<String, Level> cldr = sc.getLocalesToLevelsFor(Organization.cldr); 247 248 // check that the cldr locales (+ special) have the max level of the TC locales 249 250 for (Entry<String, Level> entry : cldr.entrySet()) { 251 String locale = entry.getKey(); 252 Level cldrLevel = entry.getValue(); 253 Level appleLevel = apple.get(locale); 254 Level googleLevel = google.get(locale); 255 Level microsoftLevel = microsoft.get(locale); 256 Level specialLevel = special.get(locale); 257 258 // check the 8 vote count 259 260 int count = getLevelCount(appleLevel) 261 + getLevelCount(googleLevel) 262 + getLevelCount(microsoftLevel) 263 ; 264 int defaultVotes = SupplementalDataInfo.getInstance().getRequiredVotes(CLDRLocale.getInstance(locale), null); 265 assertEquals("8 votes for " + locale + " at " + cldrLevel, count > 2 && cldrLevel.compareTo(Level.MODERN) >= 0, defaultVotes == 8); 266 267 // check the max level 268 269 Level maxLevel = Level.max(appleLevel, googleLevel, microsoftLevel, specialLevel); 270 assertEquals("cldr level = max for " + locale + " (" + ENGLISH.getName(locale) + ")", cldrLevel, maxLevel); 271 } 272 273 // check that the cldr locales include all of the other locale's 274 275 checkCldrContains("cldr", cldr, "apple", apple); 276 checkCldrContains("cldr", cldr, "google", google); 277 checkCldrContains("cldr", cldr, "microsoft", microsoft); 278 checkCldrContains("cldr", cldr, "special", apple); 279 280 // check that special doesn't overlap with TC, except for generated locales 281 282 checkDisjoint("special", special, "apple", apple); 283 checkDisjoint("special", special, "google", google); 284 checkDisjoint("special", special, "microsoft", microsoft); 285 } 286 getLevelCount(Level appleLevel)287 private int getLevelCount(Level appleLevel) { 288 return appleLevel == null ? 0 289 : appleLevel.compareTo(Level.MODERN) >= 0 ? 1 : 0; 290 } 291 292 private static final Set<String> ANY_LOCALE_SET = ImmutableSet.of("*"); 293 private static final Set<String> LOCALE_CONTAINMENT_EXCEPTIONS = ImmutableSet.of( 294 "sr_Latn", // auto-generated 295 "hi", "sr", "yue" // these are inserted by Locales.txt processing TODO don't add to special 296 ); 297 checkCldrContains(String firstName, Map<String, Level> first, String otherName, Map<String, Level> other)298 private void checkCldrContains(String firstName, Map<String, Level> first, String otherName, Map<String, Level> other) { 299 assertEquals(firstName + " ⊇ " + otherName, Collections.emptySet(), Sets.difference(Sets.difference(other.keySet(), ANY_LOCALE_SET), first.keySet())); 300 } 301 checkDisjoint(String firstName, Map<String, Level> first, String otherName, Map<String, Level> other)302 private void checkDisjoint(String firstName, Map<String, Level> first, String otherName, Map<String, Level> other) { 303 assertEquals(firstName + " ⩃ " + otherName, Collections.emptySet(), Sets.difference(Sets.intersection(other.keySet(), first.keySet()), LOCALE_CONTAINMENT_EXCEPTIONS)); 304 } 305 TestParentCoverage()306 public void TestParentCoverage() { 307 for (Organization organization : sc.getLocaleCoverageOrganizations()) { 308 if (organization == Organization.special) { 309 continue; 310 } 311 final Map<String, Level> localesToLevels = sc.getLocalesToLevelsFor(organization); 312 for (Entry<String, Level> localeAndLevel : localesToLevels.entrySet()) { 313 String originalLevel = localeAndLevel.getKey(); 314 Level level = localeAndLevel.getValue(); 315 String locale = originalLevel; 316 while (true) { 317 String parent = LocaleIDParser.getParent(locale); 318 if (parent == null || parent.equals("root")) { 319 break; 320 } 321 if (!parent.equals("en_001")) { // en_001 is generated later from en_GB 322 Level parentLevel = localesToLevels.get(parent); 323 assertTrue(organization 324 + "; locale=" + originalLevel 325 + "; level=" + level 326 + "; parent=" + parent 327 + "; level=" + parentLevel, 328 parentLevel != null && parentLevel.compareTo(level) >= 0); 329 } 330 locale = parent; 331 } 332 } 333 } 334 } 335 } 336