1 package org.unicode.cldr.unittest; 2 3 import java.util.Arrays; 4 import java.util.HashSet; 5 import java.util.Map; 6 import java.util.Map.Entry; 7 import java.util.Set; 8 import java.util.TreeMap; 9 import java.util.TreeSet; 10 11 import org.unicode.cldr.draft.ScriptMetadata; 12 import org.unicode.cldr.draft.ScriptMetadata.Info; 13 import org.unicode.cldr.tool.LikelySubtags; 14 import org.unicode.cldr.util.CLDRConfig; 15 import org.unicode.cldr.util.CLDRFile; 16 import org.unicode.cldr.util.ChainedMap; 17 import org.unicode.cldr.util.ChainedMap.M3; 18 import org.unicode.cldr.util.Containment; 19 import org.unicode.cldr.util.LanguageTagParser; 20 import org.unicode.cldr.util.StandardCodes; 21 import org.unicode.cldr.util.SupplementalDataInfo; 22 23 import com.google.common.collect.ImmutableSet; 24 import com.ibm.icu.dev.test.TestFmwk; 25 import com.ibm.icu.lang.UCharacter; 26 import com.ibm.icu.lang.UProperty; 27 import com.ibm.icu.lang.UScript; 28 import com.ibm.icu.text.UnicodeSet; 29 import com.ibm.icu.util.VersionInfo; 30 31 public class LikelySubtagsTest extends TestFmwk { 32 33 private boolean DEBUG = false; 34 private static final SupplementalDataInfo SUPPLEMENTAL_DATA_INFO = CLDRConfig 35 .getInstance().getSupplementalDataInfo(); 36 static final Map<String, String> likely = SUPPLEMENTAL_DATA_INFO 37 .getLikelySubtags(); 38 static final LikelySubtags LIKELY = new LikelySubtags(); 39 main(String[] args)40 public static void main(String[] args) { 41 new LikelySubtagsTest().run(args); 42 } 43 44 static class Tags { 45 final Set<String> languages = new TreeSet<>(); 46 final Set<String> scripts = new TreeSet<>(); 47 final Set<String> regions = new TreeSet<>(); 48 final Set<String> scriptRegion = new TreeSet<>(); 49 final Set<String> languageScript = new TreeSet<>(); 50 final Set<String> languageRegion = new TreeSet<>(); 51 final Set<String> all = new TreeSet<>(); 52 final ChainedMap.M4<String, String, String, Boolean> languageToScriptToRegions = ChainedMap 53 .of(new TreeMap<String, Object>(), 54 new TreeMap<String, Object>(), 55 new TreeMap<String, Object>(), Boolean.class); 56 final ChainedMap.M3<String, String, Boolean> languageToRegions = ChainedMap 57 .of(new TreeMap<String, Object>(), 58 new TreeMap<String, Object>(), Boolean.class); 59 Tags()60 public Tags() { 61 final LanguageTagParser ltp = new LanguageTagParser(); 62 for (Entry<String, String> entry : likely.entrySet()) { 63 add(ltp.set(entry.getKey()), true); 64 add(ltp.set(entry.getValue()), false); 65 } 66 // add unfamiliar script, unfamiliar region 67 for (String lang : languageToScriptToRegions.keySet()) { 68 if (lang.equals("und")) { 69 continue; 70 } 71 M3<String, String, Boolean> scriptToRegion = languageToScriptToRegions 72 .get(lang); 73 final Set<String> scriptsFor = scriptToRegion.keySet(); 74 final Set<String> regionsFor = languageToRegions.get(lang) 75 .keySet(); 76 77 String firstScriptNotIn = getNonEmptyNotIn(scripts, scriptsFor); 78 String firstRegionNotIn = getNonEmptyNotIn(regions, regionsFor); 79 80 languageToScriptToRegions.put(lang, firstScriptNotIn, 81 firstRegionNotIn, Boolean.TRUE); 82 // clone for safety before iterating 83 for (String script : new HashSet<>(scriptsFor)) { 84 languageToScriptToRegions.put(lang, script, 85 firstRegionNotIn, Boolean.TRUE); 86 } 87 for (String region : new HashSet<>(regionsFor)) { 88 languageToScriptToRegions.put(lang, firstScriptNotIn, 89 region, Boolean.TRUE); 90 } 91 } 92 93 // System.out.println("all: " + all); 94 // System.out.println("scriptRegion: " + scriptRegion); 95 // System.out.println("languageScript: " + languageScript); 96 // System.out.println("languageRegion: " + languageRegion); 97 } 98 getNonEmptyNotIn(Iterable<T> a, Set<T> b)99 private static <T> T getNonEmptyNotIn(Iterable<T> a, Set<T> b) { 100 for (T x : a) { 101 if (!b.contains(x) && !x.toString().isEmpty()) { 102 return x; 103 } 104 } 105 throw new IllegalArgumentException(); 106 } 107 add(LanguageTagParser ltp, boolean source)108 void add(LanguageTagParser ltp, boolean source) { 109 String sourceLanguage = ltp.getLanguage(); 110 String sourceScript = ltp.getScript(); 111 String sourceRegion = ltp.getRegion(); 112 languageToScriptToRegions.put(sourceLanguage, sourceScript, 113 sourceRegion, Boolean.TRUE); 114 languageToScriptToRegions.put(sourceLanguage, sourceScript, "", 115 Boolean.TRUE); 116 languageToScriptToRegions.put(sourceLanguage, "", "", Boolean.TRUE); 117 languageToRegions.put(sourceLanguage, "", Boolean.TRUE); 118 if (StandardCodes.isCountry(sourceRegion)) { 119 languageToScriptToRegions.put(sourceLanguage, "", sourceRegion, 120 Boolean.TRUE); 121 languageToRegions.put(sourceLanguage, sourceRegion, 122 Boolean.TRUE); 123 } 124 125 // capture all cases of 2 items 126 if (source) { 127 if (!sourceScript.isEmpty() && !sourceRegion.isEmpty()) { 128 if (!sourceLanguage.equals("und")) { 129 all.add(ltp.toString()); 130 } else { 131 scriptRegion.add(ltp.toString()); 132 } 133 } else if (!sourceLanguage.equals("und")) { 134 if (!sourceScript.isEmpty()) { 135 languageScript.add(ltp.toString()); 136 } else if (!sourceRegion.isEmpty()) { 137 languageRegion.add(ltp.toString()); 138 } 139 } 140 } 141 languages.add(sourceLanguage); 142 scripts.add(sourceScript); 143 if (StandardCodes.isCountry(sourceRegion) || sourceRegion.isEmpty()) { 144 regions.add(sourceRegion); 145 } 146 } 147 } 148 149 static final Tags TAGS = new Tags(); 150 151 final LanguageTagParser maxLtp = new LanguageTagParser(); 152 final LanguageTagParser sourceLtp = new LanguageTagParser(); 153 154 /** 155 * Return false if we should skip the language 156 * 157 * @param source 158 * @return 159 */ checkAdding(String source)160 public boolean checkAdding(String source) { 161 // if X maps to Y, then adding a field from Y to X will still map to Y 162 // Example: 163 // und_AF => fa_Arab_AF 164 // therefore, the following should also be true: 165 // und_Arab_AF => fa_Arab_AF 166 // fa_AF => fa_Arab_AF 167 // fa_Arab_AF => fa_Arab_AF 168 169 String max = LIKELY.maximize(source); 170 if (!assertNotEquals("Maximize " + source, null, max)) { 171 return source.contains("_"); 172 } 173 sourceLtp.set(source); 174 if (!sourceLtp.getRegion().isEmpty() 175 && !StandardCodes.isCountry(sourceLtp.getRegion())) { 176 return true; 177 } 178 maxLtp.set(max); 179 for (int i = 1; i < 8; ++i) { 180 if ((i & 1) != 0) { 181 if (!sourceLtp.getLanguage().equals("und")) 182 continue; 183 sourceLtp.setLanguage(maxLtp.getLanguage()); 184 } 185 if ((i & 2) != 0) { 186 if (!sourceLtp.getScript().isEmpty()) 187 continue; 188 sourceLtp.setScript(maxLtp.getScript()); 189 } 190 if ((i & 4) != 0) { 191 if (!sourceLtp.getRegion().isEmpty()) 192 continue; 193 sourceLtp.setRegion(maxLtp.getRegion()); 194 } 195 String test = sourceLtp.toString(); 196 final String maximize = LIKELY.maximize(test); 197 if (!max.equals(maximize)) { 198 // max(source) = max, max(test) ≠ max 199 if (!assertEquals(String.format("checkAdding: max(%s)->%s, however max(%s)->", 200 source, max, test), 201 max, maximize)) { 202 // LIKELY.maximize(test); // Could step into this for debugging. 203 } 204 } 205 sourceLtp.set(source); // restore 206 } 207 return true; 208 } 209 TestCompleteness()210 public void TestCompleteness() { 211 // if (logKnownIssue("Cldrbug:7121", 212 // "Problems with likely subtags test")) { 213 // return; 214 // } 215 // checkAdding("und_Bopo"); 216 // checkAdding("und_Brai"); 217 // checkAdding("und_Limb"); 218 // checkAdding("und_Cakm"); 219 // checkAdding("und_Shaw"); 220 221 final LanguageTagParser ltp = new LanguageTagParser(); 222 if (DEBUG) { 223 System.out.println(TAGS.languages.size() + "\t" + TAGS.languages); 224 System.out.println(TAGS.scripts.size() + "\t" + TAGS.scripts); 225 System.out.println(TAGS.regions.size() + "\t" + TAGS.regions); 226 } 227 main: for (Entry<String, Map<String, Map<String, Boolean>>> languageScriptRegion : TAGS.languageToScriptToRegions) { 228 String language = languageScriptRegion.getKey(); 229 ltp.set(language); // clears script, region 230 for (Entry<String, Map<String, Boolean>> scriptRegion : languageScriptRegion 231 .getValue().entrySet()) { 232 String script = scriptRegion.getKey(); 233 ltp.setScript(script); 234 for (String region : scriptRegion.getValue().keySet()) { 235 ltp.setRegion(region); 236 String testTag = ltp.toString(); 237 // System.out.println(testTag); 238 if (!testTag.equals("und_Hmng") && !checkAdding(testTag)) { 239 continue main; 240 } 241 } 242 } 243 } 244 } 245 246 static Set<String> exceptions = new HashSet<>(Arrays.asList("Zyyy", 247 "Zinh", "Zzzz", "Brai", "Cpmn")); // scripts with no default language 248 TestStability()249 public void TestStability() { 250 // when maximized must never change 251 // first get all the subtags 252 // then test all the combinations 253 LanguageTagParser ltp = new LanguageTagParser(); 254 for (Entry<String, String> entry : likely.entrySet()) { 255 ltp.set(entry.getKey()); 256 String sourceLanguage = ltp.getLanguage(); 257 if (sourceLanguage.equals("und")) { 258 sourceLanguage = ""; 259 } 260 String sourceScript = ltp.getScript(); 261 String sourceRegion = ltp.getRegion(); 262 ltp.set(entry.getValue()); 263 String targetLanguage = ltp.getLanguage(); 264 String targetScript = ltp.getScript(); 265 String targetRegion = ltp.getRegion(); 266 if (!sourceLanguage.isEmpty()) { 267 assertEquals("language", sourceLanguage, targetLanguage); 268 } 269 if (!sourceScript.isEmpty()) { 270 assertEquals("script", sourceScript, targetScript); 271 } 272 if (!sourceRegion.isEmpty()) { 273 if (Containment.isLeaf(sourceRegion)) { 274 assertEquals("region", sourceRegion, targetRegion); 275 } 276 } 277 } 278 279 } 280 TestForMissingScriptMetadata()281 public void TestForMissingScriptMetadata() { 282 TreeSet<String> metadataScripts = new TreeSet<>( 283 ScriptMetadata.getScripts()); 284 UnicodeSet current = new UnicodeSet(0, 0x10FFFF); 285 UnicodeSet toRemove = new UnicodeSet(); 286 287 while (!current.isEmpty()) { 288 int ch = current.charAt(0); 289 int script = UScript.getScript(ch); 290 String shortName = UScript.getShortName(script); 291 Info i = ScriptMetadata.getInfo(shortName); 292 if (i == null) { 293 errln("Script Metadata is missing: " + shortName); 294 continue; 295 } 296 if (i.likelyLanguage.equals("und") 297 && !exceptions.contains(shortName)) { 298 errln("Script has no likely language: " + shortName); 299 } 300 toRemove.applyIntPropertyValue(UProperty.SCRIPT, script); 301 current.removeAll(toRemove); 302 metadataScripts.remove(shortName); 303 } 304 metadataScripts 305 .removeAll(Arrays.asList("Hans", "Hant", "Hanb", "Jamo", "Jpan", "Kore")); // remove 306 // "combo" 307 // scripts 308 if (!metadataScripts.isEmpty()) { 309 // Warning, not error, so that we can add scripts to the script metadata 310 // and later update to the Unicode version that has characters for those scripts. 311 warnln("Script Metadata for characters not in Unicode: " 312 + metadataScripts); 313 } 314 } 315 TestMissingInfoForLanguage()316 public void TestMissingInfoForLanguage() { 317 CLDRFile english = CLDRConfig.getInstance().getEnglish(); 318 319 for (String language : CLDRConfig.getInstance().getCldrFactory() 320 .getAvailableLanguages()) { 321 if (language.contains("_") || language.equals("root")) { 322 continue; 323 } 324 String likelyExpansion = likely.get(language); 325 if (likelyExpansion == null) { 326 errln("Missing likely subtags for: " + language); 327 } else { 328 logln("Likely subtags for " + language + ":\t " + likely); 329 } 330 String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, language); 331 String englishName = english.getStringValue(path); 332 if (englishName == null) { 333 errln("Missing English translation for: " + language); 334 } 335 } 336 } 337 TestMissingInfoForRegion()338 public void TestMissingInfoForRegion() { 339 CLDRFile english = CLDRConfig.getInstance().getEnglish(); 340 341 for (String region : StandardCodes.make().getGoodAvailableCodes( 342 "territory")) { 343 String likelyExpansion = likely.get("und_" + region); 344 if (likelyExpansion == null) { 345 if (region.equals("ZZ") || region.equals("001") || region.equals("UN") 346 || SUPPLEMENTAL_DATA_INFO.getContained(region) == null) { // not 347 // container 348 String likelyTag = LikelySubtags.maximize("und_" + region, 349 likely); 350 if (likelyTag == null || !likelyTag.startsWith("en_Latn_")) { 351 errln("Missing likely subtags for region: " + region 352 + "\t" + english.getName("territory", region)); 353 } 354 } else { // container 355 errln("Missing likely subtags for macroregion (fix to exclude regions having 'en'): " 356 + region 357 + "\t" 358 + english.getName("territory", region)); 359 } 360 } else { 361 logln("Likely subtags for region: " + region + ":\t " + likely); 362 } 363 String path = CLDRFile.getKey(CLDRFile.TERRITORY_NAME, region); 364 String englishName = english.getStringValue(path); 365 if (englishName == null) { 366 errln("Missing English translation for: " + region); 367 } 368 } 369 } 370 371 static final Set<String> KNOWN_SCRIPTS_WITHOUT_LIKELY_SUBTAGS = ImmutableSet.of("Hatr"); 372 TestMissingInfoForScript()373 public void TestMissingInfoForScript() { 374 VersionInfo icuUnicodeVersion = UCharacter.getUnicodeVersion(); 375 TreeSet<String> sorted = new TreeSet<>( 376 ScriptMetadata.getScripts()); 377 Set<String> exceptions2 = new HashSet<>( 378 Arrays.asList("zh_Hans_CN", "hnj_Hmnp_US", "hnj_Hmng_LA", "iu_Cans_CA")); 379 for (String script : sorted) { 380 if (exceptions.contains(script) || script.equals("Latn") 381 || script.equals("Dsrt")) { 382 // we minimize away und_X, when the code puts in en...US 383 continue; 384 } 385 Info i = ScriptMetadata.getInfo(script); 386 // System.out.println(i); 387 String likelyLanguage = i.likelyLanguage; 388 String originCountry = i.originCountry; 389 String undScript = "und_" + script; 390 String langScript = likelyLanguage + "_" + script + "_"; 391 String likelyExpansion = likely.get(undScript); 392 if (likelyExpansion == null) { 393 if (!KNOWN_SCRIPTS_WITHOUT_LIKELY_SUBTAGS.contains(script)) { 394 String msg = "likelySubtags.xml missing language for script (und_" + script 395 + "). Script Metadata suggests that it should be something like:\t " 396 + showOverride(script, originCountry, langScript); 397 if (i.age.compareTo(icuUnicodeVersion) <= 0) { 398 // Error: Missing data for a script in ICU's Unicode version. 399 errln(msg); 400 } else { 401 // Warning: Missing data for a script in a future Unicode version. 402 warnln(msg); 403 } 404 } 405 } else if (!exceptions2.contains(likelyExpansion) 406 && !likelyExpansion.startsWith(langScript)) { 407 // if 408 // (logKnownIssue("Cldrbug:7181","Missing script metadata for " 409 // + script) 410 // && (script.equals("Tfng") || script.equals("Brah"))) { 411 // logln("Wrong likely language for script (und_" + script + 412 // "). Should not be " + likelyExpansion 413 // + ", but something like:\t " + showOverride(script, 414 // originCountry, langScript)); 415 // } else { 416 errln("likelySubtags.xml has wrong language for script (und_" + script 417 + "). Should not be " + likelyExpansion 418 + ", but Script Metadata suggests something like:\t " 419 + showOverride(script, originCountry, langScript)); 420 // } 421 } else { 422 logln("OK: " + undScript + " => " + likelyExpansion); 423 } 424 } 425 /** 426 * und_Bopo => zh_Bopo_TW und_Copt => cop_Copt_EG // fix 002 und_Dsrt => 427 * en_Dsrt_US // fix US 428 */ 429 } 430 showOverride(String script, String originCountry, String langScript)431 public String showOverride(String script, String originCountry, 432 String langScript) { 433 return "{\"und_" + script + "\", \"" + langScript + originCountry 434 + "\"},"; 435 } 436 } 437