• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.unittest;
2 
3 import java.util.Arrays;
4 import java.util.HashSet;
5 import java.util.Map;
6 import java.util.Map.Entry;
7 import java.util.Set;
8 import java.util.TreeMap;
9 import java.util.TreeSet;
10 
11 import org.unicode.cldr.draft.ScriptMetadata;
12 import org.unicode.cldr.draft.ScriptMetadata.Info;
13 import org.unicode.cldr.tool.LikelySubtags;
14 import org.unicode.cldr.util.CLDRConfig;
15 import org.unicode.cldr.util.CLDRFile;
16 import org.unicode.cldr.util.ChainedMap;
17 import org.unicode.cldr.util.ChainedMap.M3;
18 import org.unicode.cldr.util.Containment;
19 import org.unicode.cldr.util.LanguageTagParser;
20 import org.unicode.cldr.util.StandardCodes;
21 import org.unicode.cldr.util.SupplementalDataInfo;
22 
23 import com.google.common.collect.ImmutableSet;
24 import com.ibm.icu.dev.test.TestFmwk;
25 import com.ibm.icu.lang.UCharacter;
26 import com.ibm.icu.lang.UProperty;
27 import com.ibm.icu.lang.UScript;
28 import com.ibm.icu.text.UnicodeSet;
29 import com.ibm.icu.util.VersionInfo;
30 
31 public class LikelySubtagsTest extends TestFmwk {
32 
33     private boolean DEBUG = false;
34     private static final SupplementalDataInfo SUPPLEMENTAL_DATA_INFO = CLDRConfig
35         .getInstance().getSupplementalDataInfo();
36     static final Map<String, String> likely = SUPPLEMENTAL_DATA_INFO
37         .getLikelySubtags();
38     static final LikelySubtags LIKELY = new LikelySubtags();
39 
main(String[] args)40     public static void main(String[] args) {
41         new LikelySubtagsTest().run(args);
42     }
43 
44     static class Tags {
45         final Set<String> languages = new TreeSet<>();
46         final Set<String> scripts = new TreeSet<>();
47         final Set<String> regions = new TreeSet<>();
48         final Set<String> scriptRegion = new TreeSet<>();
49         final Set<String> languageScript = new TreeSet<>();
50         final Set<String> languageRegion = new TreeSet<>();
51         final Set<String> all = new TreeSet<>();
52         final ChainedMap.M4<String, String, String, Boolean> languageToScriptToRegions = ChainedMap
53             .of(new TreeMap<String, Object>(),
54                 new TreeMap<String, Object>(),
55                 new TreeMap<String, Object>(), Boolean.class);
56         final ChainedMap.M3<String, String, Boolean> languageToRegions = ChainedMap
57             .of(new TreeMap<String, Object>(),
58                 new TreeMap<String, Object>(), Boolean.class);
59 
Tags()60         public Tags() {
61             final LanguageTagParser ltp = new LanguageTagParser();
62             for (Entry<String, String> entry : likely.entrySet()) {
63                 add(ltp.set(entry.getKey()), true);
64                 add(ltp.set(entry.getValue()), false);
65             }
66             // add unfamiliar script, unfamiliar region
67             for (String lang : languageToScriptToRegions.keySet()) {
68                 if (lang.equals("und")) {
69                     continue;
70                 }
71                 M3<String, String, Boolean> scriptToRegion = languageToScriptToRegions
72                     .get(lang);
73                 final Set<String> scriptsFor = scriptToRegion.keySet();
74                 final Set<String> regionsFor = languageToRegions.get(lang)
75                     .keySet();
76 
77                 String firstScriptNotIn = getNonEmptyNotIn(scripts, scriptsFor);
78                 String firstRegionNotIn = getNonEmptyNotIn(regions, regionsFor);
79 
80                 languageToScriptToRegions.put(lang, firstScriptNotIn,
81                     firstRegionNotIn, Boolean.TRUE);
82                 // clone for safety before iterating
83                 for (String script : new HashSet<>(scriptsFor)) {
84                     languageToScriptToRegions.put(lang, script,
85                         firstRegionNotIn, Boolean.TRUE);
86                 }
87                 for (String region : new HashSet<>(regionsFor)) {
88                     languageToScriptToRegions.put(lang, firstScriptNotIn,
89                         region, Boolean.TRUE);
90                 }
91             }
92 
93             // System.out.println("all: " + all);
94             // System.out.println("scriptRegion: " + scriptRegion);
95             // System.out.println("languageScript: " + languageScript);
96             // System.out.println("languageRegion: " + languageRegion);
97         }
98 
getNonEmptyNotIn(Iterable<T> a, Set<T> b)99         private static <T> T getNonEmptyNotIn(Iterable<T> a, Set<T> b) {
100             for (T x : a) {
101                 if (!b.contains(x) && !x.toString().isEmpty()) {
102                     return x;
103                 }
104             }
105             throw new IllegalArgumentException();
106         }
107 
add(LanguageTagParser ltp, boolean source)108         void add(LanguageTagParser ltp, boolean source) {
109             String sourceLanguage = ltp.getLanguage();
110             String sourceScript = ltp.getScript();
111             String sourceRegion = ltp.getRegion();
112             languageToScriptToRegions.put(sourceLanguage, sourceScript,
113                 sourceRegion, Boolean.TRUE);
114             languageToScriptToRegions.put(sourceLanguage, sourceScript, "",
115                 Boolean.TRUE);
116             languageToScriptToRegions.put(sourceLanguage, "", "", Boolean.TRUE);
117             languageToRegions.put(sourceLanguage, "", Boolean.TRUE);
118             if (StandardCodes.isCountry(sourceRegion)) {
119                 languageToScriptToRegions.put(sourceLanguage, "", sourceRegion,
120                     Boolean.TRUE);
121                 languageToRegions.put(sourceLanguage, sourceRegion,
122                     Boolean.TRUE);
123             }
124 
125             // capture all cases of 2 items
126             if (source) {
127                 if (!sourceScript.isEmpty() && !sourceRegion.isEmpty()) {
128                     if (!sourceLanguage.equals("und")) {
129                         all.add(ltp.toString());
130                     } else {
131                         scriptRegion.add(ltp.toString());
132                     }
133                 } else if (!sourceLanguage.equals("und")) {
134                     if (!sourceScript.isEmpty()) {
135                         languageScript.add(ltp.toString());
136                     } else if (!sourceRegion.isEmpty()) {
137                         languageRegion.add(ltp.toString());
138                     }
139                 }
140             }
141             languages.add(sourceLanguage);
142             scripts.add(sourceScript);
143             if (StandardCodes.isCountry(sourceRegion) || sourceRegion.isEmpty()) {
144                 regions.add(sourceRegion);
145             }
146         }
147     }
148 
149     static final Tags TAGS = new Tags();
150 
151     final LanguageTagParser maxLtp = new LanguageTagParser();
152     final LanguageTagParser sourceLtp = new LanguageTagParser();
153 
154     /**
155      * Return false if we should skip the language
156      *
157      * @param source
158      * @return
159      */
checkAdding(String source)160     public boolean checkAdding(String source) {
161         // if X maps to Y, then adding a field from Y to X will still map to Y
162         // Example:
163         // und_AF => fa_Arab_AF
164         // therefore, the following should also be true:
165         // und_Arab_AF => fa_Arab_AF
166         // fa_AF => fa_Arab_AF
167         // fa_Arab_AF => fa_Arab_AF
168 
169         String max = LIKELY.maximize(source);
170         if (!assertNotEquals("Maximize " + source, null, max)) {
171             return source.contains("_");
172         }
173         sourceLtp.set(source);
174         if (!sourceLtp.getRegion().isEmpty()
175             && !StandardCodes.isCountry(sourceLtp.getRegion())) {
176             return true;
177         }
178         maxLtp.set(max);
179         for (int i = 1; i < 8; ++i) {
180             if ((i & 1) != 0) {
181                 if (!sourceLtp.getLanguage().equals("und"))
182                     continue;
183                 sourceLtp.setLanguage(maxLtp.getLanguage());
184             }
185             if ((i & 2) != 0) {
186                 if (!sourceLtp.getScript().isEmpty())
187                     continue;
188                 sourceLtp.setScript(maxLtp.getScript());
189             }
190             if ((i & 4) != 0) {
191                 if (!sourceLtp.getRegion().isEmpty())
192                     continue;
193                 sourceLtp.setRegion(maxLtp.getRegion());
194             }
195             String test = sourceLtp.toString();
196             final String maximize = LIKELY.maximize(test);
197             if (!max.equals(maximize)) {
198                 // max(source) = max, max(test) ≠ max
199                 if (!assertEquals(String.format("checkAdding: max(%s)->%s, however max(%s)->",
200                         source, max, test),
201                     max, maximize)) {
202                     // LIKELY.maximize(test); // Could step into this for debugging.
203                 }
204             }
205             sourceLtp.set(source); // restore
206         }
207         return true;
208     }
209 
TestCompleteness()210     public void TestCompleteness() {
211         // if (logKnownIssue("Cldrbug:7121",
212         // "Problems with likely subtags test")) {
213         // return;
214         // }
215         // checkAdding("und_Bopo");
216         // checkAdding("und_Brai");
217         // checkAdding("und_Limb");
218         // checkAdding("und_Cakm");
219         // checkAdding("und_Shaw");
220 
221         final LanguageTagParser ltp = new LanguageTagParser();
222         if (DEBUG) {
223             System.out.println(TAGS.languages.size() + "\t" + TAGS.languages);
224             System.out.println(TAGS.scripts.size() + "\t" + TAGS.scripts);
225             System.out.println(TAGS.regions.size() + "\t" + TAGS.regions);
226         }
227         main: for (Entry<String, Map<String, Map<String, Boolean>>> languageScriptRegion : TAGS.languageToScriptToRegions) {
228             String language = languageScriptRegion.getKey();
229             ltp.set(language); // clears script, region
230             for (Entry<String, Map<String, Boolean>> scriptRegion : languageScriptRegion
231                 .getValue().entrySet()) {
232                 String script = scriptRegion.getKey();
233                 ltp.setScript(script);
234                 for (String region : scriptRegion.getValue().keySet()) {
235                     ltp.setRegion(region);
236                     String testTag = ltp.toString();
237                     // System.out.println(testTag);
238                     if (!testTag.equals("und_Hmng") && !checkAdding(testTag)) {
239                         continue main;
240                     }
241                 }
242             }
243         }
244     }
245 
    /**
     * Script codes that have no default language.
     * {@link #TestForMissingScriptMetadata()} does not report these when their
     * likely language is "und", and {@link #TestMissingInfoForScript()} skips
     * them altogether.
     */
    static Set<String> exceptions = new HashSet<>(Arrays.asList("Zyyy",
        "Zinh", "Zzzz", "Brai", "Cpmn")); // scripts with no default language
248 
TestStability()249     public void TestStability() {
250         // when maximized must never change
251         // first get all the subtags
252         // then test all the combinations
253         LanguageTagParser ltp = new LanguageTagParser();
254         for (Entry<String, String> entry : likely.entrySet()) {
255             ltp.set(entry.getKey());
256             String sourceLanguage = ltp.getLanguage();
257             if (sourceLanguage.equals("und")) {
258                 sourceLanguage = "";
259             }
260             String sourceScript = ltp.getScript();
261             String sourceRegion = ltp.getRegion();
262             ltp.set(entry.getValue());
263             String targetLanguage = ltp.getLanguage();
264             String targetScript = ltp.getScript();
265             String targetRegion = ltp.getRegion();
266             if (!sourceLanguage.isEmpty()) {
267                 assertEquals("language", sourceLanguage, targetLanguage);
268             }
269             if (!sourceScript.isEmpty()) {
270                 assertEquals("script", sourceScript, targetScript);
271             }
272             if (!sourceRegion.isEmpty()) {
273                 if (Containment.isLeaf(sourceRegion)) {
274                     assertEquals("region", sourceRegion, targetRegion);
275                 }
276             }
277         }
278 
279     }
280 
TestForMissingScriptMetadata()281     public void TestForMissingScriptMetadata() {
282         TreeSet<String> metadataScripts = new TreeSet<>(
283             ScriptMetadata.getScripts());
284         UnicodeSet current = new UnicodeSet(0, 0x10FFFF);
285         UnicodeSet toRemove = new UnicodeSet();
286 
287         while (!current.isEmpty()) {
288             int ch = current.charAt(0);
289             int script = UScript.getScript(ch);
290             String shortName = UScript.getShortName(script);
291             Info i = ScriptMetadata.getInfo(shortName);
292             if (i == null) {
293                 errln("Script Metadata is missing: " + shortName);
294                 continue;
295             }
296             if (i.likelyLanguage.equals("und")
297                 && !exceptions.contains(shortName)) {
298                 errln("Script has no likely language: " + shortName);
299             }
300             toRemove.applyIntPropertyValue(UProperty.SCRIPT, script);
301             current.removeAll(toRemove);
302             metadataScripts.remove(shortName);
303         }
304         metadataScripts
305             .removeAll(Arrays.asList("Hans", "Hant", "Hanb", "Jamo", "Jpan", "Kore")); // remove
306         // "combo"
307         // scripts
308         if (!metadataScripts.isEmpty()) {
309             // Warning, not error, so that we can add scripts to the script metadata
310             // and later update to the Unicode version that has characters for those scripts.
311             warnln("Script Metadata for characters not in Unicode: "
312                 + metadataScripts);
313         }
314     }
315 
TestMissingInfoForLanguage()316     public void TestMissingInfoForLanguage() {
317         CLDRFile english = CLDRConfig.getInstance().getEnglish();
318 
319         for (String language : CLDRConfig.getInstance().getCldrFactory()
320             .getAvailableLanguages()) {
321             if (language.contains("_") || language.equals("root")) {
322                 continue;
323             }
324             String likelyExpansion = likely.get(language);
325             if (likelyExpansion == null) {
326                 errln("Missing likely subtags for: " + language);
327             } else {
328                 logln("Likely subtags for " + language + ":\t " + likely);
329             }
330             String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, language);
331             String englishName = english.getStringValue(path);
332             if (englishName == null) {
333                 errln("Missing English translation for: " + language);
334             }
335         }
336     }
337 
TestMissingInfoForRegion()338     public void TestMissingInfoForRegion() {
339         CLDRFile english = CLDRConfig.getInstance().getEnglish();
340 
341         for (String region : StandardCodes.make().getGoodAvailableCodes(
342             "territory")) {
343             String likelyExpansion = likely.get("und_" + region);
344             if (likelyExpansion == null) {
345                 if (region.equals("ZZ") || region.equals("001") || region.equals("UN")
346                     || SUPPLEMENTAL_DATA_INFO.getContained(region) == null) { // not
347                     // container
348                     String likelyTag = LikelySubtags.maximize("und_" + region,
349                         likely);
350                     if (likelyTag == null || !likelyTag.startsWith("en_Latn_")) {
351                         errln("Missing likely subtags for region: " + region
352                             + "\t" + english.getName("territory", region));
353                     }
354                 } else { // container
355                     errln("Missing likely subtags for macroregion (fix to exclude regions having 'en'): "
356                         + region
357                         + "\t"
358                         + english.getName("territory", region));
359                 }
360             } else {
361                 logln("Likely subtags for region: " + region + ":\t " + likely);
362             }
363             String path = CLDRFile.getKey(CLDRFile.TERRITORY_NAME, region);
364             String englishName = english.getStringValue(path);
365             if (englishName == null) {
366                 errln("Missing English translation for: " + region);
367             }
368         }
369     }
370 
    /**
     * Scripts for which {@link #TestMissingInfoForScript()} does not report a
     * missing {@code und_<script>} entry in the likely-subtags data.
     */
    static final Set<String> KNOWN_SCRIPTS_WITHOUT_LIKELY_SUBTAGS = ImmutableSet.of("Hatr");
372 
TestMissingInfoForScript()373     public void TestMissingInfoForScript() {
374         VersionInfo icuUnicodeVersion = UCharacter.getUnicodeVersion();
375         TreeSet<String> sorted = new TreeSet<>(
376             ScriptMetadata.getScripts());
377         Set<String> exceptions2 = new HashSet<>(
378             Arrays.asList("zh_Hans_CN", "hnj_Hmnp_US", "hnj_Hmng_LA", "iu_Cans_CA"));
379         for (String script : sorted) {
380             if (exceptions.contains(script) || script.equals("Latn")
381                 || script.equals("Dsrt")) {
382                 // we minimize away und_X, when the code puts in en...US
383                 continue;
384             }
385             Info i = ScriptMetadata.getInfo(script);
386             // System.out.println(i);
387             String likelyLanguage = i.likelyLanguage;
388             String originCountry = i.originCountry;
389             String undScript = "und_" + script;
390             String langScript = likelyLanguage + "_" + script + "_";
391             String likelyExpansion = likely.get(undScript);
392             if (likelyExpansion == null) {
393                 if (!KNOWN_SCRIPTS_WITHOUT_LIKELY_SUBTAGS.contains(script)) {
394                     String msg = "likelySubtags.xml missing language for script (und_" + script
395                         + "). Script Metadata suggests that it should be something like:\t "
396                         + showOverride(script, originCountry, langScript);
397                     if (i.age.compareTo(icuUnicodeVersion) <= 0) {
398                         // Error: Missing data for a script in ICU's Unicode version.
399                         errln(msg);
400                     } else {
401                         // Warning: Missing data for a script in a future Unicode version.
402                         warnln(msg);
403                     }
404                 }
405             } else if (!exceptions2.contains(likelyExpansion)
406                 && !likelyExpansion.startsWith(langScript)) {
407                 // if
408                 // (logKnownIssue("Cldrbug:7181","Missing script metadata for "
409                 // + script)
410                 // && (script.equals("Tfng") || script.equals("Brah"))) {
411                 // logln("Wrong likely language for script (und_" + script +
412                 // "). Should not be " + likelyExpansion
413                 // + ", but something like:\t " + showOverride(script,
414                 // originCountry, langScript));
415                 // } else {
416                 errln("likelySubtags.xml has wrong language for script (und_" + script
417                     + "). Should not be " + likelyExpansion
418                     + ", but Script Metadata suggests something like:\t "
419                     + showOverride(script, originCountry, langScript));
420                 // }
421             } else {
422                 logln("OK: " + undScript + " => " + likelyExpansion);
423             }
424         }
425         /**
426          * und_Bopo => zh_Bopo_TW und_Copt => cop_Copt_EG // fix 002 und_Dsrt =>
427          * en_Dsrt_US // fix US
428          */
429     }
430 
showOverride(String script, String originCountry, String langScript)431     public String showOverride(String script, String originCountry,
432         String langScript) {
433         return "{\"und_" + script + "\", \"" + langScript + originCountry
434             + "\"},";
435     }
436 }
437