• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.unittest;
2 
3 import java.io.IOException;
4 import java.util.ArrayList;
5 import java.util.Arrays;
6 import java.util.Collections;
7 import java.util.HashMap;
8 import java.util.HashSet;
9 import java.util.LinkedHashSet;
10 import java.util.List;
11 import java.util.Map;
12 import java.util.Map.Entry;
13 import java.util.Set;
14 import java.util.TreeMap;
15 import java.util.TreeSet;
16 import java.util.regex.Matcher;
17 
18 import org.unicode.cldr.draft.ScriptMetadata;
19 import org.unicode.cldr.draft.ScriptMetadata.Info;
20 import org.unicode.cldr.tool.GenerateMaximalLocales;
21 import org.unicode.cldr.tool.LikelySubtags;
22 import org.unicode.cldr.util.Builder;
23 import org.unicode.cldr.util.CLDRConfig;
24 import org.unicode.cldr.util.CLDRFile;
25 import org.unicode.cldr.util.CLDRLocale;
26 import org.unicode.cldr.util.ChainedMap;
27 import org.unicode.cldr.util.ChainedMap.M3;
28 import org.unicode.cldr.util.CldrUtility;
29 import org.unicode.cldr.util.LanguageTagParser;
30 import org.unicode.cldr.util.LocaleIDParser;
31 import org.unicode.cldr.util.PatternCache;
32 import org.unicode.cldr.util.StandardCodes;
33 import org.unicode.cldr.util.SupplementalDataInfo;
34 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData;
35 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData.Type;
36 import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus;
37 import org.unicode.cldr.util.SupplementalDataInfo.PopulationData;
38 import org.unicode.cldr.util.XPathParts;
39 
40 import com.ibm.icu.dev.test.TestFmwk;
41 import com.ibm.icu.dev.util.CollectionUtilities;
42 import com.ibm.icu.impl.Relation;
43 import com.ibm.icu.impl.Row.R2;
44 
45 public class TestInheritance extends TestFmwk {
46 
47     static CLDRConfig testInfo = CLDRConfig.getInstance();
48 
49     private static boolean DEBUG = CldrUtility.getProperty("DEBUG", false);
50 
51     private static Matcher pathMatcher = PatternCache.get(
52         CldrUtility.getProperty("XPATH", ".*")).matcher("");
53 
main(String[] args)54     public static void main(String[] args) throws IOException {
55         new TestInheritance().run(args);
56     }
57 
58     private static final SupplementalDataInfo dataInfo = SupplementalDataInfo
59         .getInstance();
60     private static final Set<String> defaultContents = dataInfo
61         .getDefaultContentLocales();
62 
63     private static final boolean EXPECT_EQUALITY = false;
64 
65     private static Set<String> availableLocales = testInfo.getFullCldrFactory().getAvailable();
66 
TestLocalesHaveOfficial()67     public void TestLocalesHaveOfficial() {
68         // If we have a language, we have all the region locales where the
69         // language is official
70         Set<String> SKIP_TERRITORIES = new HashSet<String>(Arrays.asList("001",
71             "150"));
72         for (Entry<String, R2<List<String>, String>> s : dataInfo
73             .getLocaleAliasInfo().get("territory").entrySet()) {
74             SKIP_TERRITORIES.add(s.getKey());
75         }
76 
77         LanguageTagParser ltp = new LanguageTagParser();
78 
79         Relation<String, String> languageLocalesSeen = Relation.of(
80             new TreeMap<String, Set<String>>(), TreeSet.class);
81 
82         Set<String> testOrg = testInfo.getStandardCodes()
83             .getLocaleCoverageLocales("google");
84         ChainedMap.M4<String, OfficialStatus, String, Boolean> languageToOfficialChildren = ChainedMap
85             .of(new TreeMap<String, Object>(),
86                 new TreeMap<OfficialStatus, Object>(),
87                 new TreeMap<String, Object>(), Boolean.class);
88 
89         // gather the data
90 
91         for (String language : dataInfo
92             .getLanguagesForTerritoriesPopulationData()) {
93             for (String territory : dataInfo
94                 .getTerritoriesForPopulationData(language)) {
95                 if (SKIP_TERRITORIES.contains(territory)) {
96                     continue;
97                 }
98                 PopulationData data = dataInfo
99                     .getLanguageAndTerritoryPopulationData(language,
100                         territory);
101                 OfficialStatus status = data.getOfficialStatus();
102                 if (data.getOfficialStatus() != OfficialStatus.unknown) {
103                     String locale = removeScript(language + "_" + territory);
104                     String lang = removeScript(ltp.set(locale).getLanguage());
105                     languageToOfficialChildren.put(lang, status, locale,
106                         Boolean.TRUE);
107                     languageLocalesSeen.put(lang, locale);
108                 }
109             }
110         }
111 
112         // flesh it out by adding 'clean' codes.
113         // also get the child locales in cldr.
114 
115         Relation<String, String> languageToChildren = Relation.of(
116             new TreeMap<String, Set<String>>(), TreeSet.class);
117         for (String locale : testInfo.getCldrFactory().getAvailable()) {
118             String lang = ltp.set(locale).getLanguage();
119             if (SKIP_TERRITORIES.contains(ltp.getRegion())) {
120                 continue;
121             }
122             lang = removeScript(lang);
123             locale = removeScript(locale);
124 
125             if (!lang.equals(locale)) {
126                 languageToChildren.put(lang, locale);
127                 Set<String> localesSeen = languageLocalesSeen.get(lang);
128                 if (localesSeen == null || !localesSeen.contains(locale)) {
129                     languageToOfficialChildren.put(lang,
130                         OfficialStatus.unknown, locale, Boolean.TRUE);
131                 }
132             }
133         }
134 
135         for (Entry<String, Set<String>> languageAndChildren : languageToChildren
136             .keyValuesSet()) {
137             String language = languageAndChildren.getKey();
138             Set<String> children = languageAndChildren.getValue();
139             M3<OfficialStatus, String, Boolean> officalStatusToChildren = languageToOfficialChildren
140                 .get(language);
141             for (Entry<OfficialStatus, Map<String, Boolean>> entry : officalStatusToChildren) {
142                 OfficialStatus status = entry.getKey();
143                 if (status != OfficialStatus.official
144                     && status != OfficialStatus.de_facto_official) {
145                     continue;
146                 }
147                 Set<String> officalChildren = entry.getValue().keySet();
148                 if (!children.containsAll(officalChildren)) {
149                     Set<String> missing = new TreeSet<String>(officalChildren);
150                     missing.removeAll(children);
151                     String message = "Missing CLDR locales for " + status
152                         + " languages: " + missing;
153                     errln(message);
154                 } else {
155                     logln("CLDR locales " + children + " cover " + status
156                         + " locales " + officalChildren);
157                 }
158 
159             }
160         }
161 
162         if (DEBUG) {
163             Set<String> languages = new TreeSet<String>(
164                 languageToChildren.keySet());
165             languages.addAll(languageToOfficialChildren.keySet());
166             System.out.print("\ncode\tlanguage");
167             for (OfficialStatus status : OfficialStatus.values()) {
168                 System.out.print("\tNo\t" + status);
169             }
170             System.out.println();
171             for (String language : languages) {
172                 if (!testOrg.contains(language)) {
173                     continue;
174                 }
175                 System.out.print(language + "\t"
176                     + testInfo.getEnglish().getName(language));
177 
178                 M3<OfficialStatus, String, Boolean> officialChildren = languageToOfficialChildren
179                     .get(language);
180                 for (OfficialStatus status : OfficialStatus.values()) {
181                     Map<String, Boolean> children = officialChildren
182                         .get(status);
183                     if (children == null) {
184                         System.out.print("\t" + 0 + "\t");
185                     } else {
186                         System.out.print("\t" + children.size() + "\t"
187                             + show(children.keySet(), false));
188                     }
189                 }
190                 System.out.println();
191             }
192         }
193     }
194 
show(Set<String> joint, boolean showStatus)195     private String show(Set<String> joint, boolean showStatus) {
196         StringBuffer b = new StringBuffer();
197         for (String s : joint) {
198             if (b.length() != 0) {
199                 b.append(", ");
200             }
201             LanguageTagParser ltp = new LanguageTagParser().set(s);
202             String script = ltp.getScript();
203             if (script.length() != 0) {
204                 b.append(testInfo.getEnglish().getName(CLDRFile.SCRIPT_NAME,
205                     script));
206             }
207             String region = ltp.getRegion();
208             if (region.length() != 0) {
209                 if (script.length() != 0) {
210                     b.append("-");
211                 }
212                 b.append(testInfo.getEnglish().getName(CLDRFile.TERRITORY_NAME,
213                     region));
214             }
215             b.append(" [").append(s);
216             if (showStatus) {
217                 PopulationData data = dataInfo
218                     .getLanguageAndTerritoryPopulationData(
219                         ltp.getLanguage(), region);
220                 if (data == null) {
221                     data = dataInfo.getLanguageAndTerritoryPopulationData(
222                         ltp.getLanguageScript(), region);
223                 }
224                 b.append("; ");
225                 b.append(data == null ? "?" : data.getOfficialStatus());
226             }
227             b.append("]");
228 
229         }
230         return b.toString();
231     }
232 
removeScript(String lang)233     private String removeScript(String lang) {
234         if (!lang.contains("_")) {
235             return lang;
236         }
237         LanguageTagParser ltp = new LanguageTagParser().set(lang);
238         // String ls = ltp.getLanguageScript();
239         // if (defaultContents.contains(ls)) {
240         ltp.setScript("");
241         // }
242         return ltp.toString();
243     }
244 
TestLikelyAndDefaultConsistency()245     public void TestLikelyAndDefaultConsistency() {
246         LikelySubtags likelySubtags = new LikelySubtags();
247         LanguageTagParser ltp = new LanguageTagParser();
248         // find multiscript locales
249         Relation<String, String> base2scripts = Relation.of(
250             new TreeMap<String, Set<String>>(), TreeSet.class);
251         Map<String, String> parent2default = new TreeMap<String, String>();
252         Map<String, String> default2parent = new TreeMap<String, String>();
253         Relation<String, String> base2locales = Relation.of(
254             new TreeMap<String, Set<String>>(), TreeSet.class);
255 
256         Set<String> knownMultiScriptLanguages = new HashSet<String>(Arrays.asList("bm", "ha"));
257         // get multiscript locales
258         for (String localeID : availableLocales) {
259             String script = ltp.set(localeID).getScript();
260             final String base = ltp.getLanguage();
261             if (!availableLocales.contains(base)) {
262                 errln("Missing base locale for: " + localeID);
263             }
264             base2locales.put(base, localeID);
265             if (!script.isEmpty() && !base.equals("en")) { // HACK for en
266                 base2scripts.put(base, script);
267             }
268             if (script.isEmpty() && knownMultiScriptLanguages.contains(base)) {
269                 base2scripts.put(base, dataInfo.getDefaultScript(base));
270             }
271         }
272 
273         // get default contents
274         for (String localeID : defaultContents) {
275             checkLocale(localeID, false);
276             String simpleParent = LocaleIDParser.getSimpleParent(localeID);
277             parent2default.put(simpleParent, localeID);
278             default2parent.put(localeID, simpleParent);
279             // if (!available.contains(simpleParent)) {
280             // // verify that base language has locale in CLDR (we don't want
281             // others)
282             // errln("Default contents contains locale not in CLDR:\t" +
283             // simpleParent);
284             // }
285         }
286 
287         // get likely
288         Map<String, String> likely2Maximized = likelySubtags.getToMaximized();
289         for (Entry<String, String> likelyAndMaximized : likely2Maximized
290             .entrySet()) {
291             checkLocale(likelyAndMaximized.getKey(), true);
292             checkLocale(likelyAndMaximized.getValue(), true);
293         }
294         Map<String, String> exceptionDcLikely = new HashMap<String, String>();
295         Map<String, String> exceptionLikelyDc = new HashMap<String, String>();
296         for (String[] s : new String[][] { { "ar_001", "ar_Arab_EG" }, }) {
297             exceptionDcLikely.put(s[0], s[1]);
298             exceptionLikelyDc.put(s[1], s[0]);
299         }
300 
301         verifyDefaultContentsImplicationsForLikelySubtags(ltp, parent2default,
302             likely2Maximized, exceptionDcLikely);
303 
304         verifyLikelySubtagsImplicationsForDefaultContents(ltp, base2scripts,
305             parent2default, likely2Maximized, exceptionLikelyDc);
306 
307         verifyScriptsWithDefaultContents(ltp, base2scripts, parent2default,
308             base2locales);
309     }
310 
TestParentLocaleRelationships()311     public void TestParentLocaleRelationships() {
312         // Testing invariant relationships between locales - See
313         // http://unicode.org/cldr/trac/ticket/5758
314         Matcher langScript = PatternCache.get("^[a-z]{2,3}_[A-Z][a-z]{3}$")
315             .matcher("");
316         for (String loc : availableLocales) {
317             if (langScript.reset(loc).matches()) {
318                 String expectedParent = loc.split("_")[0];
319                 if (!defaultContents.contains(loc)) {
320                     expectedParent = "root";
321                 }
322                 String actualParent = dataInfo.getExplicitParentLocale(loc);
323                 if (actualParent == null) {
324                     actualParent = loc.split("_")[0];
325                 }
326                 if (!actualParent.equals(expectedParent)) {
327                     errln("Unexpected parent locale for locale " + loc
328                         + ". Expected: " + expectedParent + " Got: "
329                         + actualParent);
330                 }
331 
332                 if (dataInfo.getExplicitParentLocale(loc) != null
333                     && defaultContents.contains(loc)) {
334                     errln("Locale "
335                         + loc
336                         + " can't have an explicit parent AND be a default content locale");
337                 }
338             }
339         }
340     }
341 
TestParentLocaleInvariants()342     public void TestParentLocaleInvariants() {
343         // Testing invariant relationships in parent locales - See
344         // http://unicode.org/cldr/trac/ticket/7887
345         LocaleIDParser lp = new LocaleIDParser();
346         for (String loc : availableLocales) {
347             String parentLocale = dataInfo.getExplicitParentLocale(loc);
348             if (parentLocale != null) {
349                 if (!"root".equals(parentLocale)
350                     && !lp.set(loc).getLanguage()
351                         .equals(lp.set(parentLocale).getLanguage())) {
352                     errln("Parent locale [" + parentLocale + "] for locale ["
353                         + loc + "] cannot be a different language code.");
354                 }
355                 if (!"root".equals(parentLocale)
356                     && !lp.set(loc).getScript()
357                         .equals(lp.set(parentLocale).getScript())) {
358                     errln("Parent locale [" + parentLocale + "] for locale ["
359                         + loc + "] cannot be a different script code.");
360                 }
361                 lp.set(loc);
362                 if (lp.getScript().length() == 0 && lp.getRegion().length() == 0) {
363                     errln("Base language locale [" + loc + "] cannot have an explicit parent.");
364                 }
365 
366             }
367         }
368     }
369 
TestParentLocalesForCycles()370     public void TestParentLocalesForCycles() {
371         // Testing for cyclic relationships in parent locales - See
372         // http://unicode.org/cldr/trac/ticket/7887
373         for (String loc : availableLocales) {
374             String currentLoc = loc;
375             boolean foundError = false;
376             List<String> inheritanceChain = new ArrayList<String>(Arrays.asList(loc));
377             while (currentLoc != null && !foundError) {
378                 currentLoc = LocaleIDParser.getParent(currentLoc);
379                 if (inheritanceChain.contains(currentLoc)) {
380                     foundError = true;
381                     inheritanceChain.add(currentLoc);
382                     errln("Inheritance chain for locale [" + loc + "] contains a cyclic relationship. " + inheritanceChain.toString());
383                 }
384                 inheritanceChain.add(currentLoc);
385             }
386         }
387     }
388 
verifyScriptsWithDefaultContents(LanguageTagParser ltp, Relation<String, String> base2scripts, Map<String, String> parent2default, Relation<String, String> base2locales)389     private void verifyScriptsWithDefaultContents(LanguageTagParser ltp,
390         Relation<String, String> base2scripts,
391         Map<String, String> parent2default,
392         Relation<String, String> base2locales) {
393         Set<String> skip = Builder.with(new HashSet<String>())
394             .addAll("root", "und")
395             .freeze();
396         Set<String> languagesWithOneOrLessLocaleScriptInCommon = new HashSet<String>(Arrays.asList("bm", "ha", "ms", "iu", "mn"));
397         // for each base we have to have,
398         // if multiscript, we have default contents for base+script,
399         // base+script+region;
400         // otherwise base+region.
401         for (String base : base2locales.keySet()) {
402             if (skip.contains(base)) {
403                 continue;
404             }
405             String defaultContent = parent2default.get(base);
406             // Set<String> likely = base2likely.get(base);
407             // if (likely == null) {
408             // errln("Missing likely subtags for: " + base + "  " +
409             // suggestLikelySubtagFor(base));
410             // }
411             if (defaultContent == null) {
412                 errln("Missing default content for: " + base + "  "
413                     + suggestLikelySubtagFor(base));
414                 continue;
415             }
416             Set<String> scripts = base2scripts.get(base);
417             ltp.set(defaultContent);
418             String script = ltp.getScript();
419             String region = ltp.getRegion();
420             if (scripts == null || languagesWithOneOrLessLocaleScriptInCommon.contains(base)) {
421                 if (!script.isEmpty()) {
422                     errln("Script should be empty in default content for: "
423                         + base + "," + defaultContent);
424                 }
425                 if (region.isEmpty()) {
426                     errln("Region must not be empty in default content for: "
427                         + base + "," + defaultContent);
428                 }
429             } else {
430                 if (script.isEmpty()) {
431                     errln("Script should not be empty in default content for: "
432                         + base + "," + defaultContent);
433                 }
434                 if (!region.isEmpty()) {
435                     errln("Region should be empty in default content for: "
436                         + base + "," + defaultContent);
437                 }
438                 String defaultContent2 = parent2default.get(defaultContent);
439                 if (defaultContent2 == null) {
440                     errln("Missing default content for: " + defaultContent);
441                     continue;
442                 }
443                 ltp.set(defaultContent2);
444                 region = ltp.getRegion();
445                 if (region.isEmpty()) {
446                     errln("Region must not be empty in default content for: "
447                         + base + "," + defaultContent);
448                 }
449             }
450         }
451     }
452 
verifyLikelySubtagsImplicationsForDefaultContents( LanguageTagParser ltp, Relation<String, String> base2scripts, Map<String, String> parent2default, Map<String, String> likely2Maximized, Map<String, String> exceptionLikelyDc)453     private void verifyLikelySubtagsImplicationsForDefaultContents(
454         LanguageTagParser ltp, Relation<String, String> base2scripts,
455         Map<String, String> parent2default,
456         Map<String, String> likely2Maximized,
457         Map<String, String> exceptionLikelyDc) {
458         // Now check invariants for all LikelySubtags implications for Default
459         // Contents
460         // a) suppose likely max for la_Scrp => la_Scrp_RG
461         // Then default contents la_Scrp => la_Scrp_RG
462         // b) suppose likely max for la_RG => la_Scrp_RG
463         // Then we can draw no conclusions // was default contents la_Scrp =>
464         // la_Scrp_RG
465         // c) suppose likely max for la => la_Scrp_RG
466         // Then default contents la => la_Scrp && la_Scrp => la_Scrp_RG
467         // or default contents la => la_RG && ! la_Scrp => la_Scrp_RG
468 
469         TreeSet<String> additionalDefaultContents = new TreeSet<String>();
470 
471         for (Entry<String, String> entry : likely2Maximized.entrySet()) {
472             String source = entry.getKey();
473             String likelyMax = entry.getValue();
474             String sourceLang = ltp.set(source).getLanguage();
475             if (sourceLang.equals("und") || source.equals("zh_Hani")
476                 || source.equals("tl")) {
477                 continue;
478             }
479             String sourceScript = ltp.getScript();
480             String sourceRegion = ltp.getRegion();
481 
482             String likelyMaxLang = ltp.set(likelyMax).getLanguage();
483             String likelyMaxScript = ltp.getScript();
484             String likelyMaxRegion = ltp.getRegion();
485 
486             String dc = parent2default.get(source);
487             String possibleException = exceptionLikelyDc.get(likelyMax);
488             if (possibleException != null && possibleException.equals(dc)) {
489                 continue;
490             }
491             String likelyLangScript = likelyMaxLang + "_" + likelyMaxScript;
492             String dcFromLangScript = parent2default.get(likelyLangScript);
493 
494             boolean consistent = true;
495             String caseNumber = null;
496             if (consistent) {
497                 if (!sourceScript.isEmpty()) {
498                     caseNumber = "a";
499                     if (dc == null) {
500                         if (EXPECT_EQUALITY) {
501                             String expected = likelyMax;
502                             errln("Default contents null for " + source
503                                 + ", expected:\t" + expected);
504                             additionalDefaultContents.add(expected);
505                         }
506                         continue;
507                     }
508                     consistent = likelyMax.equals(dc);
509                 } else if (!sourceRegion.isEmpty()) { // a
510                     caseNumber = "b";
511                     // consistent = likelyMax.equals(dcFromLangScript);
512                 } else { // c
513                     caseNumber = "c";
514                     if (dc == null) {
515                         if (EXPECT_EQUALITY) {
516                             String expected = base2scripts.get(source) == null ? likelyMaxLang
517                                 + "_" + likelyMaxRegion
518                                 : likelyMaxLang + "_" + likelyMaxScript;
519                             errln("Default contents null for " + source
520                                 + ", expected:\t" + expected);
521                             additionalDefaultContents.add(expected);
522                         }
523                         continue;
524                     }
525                     String dcScript = ltp.set(dc).getScript();
526                     consistent = likelyLangScript.equals(dc)
527                         && likelyMax.equals(dcFromLangScript)
528                         || dcScript.isEmpty()
529                             && !likelyMax.equals(dcFromLangScript);
530                     // || dcScript.isEmpty() && dcRegion.equals(likelyMaxRegion)
531                     // && dcFromLangScript == null;
532                 }
533             }
534             if (!consistent) {
535                 errln("default contents inconsistent with likely subtag: ("
536                     + caseNumber + ")" + "\n\t" + source + " => (ls) "
537                     + likelyMax + "\n\t" + source + " => (dc) " + dc
538                     + "\n\t" + likelyLangScript + " => (dc) "
539                     + dcFromLangScript);
540             }
541         }
542         if (additionalDefaultContents.size() != 0) {
543             errln("Suggested additions to supplementalMetadata/../defaultContent:\n"
544                 + CollectionUtilities.join(additionalDefaultContents, " "));
545         }
546     }
547 
verifyDefaultContentsImplicationsForLikelySubtags( LanguageTagParser ltp, Map<String, String> parent2default, Map<String, String> likely2Maximized, Map<String, String> exceptionDcLikely)548     private void verifyDefaultContentsImplicationsForLikelySubtags(
549         LanguageTagParser ltp, Map<String, String> parent2default,
550         Map<String, String> likely2Maximized,
551         Map<String, String> exceptionDcLikely) {
552         // Now check invariants for all Default Contents implications for
553         // LikelySubtags
554         // a) suppose default contents la => la_Scrp.
555         // Then the likely contents for la => la_Scrp_*
556         // b) suppose default contents la => la_RG.
557         // Then the likely contents for la => la_*_RG
558         // c) suppose default contents la_Scrp => la_Scrp_RG.
559         // Then the likely contents of la_Scrp => la_Scrp_RG OR likely contents
560         // for la => la_*_*
561         for (Entry<String, String> parentAndDefault : parent2default.entrySet()) {
562             String source = parentAndDefault.getKey();
563             String dc = parentAndDefault.getValue();
564             String likelyMax = likely2Maximized.get(source);
565 
566             // skip special exceptions
567             String possibleException = exceptionDcLikely.get(dc);
568             if (possibleException != null
569                 && possibleException.equals(likelyMax)) {
570                 continue;
571             }
572 
573             String sourceLang = ltp.set(source).getLanguage();
574             String sourceScript = ltp.getScript();
575             // there cannot be a sourceRegion
576 
577             String dcScript = ltp.set(dc).getScript();
578             String dcRegion = ltp.getRegion();
579 
580             String likelyMaxLang = "", likelyMaxScript = "", likelyMaxRegion = "";
581             if (likelyMax != null) {
582                 likelyMaxLang = ltp.set(likelyMax).getLanguage();
583                 likelyMaxScript = ltp.getScript();
584                 likelyMaxRegion = ltp.getRegion();
585             }
586 
587             String likelyMax2 = likely2Maximized.get(sourceLang);
588 
589             boolean consistent = true;
590 
591             if (sourceScript.isEmpty()) { // a or b
592                 if (!dcScript.isEmpty()) { // a
593                     consistent = likelyMaxLang.equals(source)
594                         && likelyMaxScript.equals(dcScript);
595                 } else { // b
596                     consistent = likelyMaxLang.equals(source)
597                         && likelyMaxRegion.equals(dcRegion);
598                 }
599             } else { // c
600                 consistent = dc.equals(likelyMax) || likelyMax2 != null;
601             }
602             if (!consistent) {
603                 errln("likely subtag inconsistent with default contents: "
604                     + "\n\t"
605                     + source
606                     + " =>( dc) "
607                     + dc
608                     + "\n\t"
609                     + source
610                     + " => (ls) "
611                     + likelyMax
612                     + (source.equals(sourceLang) ? "" : "\n\t" + sourceLang
613                         + " => (ls) " + likelyMax2));
614             }
615         }
616     }
617 
618     /**
619      * Suggest a likely subtag
620      *
621      * @param base
622      * @return
623      */
suggestLikelySubtagFor(String base)624     static String suggestLikelySubtagFor(String base) {
625         SupplementalDataInfo sdi = SupplementalDataInfo.getInstance();
626 
627         CLDRLocale loc = CLDRLocale.getInstance(base);
628 
629         if (!loc.getLanguage().equals(base)) {
630             return " (no suggestion- not a simple language locale)"; // no
631             // suggestion
632             // unless
633             // just
634             // a
635             // language
636             // locale.
637         }
638         Set<BasicLanguageData> basicData = sdi.getBasicLanguageData(base);
639 
640         for (BasicLanguageData bld : basicData) {
641             if (bld.getType() == org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData.Type.primary) {
642                 Set<String> scripts = bld.getScripts();
643                 Set<String> territories = bld.getTerritories();
644 
645                 if (scripts.size() == 1) {
646                     if (territories.size() == 1) {
647                         return createSuggestion(
648                             loc,
649                             CLDRLocale.getInstance(base + "_"
650                                 + scripts.iterator().next() + "_"
651                                 + territories.iterator().next()));
652                     }
653                 }
654                 return "(no suggestion - multiple scripts or territories)";
655             }
656         }
657         return ("(no suggestion- no data)");
658     }
659 
660     /**
661      * Format and return a suggested likelysubtag
662      */
createSuggestion(CLDRLocale loc, CLDRLocale toLoc)663     private static String createSuggestion(CLDRLocale loc, CLDRLocale toLoc) {
664         return " Suggest this to likelySubtags.xml:        <likelySubtag from=\""
665             + loc
666             + "\" to=\""
667             + toLoc
668             + "\"/>\n"
669             + "        <!--{ "
670             + loc.getDisplayName()
671             + "; ?; ? } => { "
672             + loc.getDisplayName()
673             + "; "
674             + toLoc.toULocale().getDisplayScript()
675             + "; "
676             + toLoc.toULocale().getDisplayCountry() + " }-->";
677 
678     }
679 
TestDeprecatedTerritoryDataLocaleIds()680     public void TestDeprecatedTerritoryDataLocaleIds() {
681         HashSet<String> checked = new HashSet<String>();
682         for (String language : dataInfo
683             .getLanguagesForTerritoriesPopulationData()) {
684             checkLocale(language, false); // checks la_Scrp and la
685             for (String region : dataInfo
686                 .getTerritoriesForPopulationData(language)) {
687                 if (!checked.contains(region)) {
688                     checkValidCode(language + "_" + region, "territory",
689                         region, false);
690                     checked.add(region);
691                 }
692             }
693         }
694         for (String language : dataInfo.getBasicLanguageDataLanguages()) {
695             checkLocale(language, false); // checks la_Scrp and la
696             Set<BasicLanguageData> data = dataInfo
697                 .getBasicLanguageData(language);
698             for (BasicLanguageData datum : data) {
699                 for (String script : datum.getScripts()) {
700                     checkValidCode(language + "_" + script, "script", script,
701                         false);
702                     checked.add(script);
703                 }
704                 for (String region : datum.getTerritories()) {
705                     checkValidCode(language + "_" + region, "territory",
706                         region, false);
707                     checked.add(region);
708                 }
709             }
710         }
711 
712     }
713 
TestBasicLanguageDataAgainstScriptMetadata()714     public void TestBasicLanguageDataAgainstScriptMetadata() {
715         // the invariants are:
716         // if there is primary data, the script must be there
717         // otherwise it must be in the secondary
718         main: for (String script : ScriptMetadata.getScripts()) {
719             Info info = ScriptMetadata.getInfo(script);
720             String language = info.likelyLanguage;
721             if (language.equals("und")) {
722                 continue;
723             }
724             Map<Type, BasicLanguageData> data = dataInfo
725                 .getBasicLanguageDataMap(language);
726             if (data == null) {
727                 logln("Warning: ScriptMetadata has " + language + " for "
728                     + script + "," + " but " + language
729                     + " is missing in language_script.txt");
730                 continue;
731             }
732             for (BasicLanguageData entry : data.values()) {
733                 if (entry.getScripts().contains(script)) {
734                     continue main;
735                 }
736                 continue;
737             }
738             logln("Warning: ScriptMetadata has " + language + " for " + script
739                 + "," + " but " + language + " doesn't have " + script
740                 + " in language_script.txt");
741         }
742     }
743 
TestCldrFileConsistency()744     public void TestCldrFileConsistency() {
745         boolean haveErrors = false;
746         for (String locale : testInfo.getCldrFactory().getAvailable()) {
747             CLDRFile cldrFileToCheck = testInfo.getCLDRFile(locale,
748                 false);
749             int errors = 0;
750             for (String path : cldrFileToCheck) {
751                 if (!pathMatcher.reset(path).find()) {
752                     continue;
753                 }
754                 String fullPath = cldrFileToCheck.getFullXPath(path);
755                 if (fullPath == null) {
756                     // try again, for debugging
757                     fullPath = cldrFileToCheck.getFullXPath(path);
758                     String value = cldrFileToCheck.getStringValue(path);
759                     if (DEBUG) {
760                         errln("Invalid full path\t" + locale + ", " + path
761                             + ", " + fullPath + ", " + value);
762                     }
763                     errors++;
764                     haveErrors = true;
765                 }
766             }
767             if (errors != 0) {
768                 errln(locale
769                     + (errors != 0 ? "\tinvalid getFullXPath() values:"
770                         + errors : ""));
771             } else {
772                 logln(locale);
773             }
774         }
775         if (haveErrors && !DEBUG) {
776             errln("Use -DDEBUG to see details");
777         }
778     }
779 
    // Supplemental data instance returned by SupplementalDataInfo.getInstance().
    // NOTE(review): in this part of the file it is referenced only from the
    // commented-out TestAliases code below - confirm other uses before removing.
    static SupplementalDataInfo info = SupplementalDataInfo.getInstance();
    // Per-instance parser. checkLocale and TestLanguageTagParser declare their
    // own local variable named ltp, which shadows this field inside them.
    LanguageTagParser ltp = new LanguageTagParser();
782 
783     // public void TestAliases() {
784     // Factory factory = Factory.make(CldrUtility.MAIN_DIRECTORY, fileMatcher);
785     // Set<String> allLocales = Factory.make(CldrUtility.MAIN_DIRECTORY,
786     // ".*").getAvailable();
787     //
788     // LanguageTagCanonicalizer languageTagCanonicalizer = new
789     // LanguageTagCanonicalizer();
790     //
791     // Set<String> defaultContents = info.getDefaultContentLocales();
792     //
793     // Map<String, String> likelySubtags = info.getLikelySubtags();
794     //
795     // XPathParts xpp = new XPathParts();
796     //
797     // // get the top level aliases, and verify that they are consistent with
798     // // maximization
799     // Map<String, String> topLevelAliases = new TreeMap<String, String>();
800     // Set<String> crossScriptSet = new TreeSet<String>();
801     // Set<String> aliasPaths = new TreeSet<String>();
802     // Set<String> locales = factory.getAvailable();
803     //
804     // // get the languages that need scripts
805     // // TODO broaden to beyond CLDR
806     // Set<String> needScripts = new TreeSet<String>();
807     // for (String locale : locales) {
808     // String script = ltp.set(locale).getScript();
809     // if (script.length() != 0) {
810     // needScripts.add(ltp.getLanguage());
811     // }
812     // }
813     //
814     // logln("Languages that have scripts:\t" + needScripts);
815     //
816     // for (String locale : locales) {
817     //
818     // // get alias locale
819     // String aliasLocale = locale;
820     // String explicitAlias = null;
821     // String aliasPathNew = null;
822     // CLDRFile cldrFileToCheck = factory.make(locale, false);
823     // aliasPaths.clear();
824     // // examples:
825     // // in: <alias source="id" path="//ldml"/>
826     // // ar_IR: <alias source="az_Arab_IR" path="//ldml"/>
827     //
828     // cldrFileToCheck.getPaths("//ldml/alias", null, aliasPaths);
829     // if (aliasPaths.size() != 0) {
830     // String aliasPath = aliasPaths.iterator().next();
831     // String fullPath = cldrFileToCheck.getFullXPath(aliasPath);
832     // explicitAlias = aliasLocale = xpp.set(fullPath).getAttributeValue(1,
833     // "source");
834     // String aliasParent = LocaleIDParser.getParent(aliasLocale);
835     // if (!aliasParent.equals("root")) {
836     // topLevelAliases.put(locale, aliasParent);
837     // }
838     // aliasPathNew = xpp.set(fullPath).getAttributeValue(1, "path");
839     // if ("//ldml/".equals(aliasPathNew)) {
840     // errln("Bad alias path:\t" + fullPath);
841     // }
842     // }
843     //
844     // checkAliasValues(cldrFileToCheck, allLocales);
845     //
846     // // get canonicalized
847     // String canonicalizedLocale = languageTagCanonicalizer.transform(locale);
848     // if (!locale.equals(canonicalizedLocale)) {
849     // logln("Locale\t" + locale + " => " + canonicalizedLocale);
850     // }
851     //
852     // String base = ltp.set(canonicalizedLocale).getLanguage();
853     // String script = ltp.getScript();
854     // if (canonicalizedLocale.equals(base)) { // eg, id, az
855     // continue;
856     // }
857     //
858     // // see if the locale's default script is the same as the base locale's
859     //
860     // String maximized = maximize(likelySubtags, canonicalizedLocale);
861     // if (maximized == null) {
862     // errln("Missing likely subtags for:\t" + locale + "  " +
863     // suggestLikelySubtagFor(locale));
864     // continue;
865     // }
866     // String maximizedScript = ltp.set(maximized).getScript();
867     //
868     // String minimized = minimize(likelySubtags, canonicalizedLocale);
869     //
870     // String baseMaximized = maximize(likelySubtags, base);
871     // String baseScript = ltp.set(baseMaximized).getScript();
872     //
873     // if (script.length() != 0 && !script.equals(baseScript)) {
874     // crossScriptSet.add(ltp.set(locale).getLanguageScript());
875     // }
876     //
877     // // Finally, put together the expected alias for comparison.
878     // // It is the "best" alias, in that the default-content locales are
879     // skipped in favor of their parents
880     //
881     // String expectedAlias =
882     // !baseScript.equals(maximizedScript) ? minimized :
883     // !locale.equals(canonicalizedLocale) ? canonicalizedLocale :
884     // // needScripts.contains(base) ? ltp.getLanguageScript() :
885     // locale;
886     //
887     // if (!equals(aliasLocale, expectedAlias)) {
888     // String aliasMaximized = maximize(likelySubtags, aliasLocale);
889     // String expectedMaximized = maximize(likelySubtags, expectedAlias);
890     // if (!equals(aliasMaximized, expectedMaximized)) {
891     // errln("For locale:\t" + locale
892     // + ",\tbase-script:\t" + baseScript
893     // + ",\texpected alias Locale != actual alias Locale:\t"
894     // + expectedAlias + ", " + aliasLocale);
895     // } else if (explicitAlias == null) {
896     // // skip, we don't care in this case
897     // // but we emit warnings if the other conditions are true. The aliasing
898     // could be simpler.
899     // } else if (equals(expectedAlias, locale)) {
900     // logln("Warning; alias could be omitted. For locale:\t" + locale
901     // + ",\tbase-script:\t" + baseScript
902     // + ",\texpected alias Locale != actual alias Locale:\t"
903     // + expectedAlias + ", " + aliasLocale);
904     // } else {
905     // logln("Warning; alias could be minimized. For locale:\t" + locale
906     // + ",\tbase-script:\t" + baseScript
907     // + ",\texpected alias Locale != actual alias Locale:\t"
908     // + expectedAlias + ", " + aliasLocale);
909     // }
910     // }
911     // }
912     //
913     // // check the LocaleIDParser.TOP_LEVEL_ALIAS_LOCALES value and make sure
914     // it matches what is in the files in main/
915     //
916     // if (!topLevelAliases.equals(LocaleIDParser.TOP_LEVEL_ALIAS_LOCALES)
917     // && locales.equals(allLocales)) {
918     // String diff = showDifferences(LocaleIDParser.TOP_LEVEL_ALIAS_LOCALES,
919     // topLevelAliases);
920     // if (!diff.isEmpty()) {
921     // errln("LocaleIDParser.TOP_LEVEL_ALIAS_LOCALES ≠ topLevelAliases: " +
922     // diff);
923     // }
924     // StringBuilder result = new StringBuilder(
925     // "Suggest changing LocaleIDParser.TOP_LEVEL_ALIAS_LOCALES to:\n");
926     // for (Entry<String, String> entry : topLevelAliases.entrySet()) {
927     // result.append("\t.put(\"")
928     // .append(entry.getKey())
929     // .append("\", \"")
930     // .append(entry.getValue())
931     // .append("\")\n");
932     // }
933     // errln(result.toString());
934     // } else {
935     // logln("Top Level Aliases:\t" + topLevelAliases);
936     // }
937     //
938     // // verify that they are the same as what we would get if we were to
939     // maximize
940     // // all the locales and check against default_contents
941     //
942     // for (String locale : defaultContents) {
943     // CLDRFile cldrFileToCheck = null;
944     // try {
945     // cldrFileToCheck = factory.make(locale, false);
946     // } catch (Exception e) {}
947     // if (cldrFileToCheck == null) {
948     // logln("Present in default contents but has no XML file:\t" + locale);
949     // continue;
950     // }
951     // logln("Locale:\t" + locale);
952     // // verify empty, except for identity elements and alias
953     // for (String path : cldrFileToCheck) {
954     // if (path.contains("/identity/")) {
955     // continue;
956     // }
957     // errln("Default content locale not empty:\t" + locale + ", " + path);
958     // break;
959     // }
960     // }
961     // }
962 
    // Matcher for the pattern "//ldml.*/alias.*"; checkAliasValues hands it to
    // CLDRFile.getPaths as the path filter. A java.util.regex.Matcher carries
    // mutable match state, so this instance must not be shared across threads.
    Matcher aliasMatcher = PatternCache.get("//ldml.*/alias.*").matcher("");
964 
checkAliasValues(CLDRFile cldrFileToCheck, Set<String> locales)965     private void checkAliasValues(CLDRFile cldrFileToCheck, Set<String> locales) {
966         Set<String> aliasPaths = new TreeSet<String>();
967         Set<String> allAliasPaths = cldrFileToCheck.getPaths("//ldml/",
968             aliasMatcher, aliasPaths);
969         XPathParts xpp = new XPathParts();
970         for (String aliasPath : allAliasPaths) {
971             if (aliasPath.startsWith("//ldml/alias")) {
972                 continue; // we have different tests elsewhere
973             }
974             String fullPath = cldrFileToCheck.getFullXPath(aliasPath);
975             String aliasLocale = xpp.set(fullPath).getAttributeValue(-1,
976                 "source");
977             // just check to make sure that the alias is in the locales
978             if (aliasLocale != null && !aliasLocale.equals("locale")) {
979                 if (!locales.contains(aliasLocale)) {
980                     errln("Unknown Alias:\t" + aliasLocale + "\t in\t"
981                         + fullPath);
982                 }
983             }
984             String aliasPathNew = xpp.set(fullPath).getAttributeValue(-1,
985                 "path");
986             // just one check
987             if (".".equals(aliasPathNew)) {
988                 errln("Illegal path, must not be .:\t" + aliasLocale
989                     + "\t in\t" + fullPath);
990             }
991 
992         }
993     }
994 
minimize(Map<String, String> likelySubtags, String locale)995     private String minimize(Map<String, String> likelySubtags, String locale) {
996         String result = GenerateMaximalLocales.minimize(locale, likelySubtags,
997             false);
998         if (result == null) {
999             LanguageTagParser ltp3 = new LanguageTagParser().set(locale);
1000             List<String> variants = ltp3.getVariants();
1001             Map<String, String> extensions = ltp3.getExtensions();
1002             Set<String> emptySet = Collections.emptySet();
1003             ltp3.setVariants(emptySet);
1004             Map<String, String> emptyMap = Collections.emptyMap();
1005             ltp3.setExtensions(emptyMap);
1006             String newLocale = ltp3.toString();
1007             result = GenerateMaximalLocales.minimize(newLocale, likelySubtags,
1008                 false);
1009             if (result != null) {
1010                 ltp3.set(result);
1011                 ltp3.setVariants(variants);
1012                 ltp3.setExtensions(extensions);
1013                 result = ltp3.toString();
1014             }
1015         }
1016         return result;
1017     }
1018 
maximize(Map<String, String> likelySubtags, String locale)1019     private String maximize(Map<String, String> likelySubtags, String locale) {
1020         String result = GenerateMaximalLocales.maximize(locale, likelySubtags);
1021         if (result == null) {
1022             LanguageTagParser ltp3 = new LanguageTagParser().set(locale);
1023             List<String> variants = ltp3.getVariants();
1024             Map<String, String> extensions = ltp3.getExtensions();
1025             Set<String> emptySet = Collections.emptySet();
1026             ltp3.setVariants(emptySet);
1027             Map<String, String> emptyMap = Collections.emptyMap();
1028             ltp3.setExtensions(emptyMap);
1029             String newLocale = ltp3.toString();
1030             result = GenerateMaximalLocales.maximize(newLocale, likelySubtags);
1031             if (result != null) {
1032                 ltp3.set(result);
1033                 ltp3.setVariants(variants);
1034                 ltp3.setExtensions(extensions);
1035                 result = ltp3.toString();
1036             }
1037         }
1038         return result;
1039     }
1040 
1041     // TODO move this into central utilities
equals(CharSequence string, int codePoint)1042     public static boolean equals(CharSequence string, int codePoint) {
1043         if (string == null) {
1044             return false;
1045         }
1046         switch (string.length()) {
1047         case 1:
1048             return codePoint == string.charAt(0);
1049         case 2:
1050             return codePoint >= 0x10000
1051                 && codePoint == Character.codePointAt(string, 0);
1052         default:
1053             return false;
1054         }
1055     }
1056 
1057     // TODO move this into central utilities
1058 
    // Code registry taken from the shared test configuration; checkValidCode
    // and getName query it for available codes and code descriptions.
    private static final StandardCodes STANDARD_CODES = testInfo.getStandardCodes();
    // Locale alias data, looked up by subtag type and then by subtag; per the
    // note in checkValidCode the shape is
    // "language" -> "sh" -> <{"sr_Latn"}, reason>.
    private static final Map<String, Map<String, R2<List<String>, String>>> DEPRECATED_INFO = dataInfo
        .getLocaleAliasInfo();
1062 
checkLocale(String localeID, boolean allowDeprecated)1063     private void checkLocale(String localeID, boolean allowDeprecated) {
1064         // verify that the localeID is valid
1065         LanguageTagParser ltp = new LanguageTagParser().set(localeID);
1066         String language = ltp.getLanguage();
1067         String script = ltp.getScript();
1068         String region = ltp.getRegion();
1069         // TODO check variants, extensions also.
1070         checkValidCode(localeID, "language", language, allowDeprecated);
1071         checkValidCode(localeID, "script", script, allowDeprecated);
1072         checkValidCode(localeID, "territory", region, allowDeprecated);
1073     }
1074 
checkValidCode(String localeID, String subtagType, String subtag, boolean allowDeprecated)1075     private void checkValidCode(String localeID, String subtagType,
1076         String subtag, boolean allowDeprecated) {
1077         if (subtagType.equals("language")) {
1078             if (subtag.equals("und")) {
1079                 return;
1080             }
1081         } else {
1082             if (subtag.isEmpty()) {
1083                 return;
1084             }
1085         }
1086         if (!STANDARD_CODES.getAvailableCodes(subtagType).contains(subtag)) {
1087             errln("Locale " + localeID + " contains illegal "
1088                 + showCode(subtagType, subtag));
1089         } else if (!allowDeprecated) {
1090             // "language" -> "sh" -> <{"sr_Latn"}, reason>
1091             R2<List<String>, String> deprecatedInfo = DEPRECATED_INFO.get(
1092                 subtagType).get(subtag);
1093             if (deprecatedInfo != null) {
1094                 errln("Locale " + localeID + " contains deprecated "
1095                     + showCode(subtagType, subtag) + " "
1096                     + deprecatedInfo.get1() + "; suggest "
1097                     + showName(deprecatedInfo.get0(), subtagType));
1098             }
1099         }
1100     }
1101 
showName(List<String> deprecatedInfo, String subtagType)1102     private String showName(List<String> deprecatedInfo, String subtagType) {
1103         StringBuilder result = new StringBuilder();
1104         for (String s : deprecatedInfo) {
1105             result.append(showName(subtagType, s)).append(" ");
1106         }
1107         return result.toString();
1108     }
1109 
showCode(String subtagType, String subtag)1110     private String showCode(String subtagType, String subtag) {
1111         return subtagType + " code: " + showName(subtagType, subtag);
1112     }
1113 
showName(String subtagType, String subtag)1114     private String showName(String subtagType, String subtag) {
1115         return subtag + " (" + getName(subtagType, subtag) + ")";
1116     }
1117 
getName(String subtagType, String subtag)1118     private String getName(String subtagType, String subtag) {
1119         Map<String, String> data = STANDARD_CODES.getLangData(subtagType,
1120             subtag);
1121         if (data == null) {
1122             return "<no name>";
1123         }
1124         return data.get("Description");
1125     }
1126 
1127     // TODO move this into central utilities
    /**
     * Argument-swapped convenience overload: delegates to
     * {@code equals(CharSequence, int)}.
     *
     * @param codePoint the code point to compare against
     * @param string    the sequence to test
     * @return whatever {@code equals(string, codePoint)} returns
     */
    public static boolean equals(int codePoint, CharSequence string) {
        return equals(string, codePoint);
    }
1131 
1132     // TODO move this into central utilities
equals(Object a, Object b)1133     public static boolean equals(Object a, Object b) {
1134         return a == b ? true : a == null || b == null ? false : a.equals(b);
1135     }
1136 
1137     // TODO move this into central utilities
showDifferences(Map<K, V> a, Map<K, V> b)1138     private <K, V> String showDifferences(Map<K, V> a, Map<K, V> b) {
1139         StringBuilder result = new StringBuilder();
1140         Set<K> keys = new LinkedHashSet<K>();
1141         keys.addAll(a.keySet());
1142         keys.addAll(b.keySet());
1143         for (K key : keys) {
1144             if (!a.containsKey(key)) {
1145                 result.append(key).append("→‹").append(a.get(key))
1146                     .append("›,∅; ");
1147             } else if (!b.containsKey(key)) {
1148                 result.append(key).append("→∅,‹").append(b.get(key))
1149                     .append("›; ");
1150             } else {
1151                 V aKey = a.get(key);
1152                 V bKey = b.get(key);
1153                 if (!equals(aKey, bKey)) {
1154                     result.append(key).append("→‹").append(a.get(key))
1155                         .append("›,‹").append(b.get(key)).append("›; ");
1156                 }
1157             }
1158         }
1159         return result.toString();
1160     }
1161 
TestLanguageTagParser()1162     public void TestLanguageTagParser() {
1163         LanguageTagParser ltp = new LanguageTagParser();
1164         ltp.set("en-Cyrl-US");
1165         assertEquals(null, "en", ltp.getLanguage());
1166         assertEquals(null, "en_Cyrl", ltp.getLanguageScript());
1167         assertEquals(null, "Cyrl", ltp.getScript());
1168         assertEquals(null, "US", ltp.getRegion());
1169         try {
1170             ltp.set("$");
1171             assertFalse("expected exception", true);
1172         } catch (Exception e) {
1173             logln(e.getMessage());
1174         }
1175     }
1176 }
1177