• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.tool;
2 
3 import java.io.BufferedReader;
4 import java.io.File;
5 import java.io.IOException;
6 import java.io.PrintWriter;
7 import java.util.Arrays;
8 import java.util.BitSet;
9 import java.util.Collection;
10 import java.util.Comparator;
11 import java.util.HashMap;
12 import java.util.HashSet;
13 import java.util.LinkedHashSet;
14 import java.util.List;
15 import java.util.Map;
16 import java.util.Map.Entry;
17 import java.util.Set;
18 import java.util.TreeMap;
19 import java.util.TreeSet;
20 
21 import org.unicode.cldr.draft.FileUtilities;
22 import org.unicode.cldr.draft.ScriptMetadata;
23 import org.unicode.cldr.draft.ScriptMetadata.Info;
24 import org.unicode.cldr.util.Builder;
25 import org.unicode.cldr.util.CLDRConfig;
26 import org.unicode.cldr.util.CLDRFile;
27 import org.unicode.cldr.util.CLDRLocale;
28 import org.unicode.cldr.util.CLDRPaths;
29 import org.unicode.cldr.util.CldrUtility;
30 import org.unicode.cldr.util.Containment;
31 import org.unicode.cldr.util.Counter;
32 import org.unicode.cldr.util.Factory;
33 import org.unicode.cldr.util.Iso639Data;
34 import org.unicode.cldr.util.Iso639Data.Scope;
35 import org.unicode.cldr.util.LanguageTagParser;
36 import org.unicode.cldr.util.LocaleIDParser;
37 import org.unicode.cldr.util.Log;
38 import org.unicode.cldr.util.Organization;
39 import org.unicode.cldr.util.PatternCache;
40 import org.unicode.cldr.util.SimpleFactory;
41 import org.unicode.cldr.util.StandardCodes;
42 import org.unicode.cldr.util.StandardCodes.LstrType;
43 import org.unicode.cldr.util.SupplementalDataInfo;
44 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData;
45 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData.Type;
46 import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus;
47 import org.unicode.cldr.util.SupplementalDataInfo.PopulationData;
48 import org.unicode.cldr.util.Validity;
49 import org.unicode.cldr.util.Validity.Status;
50 
51 import com.google.common.base.Joiner;
52 import com.google.common.collect.ImmutableList;
53 import com.google.common.collect.ImmutableMap;
54 import com.google.common.collect.ImmutableSet;
55 import com.ibm.icu.impl.Relation;
56 import com.ibm.icu.impl.Row;
57 import com.ibm.icu.impl.Row.R2;
58 import com.ibm.icu.impl.Row.R3;
59 import com.ibm.icu.impl.Row.R4;
60 import com.ibm.icu.lang.UScript;
61 import com.ibm.icu.text.Collator;
62 import com.ibm.icu.text.NumberFormat;
63 import com.ibm.icu.text.UTF16;
64 import com.ibm.icu.text.UnicodeSet;
65 import com.ibm.icu.text.UnicodeSetIterator;
66 import com.ibm.icu.util.ULocale;
67 
68 /**
69  * Problems:
70  * "und_Hani", "zh_Hani"
71  * "und_Sinh", "si_Sinh"
72  *
73  * @author markdavis
74  *
75  */
76 public class GenerateMaximalLocales {
77 
78     private static final Map<String, Status> LANGUAGE_CODE_TO_STATUS = Validity.getInstance().getCodeToStatus(LstrType.language);
79 
80     private static final String TEMP_UNKNOWN_REGION = "XZ";
81 
82     private static final String DEBUG_ADD_KEY = "und_Latn_ZA";
83 
84     private static final boolean SHOW_ADD = CldrUtility.getProperty("GenerateMaximalLocalesDebug", false);
85     private static final boolean SUPPRESS_CHANGES = CldrUtility.getProperty("GenerateMaximalLocalesSuppress", false);
86     private static final boolean SHOW_CONTAINERS = false;
87 
88     enum OutputStyle {
89         PLAINTEXT, C, C_ALT, XML
90     }
91 
92     private static OutputStyle OUTPUT_STYLE = OutputStyle.valueOf(CldrUtility.getProperty("OutputStyle", "XML", "XML")
93         .toUpperCase());
94 
95     // set based on above
96     private static final String SEPARATOR = OUTPUT_STYLE == OutputStyle.C || OUTPUT_STYLE == OutputStyle.C_ALT ? CldrUtility.LINE_SEPARATOR
97         : "\t";
98     private static final String TAG_SEPARATOR = OUTPUT_STYLE == OutputStyle.C_ALT ? "-" : "_";
99     // private static final boolean FAVOR_REGION = true; // OUTPUT_STYLE == OutputStyle.C_ALT;
100 
101     private static final boolean tryDifferent = true;
102 
103     private static final File list[] = {
104         new File(CLDRPaths.MAIN_DIRECTORY),
105         new File(CLDRPaths.SEED_DIRECTORY),
106         new File(CLDRPaths.EXEMPLARS_DIRECTORY) };
107 
108     private static Factory factory = SimpleFactory.make(list, ".*");
109     private static Factory mainFactory = CLDRConfig.getInstance().getCldrFactory();
110     private static SupplementalDataInfo supplementalData = SupplementalDataInfo
111         .getInstance(CLDRPaths.SUPPLEMENTAL_DIRECTORY);
112     private static StandardCodes standardCodes = StandardCodes.make();
113     private static CLDRFile english = factory.make("en", false);
114     static Relation<String, String> cldrContainerToLanguages = Relation.of(new HashMap<String, Set<String>>(), HashSet.class);
115     static {
116         for (CLDRLocale locale : ToolConfig.getToolInstance().getCldrFactory().getAvailableCLDRLocales()) {
117             String region = locale.getCountry();
118             if (region == null || region.isEmpty() || Containment.isLeaf(region)) {
119                 continue;
120             }
cldrContainerToLanguages.put(region, locale.getLanguage())121             cldrContainerToLanguages.put(region, locale.getLanguage());
122         }
cldrContainerToLanguages.freeze()123         cldrContainerToLanguages.freeze();
124         System.out.println("Keep containers " + cldrContainerToLanguages);
125     }
126 
127     private static final List<String> KEEP_TARGETS = Arrays.asList("und_Arab_PK", "und_Latn_ET");
128     private static final ImmutableSet<String> deprecatedISONotInLST = ImmutableSet.of("scc", "scr");
129 
130     /**
131      * This is the simplest way to override, by supplying the max value.
132      * It gets a very low weight, so doesn't override any stronger value.
133      */
134     private static final String[] MAX_ADDITIONS = new String[] {
135         "bss_Latn_CM",
136         "gez_Ethi_ET",
137         "ken_Latn_CM",
138         "und_Arab_PK",
139         "wa_Latn_BE",
140 
141         "fub_Arab_CM",
142         "fuf_Latn_GN",
143         "kby_Arab_NE",
144         "kdh_Latn_TG",
145         "apd_Arab_TG",
146         "zlm_Latn_TG",
147 
148         "cr_Cans_CA",
149         "hif_Latn_FJ",
150         "gon_Telu_IN",
151         "lzz_Latn_TR",
152         "lif_Deva_NP",
153         "unx_Beng_IN",
154         "unr_Beng_IN",
155         "ttt_Latn_AZ",
156         "pnt_Grek_GR",
157         "tly_Latn_AZ",
158         "tkr_Latn_AZ",
159         "bsq_Bass_LR",
160         "ccp_Cakm_BD",
161         "blt_Tavt_VN",
162         "rhg_Arab_MM",
163         "rhg_Rohg_MM",
164 
165         "no_Latn_NO",
166         "und_Cpmn_CY",
167     };
168 
169     /**
170      * The following overrides MASH the final values, so they may not result in consistent results. Safer is to add to MAX_ADDITIONS.
171      * However, if you add, add both the language and language+script mappings.
172      */
173     // Many of the overrides below can be removed once the language/pop/country data is updated.
174     private static final Map<String, String> LANGUAGE_OVERRIDES = CldrUtility.asMap(new String[][] {
175         { "cic", "cic_Latn_US" },
176         { "cic_Latn", "cic_Latn_US" },
177         { "eo", "eo_Latn_001" },
178         { "eo_Latn", "eo_Latn_001" },
179         { "es", "es_Latn_ES" },
180         { "es_Latn", "es_Latn_ES" },
181         { "ff_BF", "ff_Latn_BF" },
182         { "ff_GM", "ff_Latn_GM" },
183         { "ff_GH", "ff_Latn_GH" },
184         { "ff_GW", "ff_Latn_GW" },
185         { "ff_LR", "ff_Latn_LR" },
186         { "ff_NE", "ff_Latn_NE" },
187         { "ff_NG", "ff_Latn_NG" },
188         { "ff_SL", "ff_Latn_SL" },
189         { "ff_Adlm", "ff_Adlm_GN" },
190         { "ia", "ia_Latn_001" },
191         { "ia_Latn", "ia_Latn_001" },
192         { "io", "io_Latn_001" },
193         { "io_Latn", "io_Latn_001" },
194         { "jbo", "jbo_Latn_001" },
195         { "jbo_Latn", "jbo_Latn_001" },
196         { "ku_Arab", "ku_Arab_IQ" },
197         { "lrc", "lrc_Arab_IR" },
198         { "lrc_Arab", "lrc_Arab_IR" },
199         { "man", "man_Latn_GM" },
200         { "man_Latn", "man_Latn_GM" },
201         { "mas", "mas_Latn_KE" },
202         { "mas_Latn", "mas_Latn_KE" },
203         { "mn", "mn_Cyrl_MN" },
204         { "mn_Cyrl", "mn_Cyrl_MN" },
205         { "mro", "mro_Mroo_BD" },
206         { "mro_BD", "mro_Mroo_BD" },
207         { "ms_Arab", "ms_Arab_MY" },
208         { "pap", "pap_Latn_AW" },
209         { "pap_Latn", "pap_Latn_AW" },
210         { "prg", "prg_Latn_001" },
211         { "prg_Latn", "prg_Latn_001" },
212         { "rif", "rif_Tfng_MA" },
213         { "rif_Latn", "rif_Latn_MA" },
214         { "rif_Tfng", "rif_Tfng_MA" },
215         { "rif_MA", "rif_Tfng_MA" },
216         { "shi", "shi_Tfng_MA" },
217         { "shi_Tfng", "shi_Tfng_MA" },
218         { "shi_MA", "shi_Tfng_MA" },
219         { "sr_Latn", "sr_Latn_RS" },
220         { "ss", "ss_Latn_ZA" },
221         { "ss_Latn", "ss_Latn_ZA" },
222         { "swc", "swc_Latn_CD" },
223         { "ti", "ti_Ethi_ET" },
224         { "ti_Ethi", "ti_Ethi_ET" },
225         { "und", "en_Latn_US" },
226         { "und_Adlm", "ff_Adlm_GN" },
227         { "und_Adlm_GN", "ff_Adlm_GN" },
228         { "und_Arab", "ar_Arab_EG" },
229         { "und_Arab_PK", "ur_Arab_PK" },
230         { "und_Bopo", "zh_Bopo_TW" },
231         { "und_Deva_FJ", "hif_Deva_FJ" },
232         { "und_EZ", "de_Latn_EZ" },
233         { "und_Hani", "zh_Hani_CN" },
234         { "und_Hani_CN", "zh_Hani_CN" },
235         { "und_Kana", "ja_Kana_JP" },
236         { "und_Kana_JP", "ja_Kana_JP" },
237         { "und_Latn", "en_Latn_US" },
238         { "und_Latn_ET", "en_Latn_ET" },
239         { "und_Latn_NE", "ha_Latn_NE" },
240         { "und_Latn_PH", "fil_Latn_PH" },
241         { "und_ML", "bm_Latn_ML" },
242         { "und_Latn_ML", "bm_Latn_ML" },
243         { "und_MU", "mfe_Latn_MU" },
244         { "und_NE", "ha_Latn_NE" },
245         { "und_PH", "fil_Latn_PH" },
246         { "und_PK", "ur_Arab_PK" },
247         { "und_SO", "so_Latn_SO" },
248         { "und_SS", "en_Latn_SS" },
249         { "und_TK", "tkl_Latn_TK" },
250         { "und_UN", "en_Latn_UN" },
251         { "und_005", "pt_Latn_BR" },
252         { "vo", "vo_Latn_001" },
253         { "vo_Latn", "vo_Latn_001" },
254         { "yi", "yi_Hebr_001" },
255         { "yi_Hebr", "yi_Hebr_001" },
256         { "yue", "yue_Hant_HK" },
257         { "yue_Hant", "yue_Hant_HK" },
258         { "yue_Hans", "yue_Hans_CN" },
259         { "yue_CN", "yue_Hans_CN" },
260         { "zh_Hani", "zh_Hani_CN" },
261 
262         { "zh_Bopo", "zh_Bopo_TW" },
263         { "ccp", "ccp_Cakm_BD" },
264         { "ccp_Cakm", "ccp_Cakm_BD" },
265         { "und_Cakm", "ccp_Cakm_BD" },
266         { "cu_Glag", "cu_Glag_BG" },
267         { "sd_Khoj", "sd_Khoj_IN" },
268         { "lif_Limb", "lif_Limb_IN" },
269         { "grc_Linb", "grc_Linb_GR" },
270         { "arc_Nbat", "arc_Nbat_JO" },
271         { "arc_Palm", "arc_Palm_SY" },
272         { "pal_Phlp", "pal_Phlp_CN" },
273         { "en_Shaw", "en_Shaw_GB" },
274         { "sd_Sind", "sd_Sind_IN" },
275         { "und_Brai", "fr_Brai_FR" }, // hack
276         { "und_Hanb", "zh_Hanb_TW" }, // Special script code
277         { "zh_Hanb", "zh_Hanb_TW" }, // Special script code
278         { "und_Jamo", "ko_Jamo_KR" }, // Special script code
279 
280         //{"und_Cyrl_PL", "be_Cyrl_PL"},
281 
282 //        {"cr", "cr_Cans_CA"},
283 //        {"hif", "hif_Latn_FJ"},
284 //        {"gon", "gon_Telu_IN"},
285 //        {"lzz", "lzz_Latn_TR"},
286 //        {"lif", "lif_Deva_NP"},
287 //        {"unx", "unx_Beng_IN"},
288 //        {"unr", "unr_Beng_IN"},
289 //        {"ttt", "ttt_Latn_AZ"},
290 //        {"pnt", "pnt_Grek_GR"},
291 //        {"tly", "tly_Latn_AZ"},
292 //        {"tkr", "tkr_Latn_AZ"},
293 //        {"bsq", "bsq_Bass_LR"},
294 //        {"ccp", "ccp_Cakm_BD"},
295 //        {"blt", "blt_Tavt_VN"},
296 //        { "mis_Medf", "mis_Medf_NG" },
297 
298         { "ku_Yezi", "ku_Yezi_GE" },
299         { "und_EU", "en_Latn_IE" },
300     });
301 
302     /**
303      * The following supplements the suppress-script. It overrides info from exemplars and the locale info.
304      */
305     private static String[][] SpecialScripts = {
306         { "zh", "Hans" }, // Hans (not Hani)
307         { "yue", "Hant" }, // Hant (not Hani)
308         { "chk", "Latn" }, // Chuukese (Micronesia)
309         { "fil", "Latn" }, // Filipino (Philippines)"
310         { "ko", "Kore" }, // Korean (North Korea)
311         { "ko_KR", "Kore" }, // Korean (South Korea)
312         { "pap", "Latn" }, // Papiamento (Netherlands Antilles)
313         { "pau", "Latn" }, // Palauan (Palau)
314         { "su", "Latn" }, // Sundanese (Indonesia)
315         { "tet", "Latn" }, // Tetum (East Timor)
316         { "tk", "Latn" }, // Turkmen (Turkmenistan)
317         { "ty", "Latn" }, // Tahitian (French Polynesia)
318         { "ja", "Jpan" }, // Special script for japan
319         { "und", "Latn" }, // Ultimate fallback
320     };
321 
322     private static Map<String, String> localeToScriptCache = new TreeMap<>();
323     static {
324         for (String language : standardCodes.getAvailableCodes("language")) {
325             Map<String, String> info = standardCodes.getLangData("language", language);
326             String script = info.get("Suppress-Script");
327             if (script != null) {
localeToScriptCache.put(language, script)328                 localeToScriptCache.put(language, script);
329             }
330         }
331         for (String[] pair : SpecialScripts) {
localeToScriptCache.put(pair[0], pair[1])332             localeToScriptCache.put(pair[0], pair[1]);
333         }
334     }
335 
336     private static Map<String, String> FALLBACK_SCRIPTS;
337     static {
338         LanguageTagParser additionLtp = new LanguageTagParser();
339         Map<String, String> _FALLBACK_SCRIPTS = new TreeMap<>();
340         for (String addition : MAX_ADDITIONS) {
341             additionLtp.set(addition);
342             String lan = additionLtp.getLanguage();
_FALLBACK_SCRIPTS.put(lan, additionLtp.getScript())343             _FALLBACK_SCRIPTS.put(lan, additionLtp.getScript());
344         }
345         FALLBACK_SCRIPTS = ImmutableMap.copyOf(_FALLBACK_SCRIPTS);
346     }
347 
348     private static int errorCount;
349 
main(String[] args)350     public static void main(String[] args) throws IOException {
351 
352         printDefaultLanguagesAndScripts();
353 
354         Map<String, String> toMaximized = new TreeMap<>();
355 
356         tryDifferentAlgorithm(toMaximized);
357 
358         minimize(toMaximized);
359 
360         // HACK TEMP_UNKNOWN_REGION
361         // this is to get around the removal of items with ZZ in minimize.
362         // probably cleaner way to do it, but this provides control over just those we want to retain.
363         Set<String> toRemove = new TreeSet<>();
364         Map<String, String> toFix = new TreeMap<>();
365         for (Entry<String, String> entry : toMaximized.entrySet()) {
366             String key = entry.getKey();
367             String value = entry.getValue();
368             if (key.contains(TEMP_UNKNOWN_REGION)) {
369                 toRemove.add(key);
370             } else if (value.contains(TEMP_UNKNOWN_REGION)) {
371                 toFix.put(key, value.replace(TEMP_UNKNOWN_REGION, UNKNOWN_REGION));
372             }
373         }
374         for (String key : toRemove) {
375             toMaximized.remove(key);
376         }
377         toMaximized.putAll(toFix);
378 
379         Map<String, String> oldLikely = SupplementalDataInfo.getInstance().getLikelySubtags();
380         Set<String> changes = compareMapsAndFixNew("*WARNING* Likely Subtags: ", oldLikely, toMaximized, "ms_Arab",
381             "ms_Arab_ID");
382         System.out.println(Joiner.on("\n").join(changes));
383 
384         if (OUTPUT_STYLE == OutputStyle.C_ALT) {
385             doAlt(toMaximized);
386         }
387 
388         if (SHOW_ADD)
389             System.out
390                 .println("/*"
391                     + CldrUtility.LINE_SEPARATOR
392                     + " To Maximize:"
393                     +
394                     CldrUtility.LINE_SEPARATOR
395                     + " If using raw strings, make sure the input language/locale uses the right separator, and has the right casing."
396                     +
397                     CldrUtility.LINE_SEPARATOR
398                     + " Remove the script Zzzz and the region ZZ if they occur; change an empty language subtag to 'und'."
399                     +
400                     CldrUtility.LINE_SEPARATOR
401                     + " Get the language, region, and script from the cleaned-up tag, plus any variants/extensions"
402                     +
403                     CldrUtility.LINE_SEPARATOR
404                     + " Try each of the following in order (where the field exists)"
405                     +
406                     CldrUtility.LINE_SEPARATOR
407                     + "   Lookup language-script-region. If in the table, return the result + variants"
408                     +
409                     CldrUtility.LINE_SEPARATOR
410                     + "   Lookup language-script. If in the table, return the result (substituting the original region if it exists) + variants"
411                     +
412                     CldrUtility.LINE_SEPARATOR
413                     + "   Lookup language-region. If in the table, return the result (substituting the original script if it exists) + variants"
414                     +
415                     CldrUtility.LINE_SEPARATOR
416                     + "   Lookup language. If in the table, return the result (substituting the original region and script if either or both exist) + variants"
417                     +
418                     CldrUtility.LINE_SEPARATOR
419                     +
420                     CldrUtility.LINE_SEPARATOR
421                     + " Example: Input is zh-ZZZZ-SG."
422                     +
423                     CldrUtility.LINE_SEPARATOR
424                     + " Normalize to zh-SG. Lookup in table. No match."
425                     +
426                     CldrUtility.LINE_SEPARATOR
427                     + " Remove SG, but remember it. Lookup zh, and get the match (zh-Hans-CN). Substitute SG, and return zh-Hans-SG."
428                     +
429                     CldrUtility.LINE_SEPARATOR
430                     +
431                     CldrUtility.LINE_SEPARATOR
432                     + " To Minimize:"
433                     +
434                     CldrUtility.LINE_SEPARATOR
435                     + " First get max = maximize(input)."
436                     +
437                     CldrUtility.LINE_SEPARATOR
438                     + " Then for trial in {language, language-region, language-script}"
439                     +
440                     CldrUtility.LINE_SEPARATOR
441                     + "     If maximize(trial) == max, then return trial."
442                     +
443                     CldrUtility.LINE_SEPARATOR
444                     + " If you don't get a match, return max."
445                     +
446                     CldrUtility.LINE_SEPARATOR
447                     +
448                     CldrUtility.LINE_SEPARATOR
449                     + " Example: Input is zh-Hant. Maximize to get zh-Hant-TW."
450                     +
451                     CldrUtility.LINE_SEPARATOR
452                     + " zh => zh-Hans-CN. No match, so continue."
453                     +
454                     CldrUtility.LINE_SEPARATOR
455                     + " zh-TW => zh-Hans-TW. Match, so return zh-TW."
456                     +
457                     CldrUtility.LINE_SEPARATOR
458                     +
459                     CldrUtility.LINE_SEPARATOR
460                     + " (A variant of this uses {language, language-script, language-region}): that is, tries script before language."
461                     +
462                     CldrUtility.LINE_SEPARATOR + " toMaximal size:\t" + toMaximized.size() +
463                     CldrUtility.LINE_SEPARATOR + "*/");
464 
465         printLikelySubtags(toMaximized);
466 
467         // if (OUTPUT_STYLE != OutputStyle.XML) {
468         // printMap("const MapToMinimalSubtags default_subtags[]", toMinimized, null);
469         // }
470 
471         printDefaultContent(toMaximized);
472 
473         System.out.println(CldrUtility.LINE_SEPARATOR + "ERRORS:\t" + errorCount + CldrUtility.LINE_SEPARATOR);
474 
475     }
476 
477     static class RowData implements Comparable<RowData> {
478         OfficialStatus os;
479         String name;
480         Long pop;
481 
RowData(OfficialStatus os, String name, Long pop)482         public RowData(OfficialStatus os, String name, Long pop) {
483             this.os = os;
484             this.name = name;
485             this.pop = pop;
486         }
487 
getStatus()488         public OfficialStatus getStatus() {
489             // TODO Auto-generated method stub
490             return os;
491         }
492 
getName()493         public CharSequence getName() {
494             // TODO Auto-generated method stub
495             return name;
496         }
497 
getLiteratePopulation()498         public Long getLiteratePopulation() {
499             // TODO Auto-generated method stub
500             return pop;
501         }
502 
503         @Override
compareTo(RowData o)504         public int compareTo(RowData o) {
505             // TODO Auto-generated method stub
506             int result = os.compareTo(o.os);
507             if (result != 0) return -result;
508             long result2 = pop - o.pop;
509             if (result2 != 0) return result2 < 0 ? 1 : -1;
510             return name.compareTo(o.name);
511         }
512 
513         @Override
equals(Object o)514         public boolean equals(Object o) {
515             return 0 == compareTo((RowData) o);
516         }
517 
518         @Override
hashCode()519         public int hashCode() {
520             throw new UnsupportedOperationException();
521         }
522     }
523 
printDefaultLanguagesAndScripts()524     private static void printDefaultLanguagesAndScripts() {
525 
526         final int minTotalPopulation = 10000000;
527         final int minTerritoryPopulation = 1000000;
528         final double minTerritoryPercent = 1.0 / 3;
529         Map<String, Set<RowData>> languageToReason = new TreeMap<>();
530         Counter<String> languageToLiteratePopulation = new Counter<>();
531         NumberFormat nf = NumberFormat.getIntegerInstance(ULocale.ENGLISH);
532         nf.setGroupingUsed(true);
533         LanguageTagParser ltp = new LanguageTagParser();
534         LikelySubtags likelySubtags = new LikelySubtags();
535         /*
536          * A. X is a qualified language**, and at least one of the following is true:
537          *
538          * 1. X is has official status* in any country
539          * 2. X exceeds a threshold population† of literate users worldwide: 1M
540          * 3. X exceeds a threshold population† in some country Z: 100K and 20% of Z's population†.
541          *
542          * B. X is an exception explicitly approved by the committee or X has minimal
543          * language coverage‡ in CLDR itself.
544          * C. The language is in the CLDR-target locales
545          */
546         OfficialStatus minimalStatus = OfficialStatus.official_regional; // OfficialStatus.de_facto_official;
547         Map<String, String> languages = new TreeMap<>();
548         for (String language : standardCodes.getAvailableCodes("language")) {
549             String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, language);
550             String result = english.getStringValue(path);
551             if (result != null) {
552                 languages.put(language, result);
553             }
554         }
555         for (String language : languages.keySet()) {
556             System.out.println(language + "\t" + languages.get(language));
557         }
558 
559         // also CLDR-target locales
560         final Set<String> CLDRMainLanguages = new TreeSet<>(StandardCodes.make().getLocaleCoverageLocales(Organization.cldr));
561 
562         for (String territory : supplementalData.getTerritoriesWithPopulationData()) {
563             PopulationData territoryPop = supplementalData.getPopulationDataForTerritory(territory);
564             double territoryPopulation = territoryPop.getLiteratePopulation();
565             for (String languageScript : supplementalData.getLanguagesForTerritoryWithPopulationData(territory)) {
566                 PopulationData popData = supplementalData.getLanguageAndTerritoryPopulationData(languageScript,
567                     territory);
568                 ltp.set(languageScript);
569                 String language = ltp.getLanguage();
570 //                if (ltp.getScript().isEmpty()) {
571 //                    String max = likelySubtags.maximize(languageScript);
572 //                    if (max != null) {
573 //                        ltp.set(max).setRegion("");
574 //                        languageScript = ltp.toString();
575 //                    }
576 //                }
577                 boolean add = false;
578                 // #1
579                 OfficialStatus status = popData.getOfficialStatus();
580                 if (status.compareTo(minimalStatus) >= 0) {
581                     add = true;
582                 }
583                 long literatePopulation = getWritingPopulation(popData);
584                 // #2
585                 languageToLiteratePopulation.add(language, literatePopulation);
586                 // #3
587                 if (literatePopulation > minTerritoryPopulation
588                     && literatePopulation > minTerritoryPercent * territoryPopulation) {
589                     add = true;
590                 }
591                 if (add == false && CLDRMainLanguages.contains(language)) {
592                     add = true;
593                 }
594                 if (add) {
595                     add(languageToReason, language, territory, status, literatePopulation);
596                     // Add the containing regions
597                     for (String container : Containment.leafToContainer(territory)) {
598                         add(languageToReason, language, container, OfficialStatus.unknown, literatePopulation);
599                     }
600                 }
601             }
602         }
603         // #2, now that we have the data
604         for (String language : languageToLiteratePopulation.keySet()) {
605             long totalPop = languageToLiteratePopulation.getCount(language);
606             if (totalPop > minTotalPopulation) {
607                 add(languageToReason, language, "001", OfficialStatus.unknown, totalPop);
608             }
609         }
610 
611         // Specials
612         add(languageToReason, "und", "001", OfficialStatus.unknown, 0);
613 
614         // for (String language : Iso639Data.getAvailable()) {
615         // Scope scope = Iso639Data.getScope(language);
616         // Type type = Iso639Data.getType(language);
617         // if (scope == Scope.Special) {
618         // add(languageToReason, language, "001", OfficialStatus.unknown, -1);
619         // }
620         // }
621         // print them
622 
623         System.out.println("Detailed - Including:\t" + languageToReason.size());
624 
625         for (String language : languageToReason.keySet()) {
626             Set<RowData> reasons = languageToReason.get(language);
627 
628             RowData lastReason = reasons.iterator().next();
629 
630             System.out.append(language)
631                 .append("\t")
632                 .append(english.getName(language))
633                 .append("\t")
634                 .append(lastReason.getStatus().toShortString())
635                 .append("\t")
636                 .append(nf.format(languageToLiteratePopulation.getCount(language)));
637             for (RowData reason : reasons) {
638                 String status = reason.getStatus().toShortString();
639                 System.out.append("\t")
640                     .append(status)
641                     .append("-")
642                     .append(reason.getName())
643                     .append("-")
644                     .append(nf.format(reason.getLiteratePopulation()));
645             }
646             System.out.append("\n");
647         }
648 
649         // now list them
650 
651         Set<String> others = new TreeSet<>();
652         others.addAll(standardCodes.getGoodAvailableCodes("language"));
653         others.removeAll(languageToReason.keySet());
654         System.out.println("\nIncluded Languages:\t" + languageToReason.keySet().size());
655         showLanguages(languageToReason.keySet(), languageToReason);
656         System.out.println("\nExcluded Languages:\t" + others.size());
657         showLanguages(others, languageToReason);
658     }
659 
getWritingPopulation(PopulationData popData)660     private static long getWritingPopulation(PopulationData popData) {
661         final double writingPopulation = popData.getWritingPopulation();
662         if (!Double.isNaN(writingPopulation)) {
663             return (long) writingPopulation;
664         }
665         return (long) popData.getLiteratePopulation();
666     }
667 
showLanguages(Set<String> others, Map<String, Set<RowData>> languageToReason)668     private static void showLanguages(Set<String> others, Map<String, Set<RowData>> languageToReason) {
669         Set<String> sorted = new TreeSet<>(Collator.getInstance(ULocale.ENGLISH));
670         for (String language : others) {
671             sorted.add(getLanguageName(language, languageToReason));
672         }
673         char last = 0;
674         for (String language : sorted) {
675             final char curr = language.charAt(0);
676             if (last != curr) {
677                 System.out.println();
678             } else if (last != '\u0000') {
679                 System.out.print(", ");
680             }
681             System.out.print(language);
682             last = curr;
683         }
684         System.out.println();
685     }
686 
getLanguageName(String language, Map<String, Set<RowData>> languageToReason)687     private static String getLanguageName(String language,
688         Map<String, Set<RowData>> languageToReason) {
689         OfficialStatus best = OfficialStatus.unknown;
690         Set<RowData> reasons = languageToReason.get(language);
691         if (reasons != null) {
692             for (RowData reason : reasons) {
693                 final OfficialStatus currentStatus = reason.getStatus();
694                 if (best.compareTo(currentStatus) < 0) {
695                     best = currentStatus;
696                 }
697             }
698         }
699         String status = best.toShortString();
700         Scope scope = Iso639Data.getScope(language);
701         if (scope == Scope.Special) {
702             status = "S";
703         }
704         String languageFormatted = english.getName(language) + " [" + language + "]-" + status;
705         return languageFormatted;
706     }
707 
add(Map<String, Set<RowData>> languageToReason, String language, String territoryRaw, OfficialStatus status, long population)708     private static void add(Map<String, Set<RowData>> languageToReason, String language,
709         String territoryRaw, OfficialStatus status, long population) {
710         String territory = english.getName("territory", territoryRaw) + " [" + territoryRaw + "]";
711         Set<RowData> set = languageToReason.get(language);
712         if (set == null) {
713             languageToReason.put(language, set = new TreeSet<>());
714         }
715         set.add(new RowData(status, territory, population));
716     }
717 
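    /**
     * In computing the defaultContents, no and nb require special handling:
     * each key is a child locale whose parent is taken from this map instead of being
     * derived by truncation, and every key is always added to the default content.
     */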
721     static final Map<String, String> SPECIAL_CHILD_TO_PARENT = ImmutableMap.of("nb", "no", "nb_NO", "nb");
722 
723     /*
724      * Compute the defaultContent values for supplemental data.
725      * It uses the maximization data and the simpleParent (truncation).
726      * We can't use the normal "getParent" because that messes up the logic
727      * used to handle inconsistencies in scripts in CLDR.<br>
728      * That is, there are three situations: <ul>
729      * <li>all children have explicit scripts; </li>
730      * <li>no children have scripts; and </li>
731      * <li>some do and some don't</li></ul>
732      */
733 
printDefaultContent(Map<String, String> toMaximized)734     private static void printDefaultContent(Map<String, String> toMaximized) throws IOException {
735 
736         Set<String> defaultLocaleContent = new TreeSet<>();
737 
738         // go through all the cldr locales, and add default contents
739         // now computed from toMaximized
740         Set<String> available = factory.getAvailable();
741         Relation<String, String> toSimpleChildren = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
742         LanguageTagParser ltp = new LanguageTagParser();
743 
744         // System.out.println(maximize("az_Latn_AZ", toMaximized));
745         Set<String> hasSimpleChildWithScript = new TreeSet<>();
746 
747         // first get a mapping to children
748         for (String locale : available) {
749             if (locale.equals("root")) {
750                 continue;
751             }
752             if (ltp.set(locale).getVariants().size() != 0) {
753                 continue;
754             }
755             String parent = SPECIAL_CHILD_TO_PARENT.get(locale);
756             if (parent == null) {
757                 parent = LocaleIDParser.getSimpleParent(locale); // we can't use the regular getParent (see above)
758             }
759 
760             if (ltp.getScript().length() != 0) {
761                 hasSimpleChildWithScript.add(parent);
762             }
763             if (parent.equals("root")) {
764                 continue;
765             }
766             toSimpleChildren.put(parent, locale);
767         }
768 
769         // Suppress script for locales for which we only have one locale in common/main. See ticket #7834.
770         Set<String> suppressScriptLocales = new HashSet<>(Arrays.asList(
771             "bm_ML", "en_US", "ha_NG", "iu_CA", "ms_MY", "mn_MN",
772             "byn_ER", "ff_SN", "dyo_SN", "kk_KZ", "ku_TR", "ky_KG", "ml_IN", "so_SO", "sw_TZ", "wo_SN", "yo_NG", "dje_NE",
773             "blt_VN",
774             "hi_IN",
775             "nv_US",
776             "doi_IN"
777             ));
778 
779         // if any have a script, then throw out any that don't have a script (unless they're specifically included.)
780         Set<String> toRemove = new TreeSet<>();
781         for (String locale : hasSimpleChildWithScript) {
782             toRemove.clear();
783             Set<String> children = toSimpleChildren.getAll(locale);
784             for (String child : children) {
785                 if (ltp.set(child).getScript().length() == 0 && !suppressScriptLocales.contains(child)) {
786                     toRemove.add(child);
787                 }
788             }
789             if (toRemove.size() != 0) {
790                 System.out.println("\tRemoving:\t" + locale + "\t" + toRemove + "\tfrom\t" + children);
791                 toSimpleChildren.removeAll(locale, toRemove);
792             }
793         }
794 
795         // we add a child as a default locale if it has the same maximization
796         main: for (String locale : toSimpleChildren.keySet()) {
797             String maximized = maximize(locale, toMaximized);
798             if (maximized == null) {
799                 if (SHOW_ADD) System.out.println("Missing maximized:\t" + locale);
800                 continue;
801             }
802             Set<String> children = toSimpleChildren.getAll(locale);
803             Map<String, String> debugStuff = new TreeMap<>();
804             for (String child : children) {
805                 String maximizedChild = maximize(child, toMaximized);
806                 if (maximized.equals(maximizedChild)) {
807                     defaultLocaleContent.add(child);
808                     continue main;
809                 }
810                 debugStuff.put(child, maximizedChild);
811             }
812             if (SHOW_ADD) System.out.println("Can't find maximized: " + locale + "=" + maximized
813                 + "\tin\t" + debugStuff);
814         }
815 
816         for (String specialChild : SPECIAL_CHILD_TO_PARENT.keySet()) {
817             defaultLocaleContent.add(specialChild);
818         }
819         defaultLocaleContent.remove("und_ZZ"); // und_ZZ isn't ever a real locale. (old sandbox)
820         defaultLocaleContent.remove("mul_ZZ"); // mul_ZZ isn't ever a real locale.
821 
822         showDefaultContentDifferencesAndFix(defaultLocaleContent);
823 
824         Log.setLogNoBOM(CLDRPaths.GEN_DIRECTORY + "/supplemental", "supplementalMetadata.xml");
825         BufferedReader oldFile = FileUtilities.openUTF8Reader(CLDRPaths.SUPPLEMENTAL_DIRECTORY, "supplementalMetadata.xml");
826         CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*<defaultContent locales=\"\\s*"), Log.getLog(), false);
827 
828         String sep = CldrUtility.LINE_SEPARATOR + "\t\t\t";
829         String broken = CldrUtility.breakLines(CldrUtility.join(defaultLocaleContent, " "), sep,
830             PatternCache.get("(\\S)\\S*").matcher(""), 80);
831 
832         Log.println("\t\t<defaultContent locales=\"" + broken + "\"");
833         Log.println("\t\t/>");
834 
835         // Log.println("</supplementalData>");
836         CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*/>\\s*(<!--.*)?"), null, true); // skip to matching >
837         CldrUtility.copyUpTo(oldFile, null, Log.getLog(), true); // copy the rest
838 
839         Log.close();
840         oldFile.close();
841     }
842 
843     // private static void oldAlgorithm(Map<String,String> toMaximized) {
844     // Set<String> defaultContentLocales = supplementalData.getDefaultContentLocales();
845     // LanguageTagParser parser = new LanguageTagParser();
846     // for (String locale : defaultContentLocales) {
847     // String parent = parser.getParent(locale);
848     // toMaximized.put(parent, locale);
849     // if (SHOW_ADD) System.out.println("Adding:\t" + parent + "\t=>\t" + locale + "\t\tDefaultContent");
850     // }
851     //
852     // for (String[] specialCase : SpecialCases) {
853     // toMaximized.put(specialCase[0], specialCase[1]);
854     // if (SHOW_ADD) System.out.println("Adding:\t" + specialCase[0] + "\t=>\t" + specialCase[1] + "\t\tSpecial");
855     // }
856     //
857     // // recurse and close
858     // closeMapping(toMaximized);
859     //
860     // addScript(toMaximized, parser);
861     //
862     // closeMapping(toMaximized);
863     //
864     // addLanguageScript(toMaximized, parser);
865     //
866     // closeMapping(toMaximized);
867     //
868     // addLanguageCountry(toMaximized, parser);
869     //
870     // closeMapping(toMaximized);
871     //
872     // addCountries(toMaximized);
873     // addScript(toMaximized, parser);
874     // closeMapping(toMaximized);
875     // closeUnd(toMaximized);
876     //
877     // addDeprecated(toMaximized);
878     //
879     // closeMapping(toMaximized);
880     //
881     // checkConsistency(toMaximized);
882     // }
883 
884     private static class MaxData {
885         Relation<String, Row.R3<Double, String, String>> languages = Relation.of(new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class);
886         Map<String, Counter<String>> languagesToScripts = new TreeMap<>();
887         Map<String, Counter<String>> languagesToRegions = new TreeMap<>();
888 
889         Relation<String, Row.R3<Double, String, String>> scripts = Relation.of(new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class);
890         Map<String, Counter<String>> scriptsToLanguages = new TreeMap<>();
891         Map<String, Counter<String>> scriptsToRegions = new TreeMap<>();
892 
893         Relation<String, Row.R3<Double, String, String>> regions = Relation.of(new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class);
894         Map<String, Counter<String>> regionsToLanguages = new TreeMap<>();
895         Map<String, Counter<String>> regionsToScripts = new TreeMap<>();
896 
897         Map<String, Counter<Row.R2<String, String>>> containersToLanguage = new TreeMap<>();
898         Relation<String, Row.R4<Double, String, String, String>> containersToLangRegion = Relation.of(
899             new TreeMap<String, Set<Row.R4<Double, String, String, String>>>(), TreeSet.class);
900 
901         Relation<Row.R2<String, String>, Row.R2<Double, String>> languageScripts = Relation.of(
902             new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(),
903             TreeSet.class);
904         Relation<Row.R2<String, String>, Row.R2<Double, String>> scriptRegions = Relation.of(
905             new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(),
906             TreeSet.class);
907         Relation<Row.R2<String, String>, Row.R2<Double, String>> languageRegions = Relation.of(
908             new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(),
909             TreeSet.class);
910 
911         /**
912          * Add population information. "order" is the negative of the population (makes the first be the highest).
913          * @param language
914          * @param script
915          * @param region
916          * @param order
917          */
add(String language, String script, String region, Double order)918         void add(String language, String script, String region, Double order) {
919             if (SHOW_ADD && language.equals("mis")) {
920                 System.out.println(language + "\t" + script + "\t" + region + "\t" + -order);
921             }
922             languages.put(language, Row.of(order, script, region));
923             // addCounter(languagesToScripts, language, script, order);
924             // addCounter(languagesToRegions, language, region, order);
925 
926             scripts.put(script, Row.of(order, language, region));
927             // addCounter(scriptsToLanguages, script, language, order);
928             // addCounter(scriptsToRegions, script, region, order);
929 
930             regions.put(region, Row.of(order, language, script));
931             // addCounter(regionsToLanguages, region, language, order);
932             // addCounter(regionsToScripts, region, script, order);
933 
934             languageScripts.put(Row.of(language, script), Row.of(order, region));
935             scriptRegions.put(Row.of(script, region), Row.of(order, language));
936             languageRegions.put(Row.of(language, region), Row.of(order, script));
937 
938             Set<String> containerSet = Containment.leafToContainer(region);
939             if (containerSet != null) {
940                 for (String container : containerSet) {
941 
942                     containersToLangRegion.put(container, Row.of(order, language, script, region));
943                     Counter<R2<String, String>> data = containersToLanguage.get(container);
944                     if (data == null) {
945                         containersToLanguage.put(container, data = new Counter<>());
946                     }
947                     data.add(Row.of(language, script), (long) (double) order);
948 
949                 }
950             }
951 
952             if (SHOW_ADD) System.out.println("Data:\t" + language + "\t" + script + "\t" + region + "\t" + order);
953         }
954         // private void addCounter(Map<String, Counter<String>> map, String key, String key2, Double count) {
955         // Counter<String> counter = map.get(key);
956         // if (counter == null) {
957         // map.put(key, counter = new Counter<String>());
958         // }
959         // counter.add(key2, count.longValue());
960         // }
961     }
962 
    // Size bars that tryDifferentAlgorithm computes for languages whose official status is unknown.
    // NOTE(review): in tryDifferentAlgorithm the code that would skip languages failing these bars
    // is commented out, so there the first three values currently do not change the result.

    /** Absolute floor for the literate population an unofficial language is compared against. */
    private static final double MIN_UNOFFICIAL_LANGUAGE_SIZE = 10000000;
    /** Fraction of a territory's literate population used as the bar, before the absolute floor is applied. */
    private static final double MIN_UNOFFICIAL_LANGUAGE_PROPORTION = 0.20;
    /** Lower absolute bar that applies when the language_region locale is among the available CLDR locales. */
    private static final double MIN_UNOFFICIAL_CLDR_LANGUAGE_SIZE = 100000;
    /** Factor by which the order of a language with unknown official status is multiplied. */
    private static final double UNOFFICIAL_SCALE_DOWN = 0.2;

    // Formatters used when printing population figures and shares in the debug output.
    private static NumberFormat percent = NumberFormat.getPercentInstance();
    private static NumberFormat number = NumberFormat.getIntegerInstance();
970 
tryDifferentAlgorithm(Map<String, String> toMaximized)971     private static void tryDifferentAlgorithm(Map<String, String> toMaximized) {
972         // we are going to try a different approach.
973         // first gather counts for maximized values
974         // Set<Row.R3<String,String,String>,Double> rowsToCounts = new TreeMap();
975         MaxData maxData = new MaxData();
976         Set<String> cldrLocales = factory.getAvailable();
977         Set<String> otherTerritories = new TreeSet<>(standardCodes.getGoodAvailableCodes("territory"));
978 
979         // process all the information to get the top values for each triple.
980         // each of the combinations of 1 or 2 components gets to be a key.
981         for (String region : supplementalData.getTerritoriesWithPopulationData()) {
982             otherTerritories.remove(region);
983             PopulationData regionData = supplementalData.getPopulationDataForTerritory(region);
984             final double literateTerritoryPopulation = regionData.getLiteratePopulation();
985             // we need any unofficial language to meet a certain absolute size requirement and proportion size
986             // requirement.
987             // so the bar is x percent of the population, reset up to y absolute size.
988             double minimalLiteratePopulation = literateTerritoryPopulation * MIN_UNOFFICIAL_LANGUAGE_PROPORTION;
989             if (minimalLiteratePopulation < MIN_UNOFFICIAL_LANGUAGE_SIZE) {
990                 minimalLiteratePopulation = MIN_UNOFFICIAL_LANGUAGE_SIZE;
991             }
992 
993             for (String writtenLanguage : supplementalData.getLanguagesForTerritoryWithPopulationData(region)) {
994                 PopulationData data = supplementalData.getLanguageAndTerritoryPopulationData(writtenLanguage, region);
995                 final double literatePopulation = getWritingPopulation(data); //data.getLiteratePopulation();
996                 double order = -literatePopulation; // negative so we get the inverse order
997 
998                 if (data.getOfficialStatus() == OfficialStatus.unknown) {
999                     final String locale = writtenLanguage + "_" + region;
1000                     if (literatePopulation >= minimalLiteratePopulation) {
1001                         // ok, skip
1002                     } else if (literatePopulation >= MIN_UNOFFICIAL_CLDR_LANGUAGE_SIZE && cldrLocales.contains(locale)) {
1003                         // ok, skip
1004                     } else {
1005                         // if (SHOW_ADD)
1006                         // System.out.println("Skipping:\t" + writtenLanguage + "\t" + region + "\t"
1007                         // + english.getName(locale)
1008                         // + "\t-- too small:\t" + number.format(literatePopulation));
1009                         // continue;
1010                     }
1011                     order *= UNOFFICIAL_SCALE_DOWN;
1012                     if (SHOW_ADD)
1013                         System.out.println("Retaining\t" + writtenLanguage + "\t" + region + "\t"
1014                             + english.getName(locale)
1015                             + "\t" + number.format(literatePopulation)
1016                             + "\t" + percent.format(literatePopulation / literateTerritoryPopulation)
1017                             + (cldrLocales.contains(locale) ? "\tin-CLDR" : ""));
1018                 }
1019                 String script;
1020                 String language = writtenLanguage;
1021                 final int pos = writtenLanguage.indexOf('_');
1022                 if (pos > 0) {
1023                     language = writtenLanguage.substring(0, pos);
1024                     script = writtenLanguage.substring(pos + 1);
1025                 } else {
1026                     script = getScriptForLocale2(language);
1027                 }
1028                 maxData.add(language, script, region, order);
1029             }
1030         }
1031 
1032         LanguageTagParser additionLtp = new LanguageTagParser();
1033 
1034         for (String addition : MAX_ADDITIONS) {
1035             additionLtp.set(addition);
1036             String lan = additionLtp.getLanguage();
1037             Set<R3<Double, String, String>> key = maxData.languages.get(lan);
1038             if (key == null) {
1039                 maxData.add(lan, additionLtp.getScript(), additionLtp.getRegion(), 1.0);
1040             } else {
1041                 int debug = 0;
1042             }
1043         }
1044 
1045         for (Entry<String, Collection<String>> entry : DeriveScripts.getLanguageToScript().asMap().entrySet()) {
1046             String language = entry.getKey();
1047             final Collection<String> values = entry.getValue();
1048             if (values.size() != 1) {
1049                 continue; // skip, no either way
1050             }
1051             Set<R3<Double, String, String>> old = maxData.languages.get(language);
1052             if (!maxData.languages.containsKey(language)) {
1053                 maxData.add(language, values.iterator().next(), TEMP_UNKNOWN_REGION, 1.0);
1054             }
1055         }
1056 
1057         // add others, with English default
1058         for (String region : otherTerritories) {
1059             if (region.length() == 3) continue; // FIX ONCE WE ADD REGIONS
1060             maxData.add("en", "Latn", region, 1.0);
1061         }
1062 
1063         // get a reverse mapping, so that we can add the aliases
1064 
1065         Map<String, R2<List<String>, String>> languageAliases = SupplementalDataInfo.getInstance().getLocaleAliasInfo()
1066             .get("language");
1067         for (Entry<String, R2<List<String>, String>> str : languageAliases.entrySet()) {
1068             String reason = str.getValue().get1();
1069             if ("overlong".equals(reason) || "bibliographic".equals(reason) || "macrolanguage".equals(reason)) {
1070                 continue;
1071             }
1072             List<String> replacements = str.getValue().get0();
1073             if (replacements == null) {
1074                 continue;
1075             }
1076             String goodLanguage = replacements.get(0);
1077 
1078             String badLanguage = str.getKey();
1079             if (badLanguage.contains("_")) {
1080                 continue;
1081             }
1082             if (deprecatedISONotInLST.contains(badLanguage)) {
1083                 continue;
1084             }
1085             Set<R3<Double, String, String>> goodLanguageData = maxData.languages.getAll(goodLanguage);
1086             if (goodLanguageData == null) {
1087                 continue;
1088             }
1089             R3<Double, String, String> value = goodLanguageData.iterator().next();
1090             final String script = value.get1();
1091             final String region = value.get2();
1092             maxData.add(badLanguage, script, region, 1.0);
1093             System.out.println("Adding aliases: " + badLanguage + ", " + script + ", " + region + ", " + reason);
1094         }
1095 
1096         // now, get the best for each one
1097         for (String language : maxData.languages.keySet()) {
1098             R3<Double, String, String> value = maxData.languages.getAll(language).iterator().next();
1099             final Comparable<String> script = value.get1();
1100             final Comparable<String> region = value.get2();
1101             add(language, language + "_" + script + "_" + region, toMaximized, "L->SR", LocaleOverride.REPLACE_EXISTING,
1102                 SHOW_ADD);
1103         }
1104         for (String language : maxData.languagesToScripts.keySet()) {
1105             String script = maxData.languagesToScripts.get(language).getKeysetSortedByCount(true).iterator().next();
1106             add(language, language + "_" + script, toMaximized, "L->S", LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
1107         }
1108         for (String language : maxData.languagesToRegions.keySet()) {
1109             String region = maxData.languagesToRegions.get(language).getKeysetSortedByCount(true).iterator().next();
1110             add(language, language + "_" + region, toMaximized, "L->R", LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
1111         }
1112 
1113         for (String script : maxData.scripts.keySet()) {
1114             R3<Double, String, String> value = maxData.scripts.getAll(script).iterator().next();
1115             final Comparable<String> language = value.get1();
1116             final Comparable<String> region = value.get2();
1117             add("und_" + script, language + "_" + script + "_" + region, toMaximized, "S->LR",
1118                 LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
1119         }
1120         for (String script : maxData.scriptsToLanguages.keySet()) {
1121             String language = maxData.scriptsToLanguages.get(script).getKeysetSortedByCount(true).iterator().next();
1122             add("und_" + script, language + "_" + script, toMaximized, "S->L", LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
1123         }
1124         for (String script : maxData.scriptsToRegions.keySet()) {
1125             String region = maxData.scriptsToRegions.get(script).getKeysetSortedByCount(true).iterator().next();
1126             add("und_" + script, "und_" + script + "_" + region, toMaximized, "S->R", LocaleOverride.REPLACE_EXISTING,
1127                 SHOW_ADD);
1128         }
1129 
1130         for (String region : maxData.regions.keySet()) {
1131             R3<Double, String, String> value = maxData.regions.getAll(region).iterator().next();
1132             final Comparable<String> language = value.get1();
1133             final Comparable<String> script = value.get2();
1134             add("und_" + region, language + "_" + script + "_" + region, toMaximized, "R->LS",
1135                 LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
1136         }
1137         for (String region : maxData.regionsToLanguages.keySet()) {
1138             String language = maxData.regionsToLanguages.get(region).getKeysetSortedByCount(true).iterator().next();
1139             add("und_" + region, language + "_" + region, toMaximized, "R->L", LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
1140         }
1141         for (String region : maxData.regionsToScripts.keySet()) {
1142             String script = maxData.regionsToScripts.get(region).getKeysetSortedByCount(true).iterator().next();
1143             add("und_" + region, "und_" + script + "_" + region, toMaximized, "R->S", LocaleOverride.REPLACE_EXISTING,
1144                 SHOW_ADD);
1145         }
1146 
1147         for (Entry<String, Counter<R2<String, String>>> containerAndInfo : maxData.containersToLanguage.entrySet()) {
1148             String region = containerAndInfo.getKey();
1149             if (region.equals("001")) {
1150                 continue;
1151             }
1152             Counter<R2<String, String>> data = containerAndInfo.getValue();
1153             Set<R2<String, String>> keysetSortedByCount = data.getKeysetSortedByCount(true);
1154             if (SHOW_CONTAINERS) { // debug
1155                 System.out.println("Container2L:\t" + region + "\t" + shorten(data.getEntrySetSortedByCount(true, null)));
1156                 System.out.println("Container2LR:\t" + region + "\t" + maxData.containersToLangRegion.get(region));
1157             }
1158             R2<String, String> value = keysetSortedByCount.iterator().next(); // will get most negative
1159             final Comparable<String> language = value.get0();
1160             final Comparable<String> script = value.get1();
1161 
1162             // fix special cases like es-419, where a locale exists.
1163             // for those cases, what we add as output is the container. Otherwise the region.
1164             Set<String> skipLanguages = cldrContainerToLanguages.get(region);
1165             if (skipLanguages != null
1166                 && skipLanguages.contains(language)) {
1167                 add("und_" + region, language + "_" + script + "_" + region, toMaximized, "R*->LS",
1168                     LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
1169                 continue;
1170             }
1171 
1172             // we now have the best language and script. Find the best region for that
1173             for (R4<Double, String, String, String> e : maxData.containersToLangRegion.get(region)) {
1174                 final Comparable<String> language2 = e.get1();
1175                 final Comparable<String> script2 = e.get2();
1176                 if (language2.equals(language) && script2.equals(script)) {
1177                     add("und_" + region, language + "_" + script + "_" + e.get3(), toMaximized, "R*->LS",
1178                         LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
1179                     break;
1180                 }
1181             }
1182         }
1183 
1184         for (R2<String, String> languageScript : maxData.languageScripts.keySet()) {
1185             R2<Double, String> value = maxData.languageScripts.getAll(languageScript).iterator().next();
1186             final Comparable<String> language = languageScript.get0();
1187             final Comparable<String> script = languageScript.get1();
1188             final Comparable<String> region = value.get1();
1189             add(language + "_" + script, language + "_" + script + "_" + region, toMaximized, "LS->R",
1190                 LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
1191         }
1192 
1193         for (R2<String, String> scriptRegion : maxData.scriptRegions.keySet()) {
1194             R2<Double, String> value = maxData.scriptRegions.getAll(scriptRegion).iterator().next();
1195             final Comparable<String> script = scriptRegion.get0();
1196             final Comparable<String> region = scriptRegion.get1();
1197             final Comparable<String> language = value.get1();
1198             add("und_" + script + "_" + region, language + "_" + script + "_" + region, toMaximized, "SR->L",
1199                 LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
1200         }
1201 
1202         for (R2<String, String> languageRegion : maxData.languageRegions.keySet()) {
1203             R2<Double, String> value = maxData.languageRegions.getAll(languageRegion).iterator().next();
1204             final Comparable<String> language = languageRegion.get0();
1205             final Comparable<String> region = languageRegion.get1();
1206             final Comparable<String> script = value.get1();
1207             add(language + "_" + region, language + "_" + script + "_" + region, toMaximized, "LR->S",
1208                 LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
1209         }
1210 
1211         // get the script info from metadata as fallback
1212 
1213 
1214         TreeSet<String> sorted = new TreeSet<>(ScriptMetadata.getScripts());
1215         for (String script : sorted) {
1216             Info i = ScriptMetadata.getInfo(script);
1217             String likelyLanguage = i.likelyLanguage;
1218             if (LANGUAGE_CODE_TO_STATUS.get(likelyLanguage) == Status.special) {
1219                 likelyLanguage = "und";
1220             }
1221             String originCountry = i.originCountry;
1222             final String result = likelyLanguage + "_" + script + "_" + originCountry;
1223             add("und_" + script, result, toMaximized, "S->LR•",
1224                 LocaleOverride.KEEP_EXISTING, SHOW_ADD);
1225             add(likelyLanguage, result, toMaximized, "L->SR•",
1226                 LocaleOverride.KEEP_EXISTING, SHOW_ADD);
1227         }
1228 
1229         // add overrides
1230         for (String key : LANGUAGE_OVERRIDES.keySet()) {
1231             add(key, LANGUAGE_OVERRIDES.get(key), toMaximized, "OVERRIDE", LocaleOverride.REPLACE_EXISTING, true);
1232         }
1233 
1234         // Make sure that the mapping is Idempotent. If we have A ==> B, we must never have B ==> C
1235         // We run this check until we get no problems.
1236         Set<List<String>> problems = new HashSet<>();
1237 
1238         while (true) {
1239             problems.clear();
1240             for (Entry<String, String> entry : toMaximized.entrySet()) {
1241                 String source = entry.getKey();
1242                 String target = entry.getValue();
1243                 if (target.contains("_Zzzz") || target.contains("_ZZ")) { // these are special cases
1244                     continue;
1245                 }
1246                 String idempotentCandidate = LikelySubtags.maximize(target, toMaximized);
1247 
1248                 if (idempotentCandidate == null) {
1249                     System.out.println("Can't maximize " + target);
1250                 } else if (!idempotentCandidate.equals(target)) {
1251                     problems.add(ImmutableList.of(source, target, idempotentCandidate));
1252                 }
1253             }
1254             if (problems.isEmpty()) {
1255                 break;
1256             }
1257             for (List<String> row : problems) {
1258                 System.out.println("Idempotence: dropping mapping " + row.get(0) + " to " + row.get(1) + " since the target maps further to " + row.get(2));
1259                 toMaximized.remove(row.get(0));
1260             }
1261         }
1262     }
1263 
shorten(Object data)1264     public static String shorten(Object data) {
1265         String info = data.toString();
1266         if (info.length() > 255) {
1267             info = info.substring(0, 127) + "…";
1268         }
1269         return info;
1270     }
1271 
doAlt(Map<String, String> toMaximized)1272     private static void doAlt(Map<String, String> toMaximized) {
1273         // TODO Auto-generated method stub
1274         Map<String, String> temp = new TreeMap<>();
1275         for (String locale : toMaximized.keySet()) {
1276             String target = toMaximized.get(locale);
1277             temp.put(toAlt(locale, true), toAlt(target, true));
1278         }
1279         toMaximized.clear();
1280         toMaximized.putAll(temp);
1281     }
1282 
maximize(String languageTag, Map<String, String> toMaximized)1283     public static String maximize(String languageTag, Map<String, String> toMaximized) {
1284         LanguageTagParser ltp = new LanguageTagParser();
1285 
1286         // clean up the input by removing Zzzz, ZZ, and changing "" into und.
1287         ltp.set(languageTag);
1288         String language = ltp.getLanguage();
1289         String region = ltp.getRegion();
1290         String script = ltp.getScript();
1291         boolean changed = false;
1292         if (language.equals("")) {
1293             ltp.setLanguage(language = "und");
1294             changed = true;
1295         }
1296         if (region.equals(UNKNOWN_SCRIPT)) {
1297             ltp.setScript(script = "");
1298             changed = true;
1299         }
1300         if (ltp.getRegion().equals(UNKNOWN_REGION)) {
1301             ltp.setRegion(region = "");
1302             changed = true;
1303         }
1304         if (changed) {
1305             languageTag = ltp.toString();
1306         }
1307         // check whole
1308         String result = toMaximized.get(languageTag);
1309         if (result != null) {
1310             return result;
1311         }
1312         // try empty region
1313         if (region.length() != 0) {
1314             result = toMaximized.get(ltp.setRegion("").toString());
1315             if (result != null) {
1316                 return ltp.set(result).setRegion(region).toString();
1317             }
1318             ltp.setRegion(region); // restore
1319         }
1320         // try empty script
1321         if (script.length() != 0) {
1322             result = toMaximized.get(ltp.setScript("").toString());
1323             if (result != null) {
1324                 return ltp.set(result).setScript(script).toString();
1325             }
1326             // try empty script and region
1327             if (region.length() != 0) {
1328                 result = toMaximized.get(ltp.setRegion("").toString());
1329                 if (result != null) {
1330                     return ltp.set(result).setScript(script).setRegion(region).toString();
1331                 }
1332             }
1333         }
1334         if (!language.equals("und") && script.length() != 0 && region.length() != 0) {
1335             return languageTag; // it was ok, and we couldn't do anything with it
1336         }
1337         return null; // couldn't maximize
1338     }
1339 
minimize(String input, Map<String, String> toMaximized, boolean favorRegion)1340     public static String minimize(String input, Map<String, String> toMaximized, boolean favorRegion) {
1341         if (input.equals("nb_Latn_SJ")) {
1342             System.out.print(""); // debug
1343         }
1344         String maximized = maximize(input, toMaximized);
1345         if (maximized == null) {
1346             return null; // failed
1347         }
1348         LanguageTagParser ltp = new LanguageTagParser().set(maximized);
1349         String language = ltp.getLanguage();
1350         String region = ltp.getRegion();
1351         String script = ltp.getScript();
1352         // try building up from shorter to longer, and find the first that matches
1353         // could be more optimized, but for this code we want simplest
1354         String[] trials = { language,
1355             language + TAG_SEPARATOR + (favorRegion ? region : script),
1356             language + TAG_SEPARATOR + (!favorRegion ? region : script) };
1357         for (String trial : trials) {
1358             String newMaximized = maximize(trial, toMaximized);
1359             if (maximized.equals(newMaximized)) {
1360                 return trial;
1361             }
1362         }
1363         return maximized;
1364     }
1365 
1366     // /**
1367     // * Verify that we can map from each language, script, and country to something.
1368     // * @param toMaximized
1369     // */
1370     // private static void checkConsistency(Map<String, String> toMaximized) {
1371     // Map<String,String> needMappings = new TreeMap();
1372     // LanguageTagParser parser = new LanguageTagParser();
1373     // for (String maximized : new TreeSet<String>(toMaximized.values())) {
1374     // parser.set(maximized);
1375     // final String language = parser.getLanguage();
1376     // final String script = parser.getScript();
1377     // final String region = parser.getRegion();
1378     // if (language.length() == 0 || script.length() == 0 || region.length() == 0) {
1379     // failure("   { \"" + maximized + "\", \"" + maximized + "\" },   //     " + english.getName(maximized) +
1380     // "\t\tFailed-Consistency");
1381     // continue;
1382     // }
1383     // addIfNotIn(language, maximized, needMappings, toMaximized, "Consistency");
1384     // addIfNotIn(language + "_" + script, maximized, needMappings, toMaximized, "Consistency");
1385     // addIfNotIn(language + "_" + region, maximized, needMappings, toMaximized, "Consistency");
1386     // addIfNotIn("und_" + script, maximized, needMappings, toMaximized, "Consistency");
1387     // addIfNotIn("und_" + script + "_" + region, maximized, needMappings, toMaximized, "Consistency");
1388     // addIfNotIn("und_" + region, maximized, needMappings, toMaximized, "Consistency");
1389     // }
1390     // toMaximized.putAll(needMappings);
1391     // }
1392 
1393     // private static void failure(String string) {
1394     // System.out.println(string);
1395     // errorCount++;
1396     // }
1397 
1398     // private static void addIfNotIn(String key, String value, Map<String, String> toAdd, Map<String, String>
1399     // otherToCheck, String kind) {
1400     // addIfNotIn(key, value, toAdd, otherToCheck == null ? null : otherToCheck.keySet(), null, kind);
1401     // }
1402 
1403     // private static void addIfNotIn(String key, String value, Map<String, String> toAdd, Set<String> skipKey,
1404     // Set<String> skipValue, String kind) {
1405     // if (!key.equals(value)
1406     // && !toAdd.containsKey(key)
1407     // && (skipKey == null || !skipKey.contains(key))
1408     // && (skipValue == null || !skipValue.contains(value))) {
1409     // add(key, value, toAdd, kind);
1410     // }
1411     // }
1412 
1413     enum LocaleOverride {
1414         KEEP_EXISTING, REPLACE_EXISTING
1415     }
1416 
add(String key, String value, Map<String, String> toAdd, String kind, LocaleOverride override, boolean showAction)1417     private static void add(String key, String value, Map<String, String> toAdd, String kind, LocaleOverride override,
1418         boolean showAction) {
1419         if (SHOW_ADD && key.startsWith("mis")) {
1420             int debug = 1;
1421         }
1422         if (key.equals(DEBUG_ADD_KEY)) {
1423             System.out.println("*debug*");
1424         }
1425         String oldValue = toAdd.get(key);
1426         if (oldValue == null) {
1427             if (showAction) {
1428                 System.out.println("\tAdding:\t\t" + getName(key) + "\t=>\t" + getName(value) + "\t\t\t\t" + kind);
1429             }
1430         } else if (override == LocaleOverride.KEEP_EXISTING || value.equals(oldValue)) {
1431             // if (showAction) {
1432             // System.out.println("Skipping:\t" + key + "\t=>\t" + value + "\t\t\t\t" + kind);
1433             // }
1434             return;
1435         } else {
1436             if (showAction) {
1437                 System.out.println("\tReplacing:\t" + getName(key) + "\t=>\t" + getName(value) + "\t, was\t" + getName(oldValue) + "\t\t" + kind);
1438             }
1439         }
1440         toAdd.put(key, value);
1441     }
1442 
getName(String value)1443     private static String getName(String value) {
1444         return ConvertLanguageData.getLanguageCodeAndName(value);
1445     }
1446 
1447     // private static void addCountries(Map<String, String> toMaximized) {
1448     // Map <String, Map<String, Double>> scriptToLanguageToSize = new TreeMap();
1449     //
1450     // for (String territory : supplementalData.getTerritoriesWithPopulationData()) {
1451     // Set<String> languages = supplementalData.getLanguagesForTerritoryWithPopulationData(territory);
1452     // String biggestOfficial = null;
1453     // double biggest = -1;
1454     // for (String language : languages) {
1455     // PopulationData info = supplementalData.getLanguageAndTerritoryPopulationData(language, territory);
1456     // // add to info about script
1457     //
1458     // String script = getScriptForLocale(language);
1459     // if (script != null) {
1460     // Map<String, Double> languageInfo = scriptToLanguageToSize.get(script);
1461     // if (languageInfo == null) scriptToLanguageToSize.put(script, languageInfo = new TreeMap());
1462     // String baseLanguage = language;
1463     // int pos = baseLanguage.indexOf('_');
1464     // if (pos >= 0) {
1465     // baseLanguage = baseLanguage.substring(0,pos);
1466     // }
1467     // Double size = languageInfo.get(baseLanguage);
1468     // languageInfo.put(baseLanguage, (size == null ? 0 : size) + info.getLiteratePopulation());
1469     // }
1470     //
1471     //
1472     // final OfficialStatus officialStatus = info.getOfficialStatus();
1473     // if (officialStatus == OfficialStatus.de_facto_official || officialStatus == OfficialStatus.official) {
1474     // double size2 = info.getLiteratePopulation();
1475     // if (biggest < size2) {
1476     // biggest = size2;
1477     // biggestOfficial = language;
1478     // }
1479     // }
1480     // }
1481     // if (biggestOfficial != null) {
1482     // final String replacementTag = "und_" + territory;
1483     // String maximized = biggestOfficial + "_" + territory;
1484     // toMaximized.put(replacementTag, maximized);
1485     // if (SHOW_ADD) System.out.println("Adding:\t" + replacementTag + "\t=>\t" + maximized + "\t\tLanguage-Territory");
1486     // }
1487     // }
1488     //
1489     // for (String script : scriptToLanguageToSize.keySet()) {
1490     // String biggestOfficial = null;
1491     // double biggest = -1;
1492     //
1493     // final Map<String, Double> languageToSize = scriptToLanguageToSize.get(script);
1494     // for (String language : languageToSize.keySet()) {
1495     // double size = languageToSize.get(language);
1496     // if (biggest < size) {
1497     // biggest = size;
1498     // biggestOfficial = language;
1499     // }
1500     // }
1501     // if (biggestOfficial != null) {
1502     // final String replacementTag = "und_" + script;
1503     // String maximized = biggestOfficial + "_" + script;
1504     // toMaximized.put(replacementTag, maximized);
1505     // if (SHOW_ADD) System.out.println("Adding:\t" + replacementTag + "\t=>\t" + maximized + "\t\tUnd-Script");
1506     // }
1507     // }
1508     // }
1509 
1510     // private static void closeUnd(Map<String, String> toMaximized) {
1511     // Map<String,String> toAdd = new TreeMap<String,String>();
1512     // for (String oldSource : toMaximized.keySet()) {
1513     // String maximized = toMaximized.get(oldSource);
1514     // if (!maximized.startsWith("und")) {
1515     // int pos = maximized.indexOf("_");
1516     // if (pos >= 0) {
1517     // addIfNotIn( "und" + maximized.substring(pos), maximized, toAdd, toMaximized, "CloseUnd");
1518     // }
1519     // }
1520     // }
1521     // toMaximized.putAll(toAdd);
1522     // }
1523 
1524     /**
1525      * Generate tags where the deprecated values map to the expanded values
1526      *
1527      * @param toMaximized
1528      */
1529     // private static void addDeprecated(Map<String, String> toMaximized) {
1530     // Map<String, Map<String, List<String>>> typeToTagToReplacement = supplementalData.getLocaleAliasInfo();
1531     // LanguageTagParser temp = new LanguageTagParser();
1532     // LanguageTagParser tagParsed = new LanguageTagParser();
1533     // LanguageTagParser replacementParsed = new LanguageTagParser();
1534     // Map<String,String> toAdd = new TreeMap<String,String>();
1535     // while (true) {
1536     // toAdd.clear();
1537     // for (String type : typeToTagToReplacement.keySet()) {
1538     // if (type.equals("variant") || type.equals("zone")) continue;
1539     // boolean addUnd = !type.equals("language");
1540     //
1541     // Map<String, List<String>> tagToReplacement = typeToTagToReplacement.get(type);
1542     // System.out.println("*" + type + " = " + tagToReplacement);
1543     //
1544     // for (String tag: tagToReplacement.keySet()) {
1545     //
1546     // final List<String> list = tagToReplacement.get(tag);
1547     // if (list == null) continue; // we don't have any information
1548     // String replacement = list.get(0);
1549     //
1550     // // only do multiples
1551     // if (tag.contains("_") || !replacement.contains("_")) {
1552     // continue;
1553     // }
1554     //
1555     // // we now have a tag and a replacement value
1556     // // make parsers that we can use
1557     // try {
1558     // tagParsed.set(addUnd ? "und-" + tag : tag);
1559     // replacementParsed.set(addUnd ? "und-" + replacement : replacement);
1560     // } catch (RuntimeException e) {
1561     // continue;
1562     // }
1563     // addIfNotIn(tag, replacement, toAdd, toMaximized,"Deprecated");
1564     //
1565     // for (String locale : toMaximized.keySet()) {
1566     // String maximized = toMaximized.get(locale);
1567     // addIfMatches(temp.set(locale), maximized, replacementParsed, tagParsed, toAdd, toMaximized);
1568     // addIfMatches(temp.set(maximized), maximized, replacementParsed, tagParsed, toAdd, toMaximized);
1569     // }
1570     // }
1571     // }
1572     // if (toAdd.size() == 0) {
1573     // break;
1574     // }
1575     // toMaximized.putAll(toAdd);
1576     // }
1577     // }
1578 
1579     // private static void addIfMatches(LanguageTagParser locale, String maximized, LanguageTagParser tagParsed,
1580     // LanguageTagParser replacementParsed, Map<String, String> toAdd, Map<String, String> toMaximized) {
1581     // if (!tagParsed.getLanguage().equals(locale.getLanguage()) && !tagParsed.getLanguage().equals("und")) {
1582     // return;
1583     // }
1584     // if (!tagParsed.getScript().equals(locale.getScript()) && !tagParsed.getScript().equals("")) {
1585     // return;
1586     // }
1587     // if (!tagParsed.getRegion().equals(locale.getRegion()) && !tagParsed.getRegion().equals("")) {
1588     // return;
1589     // }
1590     // if (!replacementParsed.getLanguage().equals("und")) {
1591     // locale.setLanguage(replacementParsed.getLanguage());
1592     // }
1593     // if (!replacementParsed.getScript().equals("")) {
1594     // locale.setScript(replacementParsed.getScript());
1595     // }
1596     // if (!replacementParsed.getRegion().equals("")) {
1597     // locale.setRegion(replacementParsed.getRegion());
1598     // }
1599     // addIfNotIn(locale.toString(), maximized, toAdd, toMaximized,"Deprecated");
1600     // }
1601 
1602     // private static int getSubtagPosition(String locale, String subtags) {
1603     // int pos = -1;
1604     // while (true) {
1605     // pos = locale.indexOf(subtags, pos + 1);
1606     // if (pos < 0) return -1;
1607     // // make sure boundaries are ok
1608     // if (pos != 0) {
1609     // char charBefore = locale.charAt(pos-1);
1610     // if (charBefore != '_' && charBefore != '_') return -1;
1611     // }
1612     // int limit = pos + subtags.length();
1613     // if (limit != locale.length()) {
1614     // char charAfter = locale.charAt(limit);
1615     // if (charAfter != '_' && charAfter != '_') return -1;
1616     // }
1617     // return pos;
1618     // }
1619     // }
1620 
1621     /*
1622      * Format
1623      * const DefaultSubtags default_subtags[] = {
1624      * {
1625      * // Afar => Afar (Latin, Ethiopia)
1626      * "aa",
1627      * "aa_Latn_ET"
1628      * },{
1629      * // Afrikaans => Afrikaans (Latin, South Africa)
1630      * "af",
1631      * "af_Latn_ZA"
1632      * },{
1633      */
1634 
printLikelySubtags(Map<String, String> fluffup)1635     private static void printLikelySubtags(Map<String, String> fluffup) throws IOException {
1636 
1637         PrintWriter out = FileUtilities.openUTF8Writer(CLDRPaths.GEN_DIRECTORY,
1638             "/supplemental/likelySubtags" + (OUTPUT_STYLE == OutputStyle.XML ? ".xml" : ".txt"));
1639         String spacing = OUTPUT_STYLE == OutputStyle.PLAINTEXT ? "\t" : " ";
1640         String header = OUTPUT_STYLE != OutputStyle.XML ? "const MapToMaximalSubtags default_subtags[] = {"
1641             : "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + CldrUtility.LINE_SEPARATOR
1642                 + "<!DOCTYPE supplementalData SYSTEM \"../../common/dtd/ldmlSupplemental.dtd\">"
1643                 + CldrUtility.LINE_SEPARATOR
1644                 + "<!--"
1645                 + CldrUtility.LINE_SEPARATOR
1646                 + CldrUtility.getCopyrightString()
1647                 + CldrUtility.LINE_SEPARATOR
1648                 + "-->"
1649                 + CldrUtility.LINE_SEPARATOR
1650                 + "<!--"
1651                 + CldrUtility.LINE_SEPARATOR
1652                 + "Likely subtags data is generated programatically from CLDR's language/territory/population" + CldrUtility.LINE_SEPARATOR
1653                 + "data using the GenerateMaximalLocales tool. Under normal circumstances, this file should" + CldrUtility.LINE_SEPARATOR
1654                 + "not be patched by hand, as any changes made in that fashion may be lost."
1655                 + CldrUtility.LINE_SEPARATOR
1656                 + "-->"
1657                 + CldrUtility.LINE_SEPARATOR
1658                 + "<supplementalData>" + CldrUtility.LINE_SEPARATOR
1659                 + "    <version number=\"$" +
1660                 "Revision$\"/>" + CldrUtility.LINE_SEPARATOR
1661                 + "    <likelySubtags>";
1662         String footer = OUTPUT_STYLE != OutputStyle.XML ? SEPARATOR + "};"
1663             : "    </likelySubtags>" + CldrUtility.LINE_SEPARATOR
1664                 + "</supplementalData>";
1665         out.println(header);
1666         boolean first = true;
1667         Set<String> keys = new TreeSet<>(new LocaleStringComparator());
1668         keys.addAll(fluffup.keySet());
1669         for (String printingLocale : keys) {
1670             String printingTarget = fluffup.get(printingLocale);
1671             String comment = printingName(printingLocale, spacing) + spacing + "=>" + spacing
1672                 + printingName(printingTarget, spacing);
1673 
1674             if (OUTPUT_STYLE == OutputStyle.XML) {
1675                 out.println("\t\t<likelySubtag from=\"" + printingLocale +
1676                     "\" to=\"" + printingTarget + "\"" +
1677                     "/>" + CldrUtility.LINE_SEPARATOR + "\t\t" + "<!--" + comment + "-->");
1678             } else {
1679                 if (first) {
1680                     first = false;
1681                 } else {
1682                     out.print(",");
1683                 }
1684                 if (comment.length() > 70 && SEPARATOR.equals(CldrUtility.LINE_SEPARATOR)) {
1685                     comment = printingName(printingLocale, spacing) + SEPARATOR + "    // " + spacing + "=>" + spacing
1686                         + printingName(printingTarget, spacing);
1687                 }
1688                 out.print(
1689                     "  {"
1690                         + SEPARATOR + "    // " + comment
1691                         + SEPARATOR + "    \"" + printingLocale + "\","
1692                         + SEPARATOR + "    \"" + printingTarget + "\""
1693                         + CldrUtility.LINE_SEPARATOR + "  }");
1694             }
1695         }
1696         out.println(footer);
1697         out.close();
1698     }
1699 
printingName(String locale, String spacing)1700     public static String printingName(String locale, String spacing) {
1701         if (locale == null) {
1702             return null;
1703         }
1704         LanguageTagParser parser = new LanguageTagParser().set(locale);
1705         String lang = parser.getLanguage();
1706         String script = parser.getScript();
1707         String region = parser.getRegion();
1708         return "{" + spacing +
1709             (lang.equals("und") ? "?" : english.getName(CLDRFile.LANGUAGE_NAME, lang)) + ";" + spacing +
1710             (script == null || script.equals("") ? "?" : english.getName(CLDRFile.SCRIPT_NAME, script)) + ";" + spacing
1711             +
1712             (region == null || region.equals("") ? "?" : english.getName(CLDRFile.TERRITORY_NAME, region)) + spacing
1713             + "}";
1714     }
1715 
1716     private static final String[][] ALT_REVERSAL = {
1717         //{ "no", "nb" },
1718         //{ "nb", "no" },
1719         { "he", "iw" },
1720         { "iw", "he" },
1721     };
1722 
toAlt(String locale, boolean change)1723     public static String toAlt(String locale, boolean change) {
1724         if (!change || locale == null) {
1725             return locale;
1726         }
1727         String firstTag = getFirstTag(locale);
1728         for (String[] pair : ALT_REVERSAL) {
1729             if (firstTag.equals(pair[0])) {
1730                 locale = pair[1] + locale.substring(pair[1].length());
1731                 break;
1732             }
1733         }
1734         locale = locale.replace("_", "-");
1735         return locale;
1736     }
1737 
getFirstTag(String locale)1738     private static String getFirstTag(String locale) {
1739         int pos = locale.indexOf('_');
1740         return pos < 0 ? locale : locale.substring(0, pos);
1741     }
1742 
1743     // private static Map<String, String> getBackMapping(Map<String, String> fluffup) {
1744     // Relation<String,String> backMap = new Relation(new TreeMap(), TreeSet.class, BEST_LANGUAGE_COMPARATOR);
1745     // for (String source : fluffup.keySet()) {
1746     // if (source.startsWith("und")) {
1747     // continue;
1748     // }
1749     // String maximized = fluffup.get(source);
1750     // backMap.put(maximized, source); // put in right order
1751     // }
1752     // Map<String,String> returnBackMap = new TreeMap();
1753     // for (String maximized : backMap.keySet()) {
1754     // final Set<String> all = backMap.getAll(maximized);
1755     // final String minimized = all.iterator().next();
1756     // returnBackMap.put(maximized, minimized);
1757     // }
1758     // return returnBackMap;
1759     // }
1760 
1761     /**
1762      * Language tags are presumed to share the first language, except possibly "und". Best is least
1763      */
1764     // private static Comparator BEST_LANGUAGE_COMPARATOR = new Comparator<String>() {
1765     // LanguageTagParser p1 = new LanguageTagParser();
1766     // LanguageTagParser p2 = new LanguageTagParser();
1767     // public int compare(String o1, String o2) {
1768     // if (o1.equals(o2)) return 0;
1769     // p1.set(o1);
1770     // p2.set(o2);
1771     // String lang1 = p1.getLanguage();
1772     // String lang2 = p2.getLanguage();
1773     //
1774     // // compare languages first
1775     // // put und at the end
1776     // int result = lang1.compareTo(lang2);
1777     // if (result != 0) {
1778     // if (lang1.equals("und")) return 1;
1779     // if (lang2.equals("und")) return -1;
1780     // return result;
1781     // }
1782     //
1783     // // now scripts and regions.
1784     // // if they have different numbers of fields, the shorter wins.
1785     // // If there are two fields, region is lowest.
1786     // // The simplest way is to just compare scripts first
1787     // // so zh-TW < zh-Hant, because we first compare "" to Hant
1788     // String script1 = p1.getScript();
1789     // String script2 = p2.getScript();
1790     // int scriptOrder = script1.compareTo(script2);
1791     // if (scriptOrder != 0) return scriptOrder;
1792     //
1793     // String region1 = p1.getRegion();
1794     // String region2 = p2.getRegion();
1795     // int regionOrder = region1.compareTo(region2);
1796     // if (regionOrder != 0) return regionOrder;
1797     //
1798     // return o1.compareTo(o2);
1799     // }
1800     //
1801     // };
1802 
minimize(Map<String, String> fluffup)1803     public static void minimize(Map<String, String> fluffup) {
1804         LanguageTagParser parser = new LanguageTagParser();
1805         LanguageTagParser targetParser = new LanguageTagParser();
1806         Set<String> removals = new TreeSet<>();
1807         while (true) {
1808             removals.clear();
1809             for (String locale : fluffup.keySet()) {
1810                 String target = fluffup.get(locale);
1811                 if (targetParser.set(target).getRegion().equals(UNKNOWN_REGION)) {
1812                     removals.add(locale);
1813                     if (SHOW_ADD)
1814                         System.out.println("Removing:\t" + getName(locale) + "\t=>\t" + getName(target)
1815                             + "\t\t - Unknown Region in target");
1816                     continue;
1817                 }
1818                 if (targetParser.getScript().equals(UNKNOWN_SCRIPT)) {
1819                     removals.add(locale);
1820                     if (SHOW_ADD)
1821                         System.out.println("Removing:\t" + getName(locale) + "\t=>\t" + getName(target)
1822                             + "\t\t - Unknown Script in target");
1823                     continue;
1824                 }
1825 
1826                 String region = parser.set(locale).getRegion();
1827                 if (region.length() != 0) {
1828                     if (region.equals(UNKNOWN_REGION)) {
1829                         removals.add(locale);
1830                         if (SHOW_ADD)
1831                             System.out.println("Removing:\t" + getName(locale) + "\t=>\t" + getName(target)
1832                                 + "\t\t - Unknown Region in source");
1833                         continue;
1834                     }
1835                     parser.setRegion("");
1836                     String newLocale = parser.toString();
1837                     String newTarget = fluffup.get(newLocale);
1838                     if (newTarget != null) {
1839                         newTarget = targetParser.set(newTarget).setRegion(region).toString();
1840                         if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) {
1841                             removals.add(locale);
1842                             if (SHOW_ADD)
1843                                 System.out.println("Removing:\t" + locale + "\t=>\t" + target + "\t\tRedundant with "
1844                                     + newLocale);
1845                             continue;
1846                         }
1847                     }
1848                 }
1849                 String script = parser.set(locale).getScript();
1850                 if (locale.equals(DEBUG_ADD_KEY)) {
1851                     System.out.println("*debug*");
1852                 }
1853                 if (script.length() != 0) {
1854                     if (script.equals(UNKNOWN_SCRIPT)) {
1855                         removals.add(locale);
1856                         if (SHOW_ADD)
1857                             System.out.println("Removing:\t" + locale + "\t=>\t" + target + "\t\t - Unknown Script");
1858                         continue;
1859                     }
1860                     parser.setScript("");
1861                     String newLocale = parser.toString();
1862                     String newTarget = fluffup.get(newLocale);
1863                     if (newTarget != null) {
1864                         newTarget = targetParser.set(newTarget).setScript(script).toString();
1865                         if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) {
1866                             removals.add(locale);
1867                             if (SHOW_ADD)
1868                                 System.out.println("Removing:\t" + locale + "\t=>\t" + target + "\t\tRedundant with "
1869                                     + newLocale);
1870                             continue;
1871                         }
1872                     }
1873                 }
1874             }
1875             if (removals.size() == 0) {
1876                 break;
1877             }
1878             for (String locale : removals) {
1879                 fluffup.remove(locale);
1880             }
1881         }
1882     }
1883 
1884     // private static void addLanguageScript(Map<String, String> fluffup, LanguageTagParser parser) {
1885     // // add script
1886     // Map<String, String> temp = new TreeMap<String, String>();
1887     // while (true) {
1888     // temp.clear();
1889     // for (String target : new TreeSet<String>(fluffup.values())) {
1890     // parser.set(target);
1891     // final String territory = parser.getRegion();
1892     // if (territory.length() == 0) {
1893     // continue;
1894     // }
1895     // parser.setRegion("");
1896     // String possibleSource = parser.toString();
1897     // if (fluffup.containsKey(possibleSource)) {
1898     // continue;
1899     // }
1900     // String other = temp.get(possibleSource);
1901     // if (other != null) {
1902     // if (!target.equals(other)) {
1903     // System.out.println("**Failure with multiple sources in addLanguageScript: "
1904     // + possibleSource + "\t=>\t" + target + ", " + other);
1905     // }
1906     // continue;
1907     // }
1908     // temp.put(possibleSource, target);
1909     // if (SHOW_ADD) System.out.println("Adding:\t" + possibleSource + "\t=>\t" + target + "\t\tLanguage-Script");
1910     // }
1911     // if (temp.size() == 0) {
1912     // break;
1913     // }
1914     // fluffup.putAll(temp);
1915     // }
1916     //
1917     // }
1918 
1919     // private static void addLanguageCountry(Map<String, String> fluffup, LanguageTagParser parser) {
1920     // // add script
1921     // Map<String, String> temp = new TreeMap<String, String>();
1922     // while (true) {
1923     // temp.clear();
1924     // for (String target : new TreeSet<String>(fluffup.values())) {
1925     // parser.set(target);
1926     // String script = parser.getScript();
1927     // if (script.length() == 0) {
1928     // continue;
1929     // }
1930     // parser.setScript("");
1931     // String possibleSource = parser.toString();
1932     // if (fluffup.containsKey(possibleSource)) {
1933     // continue;
1934     // }
1935     // String other = temp.get(possibleSource);
1936     //
1937     // if (other != null) {
1938     // if (!target.equals(other)) {
1939     // script = getScriptForLocale(possibleSource);
1940     // if (script == null) {
1941     // System.out.println("**Failure with multiple sources in addLanguageCountry: "
1942     // + possibleSource + "\t=>\t" + target + ", " + other);
1943     // continue; // error message in routine
1944     // }
1945     // parser.setScript(script);
1946     // target = parser.toString();
1947     // }
1948     // }
1949     //
1950     // temp.put(possibleSource, target);
1951     // if (SHOW_ADD) System.out.println("Adding:\t" + possibleSource + "\t=>\t" + target + "\t\tLanguageCountry");
1952     // }
1953     // if (temp.size() == 0) {
1954     // break;
1955     // }
1956     // fluffup.putAll(temp);
1957     // }
1958     //
1959     // }
1960 
1961     // private static void addScript(Map<String, String> fluffup, LanguageTagParser parser) {
1962     // // add script
1963     // Map<String, String> temp = new TreeMap<String, String>();
1964     // while (true) {
1965     // temp.clear();
1966     // Set skipTarget = fluffup.keySet();
1967     // for (String locale : fluffup.keySet()) {
1968     // String target = fluffup.get(locale);
1969     // parser.set(target);
1970     // if (parser.getScript().length() != 0) {
1971     // continue;
1972     // }
1973     // String script = getScriptForLocale(target);
1974     //
1975     // if (script == null) {
1976     // continue; // error message in routine
1977     // }
1978     // parser.setScript(script);
1979     // String furtherTarget = parser.toString();
1980     // addIfNotIn(target, furtherTarget, temp, fluffup, "Script");
1981     // }
1982     // if (temp.size() == 0) {
1983     // break;
1984     // }
1985     // fluffup.putAll(temp);
1986     // }
1987     // }
1988 
1989     // private static String getScriptForLocale(String locale) {
1990     // String result = getScriptForLocale2(locale);
1991     // if (result != null) return result;
1992     // int pos = locale.indexOf('_');
1993     // if (pos >= 0) {
1994     // result = getScriptForLocale2(locale.substring(0,pos));
1995     // }
1996     // return result;
1997     // }
1998 
1999     private static String UNKNOWN_SCRIPT = "Zzzz";
2000     private static String UNKNOWN_REGION = "ZZ";
2001 
getScriptForLocale2(String locale)2002     private static String getScriptForLocale2(String locale) {
2003         String result = localeToScriptCache.get(locale);
2004         if (result != null) {
2005             return result;
2006         }
2007         if (locale.equals("ky")) {
2008             int debug = 0;
2009         }
2010         try {
2011             Map<Type, BasicLanguageData> data = supplementalData.getBasicLanguageDataMap(locale);
2012             if (data != null) {
2013                 for (BasicLanguageData datum : data.values()) {
2014                     final Set<String> scripts = datum.getScripts();
2015                     boolean isPrimary = datum.getType() == BasicLanguageData.Type.primary;
2016                     if (scripts.size() != 1) {
2017                         if (scripts.size() > 1 && isPrimary) {
2018                             break;
2019                         }
2020                         continue;
2021                     }
2022                     String script = scripts.iterator().next();
2023                     if (isPrimary) {
2024                         return result = script;
2025                     } else if (result == null) {
2026                         result = script;
2027                     }
2028                 }
2029                 if (result != null) {
2030                     return result;
2031                 }
2032             }
2033             CLDRFile cldrFile;
2034             try {
2035                 cldrFile = factory.make(locale, true);
2036             } catch (RuntimeException e) {
2037                 result = FALLBACK_SCRIPTS.get(locale);
2038                 if (result == null) {
2039                     System.out.println("***Failed to find script for: " + locale + "\t" + english.getName(locale));
2040                     return result = UNKNOWN_SCRIPT;
2041                 } else {
2042                     return result;
2043                 }
2044             }
2045             UnicodeSet exemplars = getExemplarSet(cldrFile, "");
2046             Set<String> CLDRScripts = getScriptsFromUnicodeSet(exemplars);
2047             CLDRScripts.remove(UNKNOWN_SCRIPT);
2048             if (CLDRScripts.size() == 1) {
2049                 return result = CLDRScripts.iterator().next();
2050             } else if (CLDRScripts.size() == 0) {
2051                 System.out.println("**Failed to get script for:\t" + locale);
2052                 return result = UNKNOWN_SCRIPT;
2053             } else {
2054                 System.out.println("**Failed, too many scripts for:\t" + locale + ", " + CLDRScripts);
2055                 return result = UNKNOWN_SCRIPT;
2056             }
2057         } finally {
2058             if (result.equals(UNKNOWN_SCRIPT)) {
2059                 String temp = LANGUAGE_OVERRIDES.get(locale);
2060                 if (temp != null) {
2061                     result = new LanguageTagParser().set(temp).getScript();
2062                     System.out.println("Getting script from LANGUAGE_OVERRIDES for " + locale + " => " + result);
2063                 }
2064             }
2065             localeToScriptCache.put(locale, result);
2066             if (SHOW_ADD)
2067                 System.out.println("Script:\t" + locale + "\t" + english.getName(locale) + "\t=>\t" + result + "\t"
2068                     + english.getName(CLDRFile.SCRIPT_NAME, result));
2069         }
2070     }
2071 
2072     // private static Map<String, String> closeMapping(Map<String, String> fluffup) {
2073     // if (SHOW_ADD) System.out.flush();
2074     // Map<String,String> temp = new TreeMap<String,String>();
2075     // while (true) {
2076     // temp.clear();
2077     // for (String locale : fluffup.keySet()) {
2078     // String target = fluffup.get(locale);
2079     // if (target.equals("si_Sinh") || target.equals("zh-Hani")) {
2080     // System.out.println("????");
2081     // }
2082     // String furtherTarget = fluffup.get(target);
2083     // if (furtherTarget == null) {
2084     // continue;
2085     // }
2086     // addIfNotIn(locale, furtherTarget, temp, null, "Close");
2087     // }
2088     // if (temp.size() == 0) {
2089     // break;
2090     // }
2091     // fluffup.putAll(temp);
2092     // }
2093     // if (SHOW_ADD) System.out.flush();
2094     // return temp;
2095     // }
2096 
getScriptsFromUnicodeSet(UnicodeSet exemplars)2097     public static Set<String> getScriptsFromUnicodeSet(UnicodeSet exemplars) {
2098         // use bits first, since that's faster
2099         BitSet scriptBits = new BitSet();
2100         boolean show = false;
2101         for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next();) {
2102             if (show)
2103                 System.out.println(Integer.toHexString(it.codepoint));
2104             if (it.codepoint != UnicodeSetIterator.IS_STRING) {
2105                 scriptBits.set(UScript.getScript(it.codepoint));
2106             } else {
2107                 int cp;
2108                 for (int i = 0; i < it.string.length(); i += UTF16.getCharCount(cp)) {
2109                     scriptBits.set(UScript.getScript(cp = UTF16.charAt(it.string, i)));
2110                 }
2111             }
2112         }
2113         scriptBits.clear(UScript.COMMON);
2114         scriptBits.clear(UScript.INHERITED);
2115         Set<String> scripts = new TreeSet<>();
2116         for (int j = 0; j < scriptBits.size(); ++j) {
2117             if (scriptBits.get(j)) {
2118                 scripts.add(UScript.getShortName(j));
2119             }
2120         }
2121         return scripts;
2122     }
2123 
getExemplarSet(CLDRFile cldrfile, String type)2124     public static UnicodeSet getExemplarSet(CLDRFile cldrfile, String type) {
2125         if (type.length() != 0)
2126             type = "[@type=\"" + type + "\"]";
2127         String v = cldrfile.getStringValue("//ldml/characters/exemplarCharacters"
2128             + type);
2129         if (v == null)
2130             return new UnicodeSet();
2131         return new UnicodeSet(v);
2132     }
2133 
2134     // private static String[][] SpecialCases = {
2135     // { "zh_Hani", "zh_Hans_CN"},
2136     // { "si_Sinh", "si_Sinh_LK"},
2137     // { "ii", "ii_CN"}, // Sichuan Yi (Yi)
2138     // { "iu", "iu_CA"}, // Inuktitut (Unified Canadian Aboriginal Syllabics)
2139     // { "und", "en"}, // English default
2140     // };
2141 
showDefaultContentDifferencesAndFix(Set<String> defaultLocaleContent)2142     static void showDefaultContentDifferencesAndFix(Set<String> defaultLocaleContent) {
2143         Set<String> errors = new LinkedHashSet<>();
2144         Map<String, String> oldDefaultContent = SupplementalDataInfo.makeLocaleToDefaultContents(
2145             ConvertLanguageData.supplementalData.getDefaultContentLocales(), new TreeMap<String, String>(), errors);
2146         if (!errors.isEmpty()) {
2147             System.out.println(Joiner.on("\n").join(errors));
2148             errors.clear();
2149         }
2150         Map<String, String> newDefaultContent = SupplementalDataInfo.makeLocaleToDefaultContents(defaultLocaleContent,
2151             new TreeMap<String, String>(), errors);
2152         if (!errors.isEmpty()) {
2153             System.out.println("Default Content errors: " + Joiner.on("\n").join(errors));
2154             errors.clear();
2155         }
2156         Set<String> changes = compareMapsAndFixNew("*WARNING* Default Content: ", oldDefaultContent, newDefaultContent,
2157             "ar", "ar_001");
2158         System.out.println(Joiner.on("\n").join(changes));
2159         defaultLocaleContent.clear();
2160         defaultLocaleContent.addAll(newDefaultContent.values());
2161         newDefaultContent = SupplementalDataInfo.makeLocaleToDefaultContents(defaultLocaleContent,
2162             new TreeMap<String, String>(), errors);
2163         if (!errors.isEmpty()) {
2164             System.out.println("***New Errors: " + Joiner.on("\n").join(errors));
2165         }
2166     }
2167 
compareMapsAndFixNew(String title, Map<String, String> oldContent, Map<String, String> newContent, String... allowedOverrideValues)2168     private static Set<String> compareMapsAndFixNew(String title,
2169         Map<String, String> oldContent,
2170         Map<String, String> newContent, String... allowedOverrideValues) {
2171         Map<String, String> allowedOverrideValuesTest = new HashMap<>();
2172         for (int i = 0; i < allowedOverrideValues.length; i += 2) {
2173             allowedOverrideValuesTest.put(allowedOverrideValues[i], allowedOverrideValues[i + 1]);
2174         }
2175         Set<String> changes = new TreeSet<>();
2176         for (String parent : Builder.with(new TreeSet<String>()).addAll(newContent.keySet())
2177             .addAll(oldContent.keySet()).get()) {
2178             String oldValue = oldContent.get(parent);
2179             String newValue = newContent.get(parent);
2180             String overrideValue = allowedOverrideValuesTest.get(parent);
2181             if (overrideValue != null) {
2182                 newContent.put(parent, overrideValue);
2183                 newValue = overrideValue;
2184             }
2185             if (CldrUtility.equals(oldValue, newValue)) {
2186                 continue;
2187             }
2188             String message;
2189             if (oldValue == null) {
2190                 message = "Adding " + ConvertLanguageData.getLanguageCodeAndName(parent) + " => "
2191                     + ConvertLanguageData.getLanguageCodeAndName(newValue);
2192                 newContent.put(parent, newValue);
2193             } else if (newValue == null) {
2194                 if (SUPPRESS_CHANGES) {
2195                     message = "Suppressing removal of "
2196                         + ConvertLanguageData.getLanguageCodeAndName(parent) + " => "
2197                         + ConvertLanguageData.getLanguageCodeAndName(oldValue);
2198                     newContent.put(parent, oldValue);
2199                 } else {
2200                     message = "Removing "
2201                         + ConvertLanguageData.getLanguageCodeAndName(parent) + " => "
2202                         + ConvertLanguageData.getLanguageCodeAndName(oldValue);
2203                     newContent.remove(oldValue);
2204                 }
2205             } else {
2206                 if (SUPPRESS_CHANGES) {
2207                     message = "Suppressing change of "
2208                         + ConvertLanguageData.getLanguageCodeAndName(parent) + " => "
2209                         + ConvertLanguageData.getLanguageCodeAndName(oldValue) + " to "
2210                         + ConvertLanguageData.getLanguageCodeAndName(newValue);
2211                     newContent.remove(newValue);
2212                     newContent.put(parent, oldValue);
2213                 } else {
2214                     message = "Changing "
2215                         + ConvertLanguageData.getLanguageCodeAndName(parent) + " => "
2216                         + ConvertLanguageData.getLanguageCodeAndName(oldValue) + " to "
2217                         + ConvertLanguageData.getLanguageCodeAndName(newValue);
2218                     newContent.remove(oldValue);
2219                     newContent.put(parent, newValue);
2220                 }
2221             }
2222             changes.add(title + message);
2223         }
2224         return changes;
2225     }
2226 
2227     public static class LocaleStringComparator implements Comparator<String> {
2228         LanguageTagParser ltp0 = new LanguageTagParser();
2229         LanguageTagParser ltp1 = new LanguageTagParser();
2230 
2231         @Override
compare(String arg0, String arg1)2232         public int compare(String arg0, String arg1) {
2233             ltp0.set(arg0);
2234             ltp1.set(arg1);
2235             String s0 = ltp0.getLanguage();
2236             String s1 = ltp1.getLanguage();
2237             int result = s0.compareTo(s1);
2238             if (result != 0) {
2239                 return s0.equals("und") ? 1
2240                     : s1.equals("und") ? -1
2241                         : result;
2242             }
2243             s0 = ltp0.getScript();
2244             s1 = ltp1.getScript();
2245             result = s0.compareTo(s1);
2246             if (result != 0) {
2247                 return result;
2248             }
2249             s0 = ltp0.getRegion();
2250             s1 = ltp1.getRegion();
2251             result = s0.compareTo(s1);
2252             if (result != 0) {
2253                 return result;
2254             }
2255             return arg0.compareTo(arg1); // just in case
2256         }
2257 
2258     }
2259 }
2260