• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.tool;
2 
3 import java.io.BufferedReader;
4 import java.io.File;
5 import java.io.IOException;
6 import java.io.PrintWriter;
7 import java.nio.file.Files;
8 import java.util.Arrays;
9 import java.util.BitSet;
10 import java.util.Collection;
11 import java.util.Comparator;
12 import java.util.HashMap;
13 import java.util.HashSet;
14 import java.util.LinkedHashSet;
15 import java.util.List;
16 import java.util.Map;
17 import java.util.Map.Entry;
18 import java.util.Set;
19 import java.util.TreeMap;
20 import java.util.TreeSet;
21 
22 import org.unicode.cldr.draft.FileUtilities;
23 import org.unicode.cldr.draft.ScriptMetadata;
24 import org.unicode.cldr.draft.ScriptMetadata.Info;
25 import org.unicode.cldr.util.Builder;
26 import org.unicode.cldr.util.CLDRConfig;
27 import org.unicode.cldr.util.CLDRFile;
28 import org.unicode.cldr.util.CLDRLocale;
29 import org.unicode.cldr.util.CLDRPaths;
30 import org.unicode.cldr.util.CldrUtility;
31 import org.unicode.cldr.util.Containment;
32 import org.unicode.cldr.util.Counter;
33 import org.unicode.cldr.util.Factory;
34 import org.unicode.cldr.util.Iso639Data;
35 import org.unicode.cldr.util.Iso639Data.Scope;
36 import org.unicode.cldr.util.LanguageTagParser;
37 import org.unicode.cldr.util.LocaleIDParser;
38 import org.unicode.cldr.util.Organization;
39 import org.unicode.cldr.util.PatternCache;
40 import org.unicode.cldr.util.SimpleFactory;
41 import org.unicode.cldr.util.StandardCodes;
42 import org.unicode.cldr.util.StandardCodes.LstrType;
43 import org.unicode.cldr.util.SupplementalDataInfo;
44 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData;
45 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData.Type;
46 import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus;
47 import org.unicode.cldr.util.SupplementalDataInfo.PopulationData;
48 import org.unicode.cldr.util.Validity;
49 import org.unicode.cldr.util.Validity.Status;
50 
51 import com.google.common.base.Joiner;
52 import com.google.common.collect.ImmutableList;
53 import com.google.common.collect.ImmutableMap;
54 import com.google.common.collect.ImmutableSet;
55 import com.ibm.icu.impl.Relation;
56 import com.ibm.icu.impl.Row;
57 import com.ibm.icu.impl.Row.R2;
58 import com.ibm.icu.impl.Row.R3;
59 import com.ibm.icu.impl.Row.R4;
60 import com.ibm.icu.lang.UScript;
61 import com.ibm.icu.text.Collator;
62 import com.ibm.icu.text.NumberFormat;
63 import com.ibm.icu.text.UTF16;
64 import com.ibm.icu.text.UnicodeSet;
65 import com.ibm.icu.text.UnicodeSetIterator;
66 import com.ibm.icu.util.ULocale;
67 
68 /**
69  * Problems:
70  * "und_Hani", "zh_Hani"
71  * "und_Sinh", "si_Sinh"
72  *
73  * @author markdavis
74  *
75  */
76 public class GenerateMaximalLocales {
77 
78     private static final Map<String, Status> LANGUAGE_CODE_TO_STATUS = Validity.getInstance().getCodeToStatus(LstrType.language); // validity status keyed by language code
79 
    // Placeholder region code. main() drops result keys that contain it and rewrites it to
    // UNKNOWN_REGION inside result values, to get around minimize() removing items with ZZ.
80     private static final String TEMP_UNKNOWN_REGION = "XZ";
81 
    // Key singled out for debugging; its use is outside this part of the file.
82     private static final String DEBUG_ADD_KEY = "und_Latn_ZA";
83 
    // Read from the "GenerateMaximalLocalesDebug" property (default false). When true, main()
    // prints a description of the maximize/minimize algorithm.
84     private static final boolean SHOW_ADD = CldrUtility.getProperty("GenerateMaximalLocalesDebug", false);
    // Read from the "GenerateMaximalLocalesSuppress" property (default false).
85     private static final boolean SUPPRESS_CHANGES = CldrUtility.getProperty("GenerateMaximalLocalesSuppress", false);
86     private static final boolean SHOW_CONTAINERS = false;
87 
    // The next three flags select how much printDefaultLanguagesAndScripts() writes to System.out.
88     private static final boolean SHOW_ALL_LANGUAGE_CODES = false;
89     private static final boolean SHOW_DETAILED = false;
90     private static final boolean SHOW_INCLUDED_EXCLUDED = false;
    /** Output formats selectable through the "OutputStyle" property. */
91     enum OutputStyle {
92         PLAINTEXT, C, C_ALT, XML
93     }
94 
    // Parsed from the "OutputStyle" property after upper-casing; "XML" is supplied for both
    // fallback arguments of getProperty. An unrecognized value makes valueOf throw during
    // class initialization.
95     private static OutputStyle OUTPUT_STYLE = OutputStyle.valueOf(CldrUtility.getProperty("OutputStyle", "XML", "XML")
96         .toUpperCase());
97 
98     // set based on above
    // Line separator for the two C styles, tab otherwise.
99     private static final String SEPARATOR = OUTPUT_STYLE == OutputStyle.C || OUTPUT_STYLE == OutputStyle.C_ALT ? CldrUtility.LINE_SEPARATOR
100         : "\t";
    // "-" for C_ALT, "_" for every other style.
101     private static final String TAG_SEPARATOR = OUTPUT_STYLE == OutputStyle.C_ALT ? "-" : "_";
102     // private static final boolean FAVOR_REGION = true; // OUTPUT_STYLE == OutputStyle.C_ALT;
103 
104     private static final boolean tryDifferent = true;
105 
106     private static final File list[] = { // source directories handed to SimpleFactory below
107         new File(CLDRPaths.MAIN_DIRECTORY),
108         new File(CLDRPaths.SEED_DIRECTORY),
109         new File(CLDRPaths.EXEMPLARS_DIRECTORY) };
110 
    // Factory over the main, seed and exemplars directories, matching every file name (".*").
111     private static Factory factory = SimpleFactory.make(list, ".*");
    // Factory supplied by CLDRConfig.
112     private static Factory mainFactory = CLDRConfig.getInstance().getCldrFactory();
113     private static SupplementalDataInfo supplementalData = SupplementalDataInfo
114         .getInstance(CLDRPaths.SUPPLEMENTAL_DIRECTORY);
115     private static StandardCodes standardCodes = StandardCodes.make();
    // English data file; used in this class for language and territory display names.
116     private static CLDRFile english = factory.make("en", false);
    // Non-leaf (container) region -> languages of the available CLDR locales that use that region.
117     static Relation<String, String> cldrContainerToLanguages = Relation.of(new HashMap<String, Set<String>>(), HashSet.class);
118     static {
119         for (CLDRLocale locale : ToolConfig.getToolInstance().getCldrFactory().getAvailableCLDRLocales()) {
120             String region = locale.getCountry();
            // Skip locales without a region and locales whose region is a leaf territory.
121             if (region == null || region.isEmpty() || Containment.isLeaf(region)) {
122                 continue;
123             }
cldrContainerToLanguages.put(region, locale.getLanguage())124             cldrContainerToLanguages.put(region, locale.getLanguage());
125         }
        // Freeze so that later code cannot modify the relation.
cldrContainerToLanguages.freeze()126         cldrContainerToLanguages.freeze();
127         System.out.println("Keep containers " + cldrContainerToLanguages);
128     }
129 
130     private static final List<String> KEEP_TARGETS = Arrays.asList( // NOTE(review): presumably tags that must survive minimization; the use is outside this part of the file - confirm
131         "und_Arab_PK",
132         "und_Latn_ET",
133         "hi_Latn"
134     );
    // Per the constant's name: deprecated ISO codes that are not in the language subtag
    // registry (LST). The use is outside this part of the file.
135     private static final ImmutableSet<String> deprecatedISONotInLST = ImmutableSet.of("scc", "scr");
136 
137     /**
     * This is the simplest way to override, by supplying the max value.
     * It gets a very low weight, so doesn't override any stronger value.
     * <p>
     * Entry order matters: FALLBACK_SCRIPTS is built by walking this array and storing the
     * script under the entry's language, so the last entry for a language wins.
     * NOTE(review): "rhg_Arab_MM" is listed twice; the trailing copy is what makes Arab
     * (not Rohg) the fallback script for "rhg". Confirm the intent before removing it as a
     * duplicate.
     */
141     private static final String[] MAX_ADDITIONS = new String[] {
142         "bss_Latn_CM",
143         "gez_Ethi_ET",
144         "ken_Latn_CM",
145         "und_Arab_PK",
146         "wa_Latn_BE",
147 
148         "fub_Arab_CM",
149         "fuf_Latn_GN",
150         "kby_Arab_NE",
151         "kdh_Latn_TG",
152         "apd_Arab_TG",
153         "zlm_Latn_TG",
154 
155         "cr_Cans_CA",
156         "hif_Latn_FJ",
157         "gon_Telu_IN",
158         "lzz_Latn_TR",
159         "lif_Deva_NP",
160         "unx_Beng_IN",
161         "unr_Beng_IN",
162         "ttt_Latn_AZ",
163         "pnt_Grek_GR",
164         "tly_Latn_AZ",
165         "tkr_Latn_AZ",
166         "bsq_Bass_LR",
167         "ccp_Cakm_BD",
168         "blt_Tavt_VN",
169         "rhg_Arab_MM",
170         "rhg_Rohg_MM",
171         "clc_Latn_CA",
172         "crg_Latn_CA",
173         "hur_Latn_CA",
174         "kwk_Latn_CA",
175         "lil_Latn_CA",
176         "ojs_Cans_CA",
177         "oka_Latn_CA",
178         "pqm_Latn_CA",
179 
180         "hi_Latn_IN",
181         "no_Latn_NO",
182         "und_Cpmn_CY",
183 
184         "hnj_Hmnp_US",
185         "rhg_Arab_MM" // repeated on purpose or by accident: see the note in the Javadoc above this array
186     };
187 
188     /**
     * The following overrides MASH the final values, so they may not result in consistent results. Safer is to add to MAX_ADDITIONS.
     * However, if you add, add both the language and language+script mappings.
     * <p>
     * Each row is {source tag, maximized tag}; the rows are turned into a map by
     * CldrUtility.asMap.
     */
192     // Many of the overrides below can be removed once the language/pop/country data is updated.
193     private static final Map<String, String> LANGUAGE_OVERRIDES = CldrUtility.asMap(new String[][] {
194         { "cic", "cic_Latn_US" },
195         { "cic_Latn", "cic_Latn_US" },
196         { "eo", "eo_Latn_001" },
197         { "eo_Latn", "eo_Latn_001" },
198         { "es", "es_Latn_ES" },
199         { "es_Latn", "es_Latn_ES" },
200         { "ff_BF", "ff_Latn_BF" },
201         { "ff_GM", "ff_Latn_GM" },
202         { "ff_GH", "ff_Latn_GH" },
203         { "ff_GW", "ff_Latn_GW" },
204         { "ff_LR", "ff_Latn_LR" },
205         { "ff_NE", "ff_Latn_NE" },
206         { "ff_NG", "ff_Latn_NG" },
207         { "ff_SL", "ff_Latn_SL" },
208         { "ff_Adlm", "ff_Adlm_GN" },
209         { "ia", "ia_Latn_001" },
210         { "ia_Latn", "ia_Latn_001" },
211         { "io", "io_Latn_001" },
212         { "io_Latn", "io_Latn_001" },
213         { "jbo", "jbo_Latn_001" },
214         { "jbo_Latn", "jbo_Latn_001" },
215         { "ku_Arab", "ku_Arab_IQ" },
216         { "lrc", "lrc_Arab_IR" },
217         { "lrc_Arab", "lrc_Arab_IR" },
218         { "man", "man_Latn_GM" },
219         { "man_Latn", "man_Latn_GM" },
220         { "mas", "mas_Latn_KE" },
221         { "mas_Latn", "mas_Latn_KE" },
222         { "mn", "mn_Cyrl_MN" },
223         { "mn_Cyrl", "mn_Cyrl_MN" },
224         { "mro", "mro_Mroo_BD" },
225         { "mro_BD", "mro_Mroo_BD" },
226         { "ms_Arab", "ms_Arab_MY" },
227         { "pap", "pap_Latn_AW" },
228         { "pap_Latn", "pap_Latn_AW" },
229         { "prg", "prg_Latn_001" },
230         { "prg_Latn", "prg_Latn_001" },
231         { "rif", "rif_Tfng_MA" },
232         { "rif_Latn", "rif_Latn_MA" },
233         { "rif_Tfng", "rif_Tfng_MA" },
234         { "rif_MA", "rif_Tfng_MA" },
235         { "shi", "shi_Tfng_MA" },
236         { "shi_Tfng", "shi_Tfng_MA" },
237         { "shi_MA", "shi_Tfng_MA" },
238         { "sr_Latn", "sr_Latn_RS" },
239         { "ss", "ss_Latn_ZA" },
240         { "ss_Latn", "ss_Latn_ZA" },
241         { "swc", "swc_Latn_CD" },
242         { "ti", "ti_Ethi_ET" },
243         { "ti_Ethi", "ti_Ethi_ET" },
244         { "und", "en_Latn_US" },
245         { "und_Adlm", "ff_Adlm_GN" },
246         { "und_Adlm_GN", "ff_Adlm_GN" },
247         { "und_Arab", "ar_Arab_EG" },
248         { "und_Arab_PK", "ur_Arab_PK" },
249         { "und_Bopo", "zh_Bopo_TW" },
250         { "und_Deva_FJ", "hif_Deva_FJ" },
251         { "und_EZ", "de_Latn_EZ" },
252         { "und_Hani", "zh_Hani_CN" },
253         { "und_Hani_CN", "zh_Hani_CN" },
254         { "und_Kana", "ja_Kana_JP" },
255         { "und_Kana_JP", "ja_Kana_JP" },
256         { "und_Latn", "en_Latn_US" },
257         { "und_Latn_ET", "en_Latn_ET" },
258         { "und_Latn_NE", "ha_Latn_NE" },
259         { "und_Latn_PH", "fil_Latn_PH" },
260         { "und_ML", "bm_Latn_ML" },
261         { "und_Latn_ML", "bm_Latn_ML" },
262         { "und_MU", "mfe_Latn_MU" },
263         { "und_NE", "ha_Latn_NE" },
264         { "und_PH", "fil_Latn_PH" },
265         { "und_PK", "ur_Arab_PK" },
266         { "und_SO", "so_Latn_SO" },
267         { "und_SS", "en_Latn_SS" },
268         { "und_TK", "tkl_Latn_TK" },
269         { "und_UN", "en_Latn_UN" },
270         { "und_005", "pt_Latn_BR" },
271         { "vo", "vo_Latn_001" },
272         { "vo_Latn", "vo_Latn_001" },
273         { "yi", "yi_Hebr_001" },
274         { "yi_Hebr", "yi_Hebr_001" },
275         { "yue", "yue_Hant_HK" },
276         { "yue_Hant", "yue_Hant_HK" },
277         { "yue_Hans", "yue_Hans_CN" },
278         { "yue_CN", "yue_Hans_CN" },
279         { "zh_Hani", "zh_Hani_CN" },
280 
281         { "zh_Bopo", "zh_Bopo_TW" },
282         { "ccp", "ccp_Cakm_BD" },
283         { "ccp_Cakm", "ccp_Cakm_BD" },
284         { "und_Cakm", "ccp_Cakm_BD" },
285         { "cu_Glag", "cu_Glag_BG" },
286         { "sd_Khoj", "sd_Khoj_IN" },
287         { "lif_Limb", "lif_Limb_IN" },
288         { "grc_Linb", "grc_Linb_GR" },
289         { "arc_Nbat", "arc_Nbat_JO" },
290         { "arc_Palm", "arc_Palm_SY" },
291         { "pal_Phlp", "pal_Phlp_CN" },
292         { "en_Shaw", "en_Shaw_GB" },
293         { "sd_Sind", "sd_Sind_IN" },
294         { "und_Brai", "fr_Brai_FR" }, // hack
295         { "und_Hanb", "zh_Hanb_TW" }, // Special script code
296         { "zh_Hanb", "zh_Hanb_TW" }, // Special script code
297         { "und_Jamo", "ko_Jamo_KR" }, // Special script code
298 
299         //{"und_Cyrl_PL", "be_Cyrl_PL"},
300 
        // The disabled rows below name languages that also appear in MAX_ADDITIONS.
301 //        {"cr", "cr_Cans_CA"},
302 //        {"hif", "hif_Latn_FJ"},
303 //        {"gon", "gon_Telu_IN"},
304 //        {"lzz", "lzz_Latn_TR"},
305 //        {"lif", "lif_Deva_NP"},
306 //        {"unx", "unx_Beng_IN"},
307 //        {"unr", "unr_Beng_IN"},
308 //        {"ttt", "ttt_Latn_AZ"},
309 //        {"pnt", "pnt_Grek_GR"},
310 //        {"tly", "tly_Latn_AZ"},
311 //        {"tkr", "tkr_Latn_AZ"},
312 //        {"bsq", "bsq_Bass_LR"},
313 //        {"ccp", "ccp_Cakm_BD"},
314 //        {"blt", "blt_Tavt_VN"},
315 //        { "mis_Medf", "mis_Medf_NG" },
316 
317         { "ku_Yezi", "ku_Yezi_GE" },
318         { "und_EU", "en_Latn_IE" },
319     });
320 
321     /**
     * The following supplements the suppress-script. It overrides info from exemplars and the locale info.
     * <p>
     * Each row is {locale, script}. The rows are copied into localeToScriptCache after the
     * registry Suppress-Script values, so a row here replaces a registry value for the same key.
     */
324     private static String[][] SpecialScripts = {
325         { "zh", "Hans" }, // Hans (not Hani)
326         { "yue", "Hant" }, // Hant (not Hani)
327         { "chk", "Latn" }, // Chuukese (Micronesia)
328         { "fil", "Latn" }, // Filipino (Philippines)
329         { "ko", "Kore" }, // Korean (North Korea)
330         { "ko_KR", "Kore" }, // Korean (South Korea)
331         { "pap", "Latn" }, // Papiamento (Netherlands Antilles)
332         { "pau", "Latn" }, // Palauan (Palau)
333         { "su", "Latn" }, // Sundanese (Indonesia)
334         { "tet", "Latn" }, // Tetum (East Timor)
335         { "tk", "Latn" }, // Turkmen (Turkmenistan)
336         { "ty", "Latn" }, // Tahitian (French Polynesia)
337         { "ja", "Jpan" }, // Special script for japan
338         { "und", "Latn" }, // Ultimate fallback
339     };
340 
341     private static Map<String, String> localeToScriptCache = new TreeMap<>(); // locale or language -> script, filled by the initializer below
342     static {
        // First pass: every registry language that declares a Suppress-Script.
343         for (String language : standardCodes.getAvailableCodes("language")) {
344             Map<String, String> info = standardCodes.getLangData("language", language);
345             String script = info.get("Suppress-Script");
346             if (script != null) {
localeToScriptCache.put(language, script)347                 localeToScriptCache.put(language, script);
348             }
349         }
        // Second pass: SpecialScripts rows are stored last, so they replace registry values
        // for the same key.
350         for (String[] pair : SpecialScripts) {
localeToScriptCache.put(pair[0], pair[1])351             localeToScriptCache.put(pair[0], pair[1]);
352         }
353     }
354 
355     private static Map<String, String> FALLBACK_SCRIPTS; // language -> script, derived from MAX_ADDITIONS below
356     static {
357         LanguageTagParser additionLtp = new LanguageTagParser();
358         Map<String, String> _FALLBACK_SCRIPTS = new TreeMap<>();
        // put() replaces earlier values, so when MAX_ADDITIONS lists a language more than once
        // the script of its last entry is the one kept.
359         for (String addition : MAX_ADDITIONS) {
360             additionLtp.set(addition);
361             String lan = additionLtp.getLanguage();
_FALLBACK_SCRIPTS.put(lan, additionLtp.getScript())362             _FALLBACK_SCRIPTS.put(lan, additionLtp.getScript());
363         }
        // Publish an immutable copy.
364         FALLBACK_SCRIPTS = ImmutableMap.copyOf(_FALLBACK_SCRIPTS);
365     }
366 
    // Error tally; main() prints it and exits with status 1 when it is greater than zero.
367     private static int errorCount;
368 
main(String[] args)369     public static void main(String[] args) throws IOException {
370 
371         printDefaultLanguagesAndScripts();
372 
373         Map<String, String> toMaximized = new TreeMap<>();
374 
375         tryDifferentAlgorithm(toMaximized);
376 
377         minimize(toMaximized);
378 
379         // HACK TEMP_UNKNOWN_REGION
380         // this is to get around the removal of items with ZZ in minimize.
381         // probably cleaner way to do it, but this provides control over just those we want to retain.
382         Set<String> toRemove = new TreeSet<>();
383         Map<String, String> toFix = new TreeMap<>();
384         for (Entry<String, String> entry : toMaximized.entrySet()) {
385             String key = entry.getKey();
386             String value = entry.getValue();
387             if (key.contains(TEMP_UNKNOWN_REGION)) {
388                 toRemove.add(key);
389             } else if (value.contains(TEMP_UNKNOWN_REGION)) {
390                 toFix.put(key, value.replace(TEMP_UNKNOWN_REGION, UNKNOWN_REGION));
391             }
392         }
393         for (String key : toRemove) {
394             toMaximized.remove(key);
395         }
396         toMaximized.putAll(toFix);
397 
398         Map<String, String> oldLikely = SupplementalDataInfo.getInstance().getLikelySubtags();
399         Set<String> changes = compareMapsAndFixNew("*WARNING* Likely Subtags: ", oldLikely, toMaximized, "ms_Arab",
400             "ms_Arab_ID");
401         System.out.println(Joiner.on("\n").join(changes));
402 
403         if (OUTPUT_STYLE == OutputStyle.C_ALT) {
404             doAlt(toMaximized);
405         }
406 
407         if (SHOW_ADD)
408             System.out
409                 .println("/*"
410                     + CldrUtility.LINE_SEPARATOR
411                     + " To Maximize:"
412                     +
413                     CldrUtility.LINE_SEPARATOR
414                     + " If using raw strings, make sure the input language/locale uses the right separator, and has the right casing."
415                     +
416                     CldrUtility.LINE_SEPARATOR
417                     + " Remove the script Zzzz and the region ZZ if they occur; change an empty language subtag to 'und'."
418                     +
419                     CldrUtility.LINE_SEPARATOR
420                     + " Get the language, region, and script from the cleaned-up tag, plus any variants/extensions"
421                     +
422                     CldrUtility.LINE_SEPARATOR
423                     + " Try each of the following in order (where the field exists)"
424                     +
425                     CldrUtility.LINE_SEPARATOR
426                     + "   Lookup language-script-region. If in the table, return the result + variants"
427                     +
428                     CldrUtility.LINE_SEPARATOR
429                     + "   Lookup language-script. If in the table, return the result (substituting the original region if it exists) + variants"
430                     +
431                     CldrUtility.LINE_SEPARATOR
432                     + "   Lookup language-region. If in the table, return the result (substituting the original script if it exists) + variants"
433                     +
434                     CldrUtility.LINE_SEPARATOR
435                     + "   Lookup language. If in the table, return the result (substituting the original region and script if either or both exist) + variants"
436                     +
437                     CldrUtility.LINE_SEPARATOR
438                     +
439                     CldrUtility.LINE_SEPARATOR
440                     + " Example: Input is zh-ZZZZ-SG."
441                     +
442                     CldrUtility.LINE_SEPARATOR
443                     + " Normalize to zh-SG. Lookup in table. No match."
444                     +
445                     CldrUtility.LINE_SEPARATOR
446                     + " Remove SG, but remember it. Lookup zh, and get the match (zh-Hans-CN). Substitute SG, and return zh-Hans-SG."
447                     +
448                     CldrUtility.LINE_SEPARATOR
449                     +
450                     CldrUtility.LINE_SEPARATOR
451                     + " To Minimize:"
452                     +
453                     CldrUtility.LINE_SEPARATOR
454                     + " First get max = maximize(input)."
455                     +
456                     CldrUtility.LINE_SEPARATOR
457                     + " Then for trial in {language, language-region, language-script}"
458                     +
459                     CldrUtility.LINE_SEPARATOR
460                     + "     If maximize(trial) == max, then return trial."
461                     +
462                     CldrUtility.LINE_SEPARATOR
463                     + " If you don't get a match, return max."
464                     +
465                     CldrUtility.LINE_SEPARATOR
466                     +
467                     CldrUtility.LINE_SEPARATOR
468                     + " Example: Input is zh-Hant. Maximize to get zh-Hant-TW."
469                     +
470                     CldrUtility.LINE_SEPARATOR
471                     + " zh => zh-Hans-CN. No match, so continue."
472                     +
473                     CldrUtility.LINE_SEPARATOR
474                     + " zh-TW => zh-Hans-TW. Match, so return zh-TW."
475                     +
476                     CldrUtility.LINE_SEPARATOR
477                     +
478                     CldrUtility.LINE_SEPARATOR
479                     + " (A variant of this uses {language, language-script, language-region}): that is, tries script before language."
480                     +
481                     CldrUtility.LINE_SEPARATOR + " toMaximal size:\t" + toMaximized.size() +
482                     CldrUtility.LINE_SEPARATOR + "*/");
483 
484         final File newLikelySubtags = printLikelySubtags(toMaximized);
485 
486         printDefaultContent(toMaximized);
487 
488         // Do this here so the two "Copying…" messages show up together.
489         if (OUTPUT_STYLE == OutputStyle.XML) {
490             final File oldLikelySubtags = CLDRConfig.getInstance().getEnglish().getSupplementalFile("likelySubtags.xml");
491             System.out.println("Copying " + newLikelySubtags + " to " + oldLikelySubtags);
492             oldLikelySubtags.delete();
493             Files.copy(newLikelySubtags.toPath(), oldLikelySubtags.toPath());
494         }
495 
496         System.out.println(CldrUtility.LINE_SEPARATOR + "ERRORS:\t" + errorCount + CldrUtility.LINE_SEPARATOR);
497 
498         System.exit(errorCount > 0 ? 1 : 0);
499     }
500 
501     static class RowData implements Comparable<RowData> {
502         OfficialStatus os;
503         String name;
504         Long pop;
505 
RowData(OfficialStatus os, String name, Long pop)506         public RowData(OfficialStatus os, String name, Long pop) {
507             this.os = os;
508             this.name = name;
509             this.pop = pop;
510         }
511 
getStatus()512         public OfficialStatus getStatus() {
513             // TODO Auto-generated method stub
514             return os;
515         }
516 
getName()517         public CharSequence getName() {
518             // TODO Auto-generated method stub
519             return name;
520         }
521 
getLiteratePopulation()522         public Long getLiteratePopulation() {
523             // TODO Auto-generated method stub
524             return pop;
525         }
526 
527         @Override
compareTo(RowData o)528         public int compareTo(RowData o) {
529             // TODO Auto-generated method stub
530             int result = os.compareTo(o.os);
531             if (result != 0) return -result;
532             long result2 = pop - o.pop;
533             if (result2 != 0) return result2 < 0 ? 1 : -1;
534             return name.compareTo(o.name);
535         }
536 
537         @Override
equals(Object o)538         public boolean equals(Object o) {
539             return 0 == compareTo((RowData) o);
540         }
541 
542         @Override
hashCode()543         public int hashCode() {
544             throw new UnsupportedOperationException();
545         }
546     }
547 
printDefaultLanguagesAndScripts()548     private static void printDefaultLanguagesAndScripts() {
549 
550         final int minTotalPopulation = 10000000;
551         final int minTerritoryPopulation = 1000000;
552         final double minTerritoryPercent = 1.0 / 3;
553         Map<String, Set<RowData>> languageToReason = new TreeMap<>();
554         Counter<String> languageToLiteratePopulation = new Counter<>();
555         NumberFormat nf = NumberFormat.getIntegerInstance(ULocale.ENGLISH);
556         nf.setGroupingUsed(true);
557         LanguageTagParser ltp = new LanguageTagParser();
558         LikelySubtags likelySubtags = new LikelySubtags();
559         /*
560          * A. X is a qualified language**, and at least one of the following is true:
561          *
562          * 1. X is has official status* in any country
563          * 2. X exceeds a threshold population† of literate users worldwide: 1M
564          * 3. X exceeds a threshold population† in some country Z: 100K and 20% of Z's population†.
565          *
566          * B. X is an exception explicitly approved by the committee or X has minimal
567          * language coverage‡ in CLDR itself.
568          * C. The language is in the CLDR-target locales
569          */
570         OfficialStatus minimalStatus = OfficialStatus.official_regional; // OfficialStatus.de_facto_official;
571         Map<String, String> languages = new TreeMap<>();
572         for (String language : standardCodes.getAvailableCodes("language")) {
573             String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, language);
574             String result = english.getStringValue(path);
575             if (result != null) {
576                 languages.put(language, result);
577             }
578         }
579 
580         if (SHOW_ALL_LANGUAGE_CODES) {
581             for (String language : languages.keySet()) {
582                 System.out.println(language + "\t" + languages.get(language));
583             }
584         } else {
585             System.out.println("- GenerateMaximalLocales.java: SHOW_ALL_LANGUAGE_CODES=true to show all language codes");
586         }
587 
588         // also CLDR-target locales
589         final Set<String> CLDRMainLanguages = new TreeSet<>(StandardCodes.make().getLocaleCoverageLocales(Organization.cldr));
590 
591         for (String territory : supplementalData.getTerritoriesWithPopulationData()) {
592             PopulationData territoryPop = supplementalData.getPopulationDataForTerritory(territory);
593             double territoryPopulation = territoryPop.getLiteratePopulation();
594             for (String languageScript : supplementalData.getLanguagesForTerritoryWithPopulationData(territory)) {
595                 PopulationData popData = supplementalData.getLanguageAndTerritoryPopulationData(languageScript,
596                     territory);
597                 ltp.set(languageScript);
598                 String language = ltp.getLanguage();
599 //                if (ltp.getScript().isEmpty()) {
600 //                    String max = likelySubtags.maximize(languageScript);
601 //                    if (max != null) {
602 //                        ltp.set(max).setRegion("");
603 //                        languageScript = ltp.toString();
604 //                    }
605 //                }
606                 boolean add = false;
607                 // #1
608                 OfficialStatus status = popData.getOfficialStatus();
609                 if (status.compareTo(minimalStatus) >= 0) {
610                     add = true;
611                 }
612                 long literatePopulation = getWritingPopulation(popData);
613                 // #2
614                 languageToLiteratePopulation.add(language, literatePopulation);
615                 // #3
616                 if (literatePopulation > minTerritoryPopulation
617                     && literatePopulation > minTerritoryPercent * territoryPopulation) {
618                     add = true;
619                 }
620                 if (add == false && CLDRMainLanguages.contains(language)) {
621                     add = true;
622                 }
623                 if (add) {
624                     add(languageToReason, language, territory, status, literatePopulation);
625                     // Add the containing regions
626                     for (String container : Containment.leafToContainer(territory)) {
627                         add(languageToReason, language, container, OfficialStatus.unknown, literatePopulation);
628                     }
629                 }
630             }
631         }
632         // #2, now that we have the data
633         for (String language : languageToLiteratePopulation.keySet()) {
634             long totalPop = languageToLiteratePopulation.getCount(language);
635             if (totalPop > minTotalPopulation) {
636                 add(languageToReason, language, "001", OfficialStatus.unknown, totalPop);
637             }
638         }
639 
640         // Specials
641         add(languageToReason, "und", "001", OfficialStatus.unknown, 0);
642 
643         // for (String language : Iso639Data.getAvailable()) {
644         // Scope scope = Iso639Data.getScope(language);
645         // Type type = Iso639Data.getType(language);
646         // if (scope == Scope.Special) {
647         // add(languageToReason, language, "001", OfficialStatus.unknown, -1);
648         // }
649         // }
650         // print them
651 
652         System.out.println("Detailed - Including:\t" + languageToReason.size());
653 
654         if (!SHOW_DETAILED) {
655             System.out.println("- GenerateMaximalLocales.java: SHOW_DETAILED=true to show more details");
656         } else {
657             for (String language : languageToReason.keySet()) {
658                 Set<RowData> reasons = languageToReason.get(language);
659 
660                 RowData lastReason = reasons.iterator().next();
661 
662                 System.out.append(language)
663                     .append("\t")
664                     .append(english.getName(language))
665                     .append("\t")
666                     .append(lastReason.getStatus().toShortString())
667                     .append("\t")
668                     .append(nf.format(languageToLiteratePopulation.getCount(language)));
669                 for (RowData reason : reasons) {
670                     String status = reason.getStatus().toShortString();
671                     System.out.append("\t")
672                         .append(status)
673                         .append("-")
674                         .append(reason.getName())
675                         .append("-")
676                         .append(nf.format(reason.getLiteratePopulation()));
677                 }
678                 System.out.append("\n");
679             }
680         }
681 
682         // now list them
683 
684         Set<String> others = new TreeSet<>();
685         others.addAll(standardCodes.getGoodAvailableCodes("language"));
686         others.removeAll(languageToReason.keySet());
687         System.out.println("\nIncluded Languages:\t" + languageToReason.keySet().size());
688         if (SHOW_INCLUDED_EXCLUDED) {
689             showLanguages(languageToReason.keySet(), languageToReason);
690         }
691         System.out.println("\nExcluded Languages:\t" + others.size());
692         if (SHOW_INCLUDED_EXCLUDED) {
693             showLanguages(others, languageToReason);
694         } else {
695             System.out.println(" - GenerateMaximalLocales.java: set SHOW_INCLUDED_EXCLUDED=true to show reason details");
696         }
697     }
698 
getWritingPopulation(PopulationData popData)699     private static long getWritingPopulation(PopulationData popData) {
700         final double writingPopulation = popData.getWritingPopulation();
701         if (!Double.isNaN(writingPopulation)) {
702             return (long) writingPopulation;
703         }
704         return (long) popData.getLiteratePopulation();
705     }
706 
showLanguages(Set<String> others, Map<String, Set<RowData>> languageToReason)707     private static void showLanguages(Set<String> others, Map<String, Set<RowData>> languageToReason) {
708         Set<String> sorted = new TreeSet<>(Collator.getInstance(ULocale.ENGLISH));
709         for (String language : others) {
710             sorted.add(getLanguageName(language, languageToReason));
711         }
712         char last = 0;
713         for (String language : sorted) {
714             final char curr = language.charAt(0);
715             if (last != curr) {
716                 System.out.println();
717             } else if (last != '\u0000') {
718                 System.out.print(", ");
719             }
720             System.out.print(language);
721             last = curr;
722         }
723         System.out.println();
724     }
725 
getLanguageName(String language, Map<String, Set<RowData>> languageToReason)726     private static String getLanguageName(String language,
727         Map<String, Set<RowData>> languageToReason) {
728         OfficialStatus best = OfficialStatus.unknown;
729         Set<RowData> reasons = languageToReason.get(language);
730         if (reasons != null) {
731             for (RowData reason : reasons) {
732                 final OfficialStatus currentStatus = reason.getStatus();
733                 if (best.compareTo(currentStatus) < 0) {
734                     best = currentStatus;
735                 }
736             }
737         }
738         String status = best.toShortString();
739         Scope scope = Iso639Data.getScope(language);
740         if (scope == Scope.Special) {
741             status = "S";
742         }
743         String languageFormatted = english.getName(language) + " [" + language + "]-" + status;
744         return languageFormatted;
745     }
746 
add(Map<String, Set<RowData>> languageToReason, String language, String territoryRaw, OfficialStatus status, long population)747     private static void add(Map<String, Set<RowData>> languageToReason, String language,
748         String territoryRaw, OfficialStatus status, long population) {
749         String territory = english.getName("territory", territoryRaw) + " [" + territoryRaw + "]";
750         Set<RowData> set = languageToReason.get(language);
751         if (set == null) {
752             languageToReason.put(language, set = new TreeSet<>());
753         }
754         set.add(new RowData(status, territory, population));
755     }
756 
    /**
     * In computing the defaultContents, no and nb require special handling.
     * Maps a child locale to the parent to use for it (nb → no, nb_NO → nb). printDefaultContent
     * consults this map before falling back to the simple (truncation) parent, and adds every
     * key of this map to the default content set.
     */
    static final Map<String, String> SPECIAL_CHILD_TO_PARENT = ImmutableMap.of("nb", "no", "nb_NO", "nb");
761 
    /**
     * Compute the defaultContent values for supplemental data and write them into
     * supplementalMetadata.xml.
     * It uses the maximization data and the simpleParent (truncation).
     * We can't use the normal "getParent" because that messes up the logic
     * used to handle inconsistencies in scripts in CLDR.<br>
     * That is, there are three situations: <ul>
     * <li>all children have explicit scripts; </li>
     * <li>no children have scripts; and </li>
     * <li>some do and some don't</li></ul>
     *
     * @param toMaximized likely-subtags map used to maximize each parent and child locale
     * @throws IOException if the supplemental metadata file cannot be read, written or copied
     */
    private static void printDefaultContent(Map<String, String> toMaximized) throws IOException {

        Set<String> defaultLocaleContent = new TreeSet<>();

        // go through all the cldr locales, and add default contents
        // now computed from toMaximized
        Set<String> available = factory.getAvailable();
        Relation<String, String> toSimpleChildren = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
        LanguageTagParser ltp = new LanguageTagParser();

        // System.out.println(maximize("az_Latn_AZ", toMaximized));
        // parents that have at least one child locale carrying an explicit script
        Set<String> hasSimpleChildWithScript = new TreeSet<>();

        // first get a mapping to children
        for (String locale : available) {
            if (locale.equals("root")) {
                continue;
            }
            // locales with variants are ignored; note that ltp stays set to this locale for the script test below
            if (ltp.set(locale).getVariants().size() != 0) {
                continue;
            }
            String parent = SPECIAL_CHILD_TO_PARENT.get(locale);
            if (parent == null) {
                parent = LocaleIDParser.getSimpleParent(locale); // we can't use the regular getParent (see above)
            }

            // ltp still holds the parse of locale from the variant check
            if (ltp.getScript().length() != 0) {
                hasSimpleChildWithScript.add(parent);
            }
            if (parent.equals("root")) {
                continue;
            }
            toSimpleChildren.put(parent, locale);
        }

        // Suppress script for locales for which we only have one locale in common/main. See ticket #7834.
        Set<String> suppressScriptLocales = new HashSet<>(Arrays.asList(
            "bm_ML", "en_US", "ha_NG", "iu_CA", "ms_MY", "mn_MN",
            "byn_ER", "ff_SN", "dyo_SN", "kk_KZ", "ku_TR", "ky_KG", "ml_IN", "so_SO", "sw_TZ", "wo_SN", "yo_NG", "dje_NE",
            "blt_VN",
            "hi_IN",
            "nv_US",
            "doi_IN"
            ));

        // if any have a script, then throw out any that don't have a script (unless they're specifically included.)
        // toRemove is reused (cleared) for each parent
        Set<String> toRemove = new TreeSet<>();
        for (String locale : hasSimpleChildWithScript) {
            toRemove.clear();
            Set<String> children = toSimpleChildren.getAll(locale);
            for (String child : children) {
                if (ltp.set(child).getScript().length() == 0 && !suppressScriptLocales.contains(child)) {
                    toRemove.add(child);
                }
            }
            if (toRemove.size() != 0) {
                System.out.println("\tRemoving:\t" + locale + "\t" + toRemove + "\tfrom\t" + children);
                toSimpleChildren.removeAll(locale, toRemove);
            }
        }

        // we add a child as a default locale if it has the same maximization
        // only the first matching child (in iteration order) is added for each parent
        main: for (String locale : toSimpleChildren.keySet()) {
            String maximized = maximize(locale, toMaximized);
            if (maximized == null) {
                if (SHOW_ADD) System.out.println("Missing maximized:\t" + locale);
                continue;
            }
            Set<String> children = toSimpleChildren.getAll(locale);
            // child -> maximized child, collected only for the diagnostic message below
            Map<String, String> debugStuff = new TreeMap<>();
            for (String child : children) {
                String maximizedChild = maximize(child, toMaximized);
                if (maximized.equals(maximizedChild)) {
                    defaultLocaleContent.add(child);
                    continue main;
                }
                debugStuff.put(child, maximizedChild);
            }
            if (SHOW_ADD) System.out.println("Can't find maximized: " + locale + "=" + maximized
                + "\tin\t" + debugStuff);
        }

        // the special children are always default content
        for (String specialChild : SPECIAL_CHILD_TO_PARENT.keySet()) {
            defaultLocaleContent.add(specialChild);
        }
        defaultLocaleContent.remove("und_ZZ"); // und_ZZ isn't ever a real locale. (old sandbox)
        defaultLocaleContent.remove("mul_ZZ"); // mul_ZZ isn't ever a real locale.

        showDefaultContentDifferencesAndFix(defaultLocaleContent);

        final File genSuppDir = new File(CLDRPaths.GEN_DIRECTORY, "supplemental");
        final File genSuppMetadataFile = new File(genSuppDir, "supplementalMetadata.xml");
        final File oldSuppMetadataFile = new File(CLDRPaths.SUPPLEMENTAL_DIRECTORY, "supplementalMetadata.xml");

        // Regenerate supplementalMetadata.xml: copy the old file, substituting a freshly built
        // <defaultContent> element for the existing one.
        try (
            PrintWriter genFile = FileUtilities.openUTF8Writer(genSuppMetadataFile);
            BufferedReader oldFile = FileUtilities.openUTF8Reader(oldSuppMetadataFile);) {
            // NOTE(review): presumably copies everything before the defaultContent line — confirm copyUpTo semantics
            CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*<defaultContent locales=\"\\s*"), genFile, false);

            String sep = CldrUtility.LINE_SEPARATOR + "\t\t\t";
            String broken = CldrUtility.breakLines(CldrUtility.join(defaultLocaleContent, " "), sep,
                PatternCache.get("(\\S)\\S*").matcher(""), 80);

            genFile.println("\t\t<defaultContent locales=\"" + broken + "\"");
            genFile.println("\t\t/>");

            // genFile.println("</supplementalData>");
            CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*/>\\s*(<!--.*)?"), null, true); // skip to matching >
            CldrUtility.copyUpTo(oldFile, null, genFile, true); // copy the rest
        }

        // Move it into place
        System.out.println("Copying generated " + genSuppMetadataFile + " to " + oldSuppMetadataFile);
        // the result of delete() is not checked; the copy below is made without any copy options
        oldSuppMetadataFile.delete();
        Files.copy(genSuppMetadataFile.toPath(), oldSuppMetadataFile.toPath());
    }
889 
890     private static class MaxData {
891         Relation<String, Row.R3<Double, String, String>> languages = Relation.of(new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class);
892         Map<String, Counter<String>> languagesToScripts = new TreeMap<>();
893         Map<String, Counter<String>> languagesToRegions = new TreeMap<>();
894 
895         Relation<String, Row.R3<Double, String, String>> scripts = Relation.of(new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class);
896         Map<String, Counter<String>> scriptsToLanguages = new TreeMap<>();
897         Map<String, Counter<String>> scriptsToRegions = new TreeMap<>();
898 
899         Relation<String, Row.R3<Double, String, String>> regions = Relation.of(new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class);
900         Map<String, Counter<String>> regionsToLanguages = new TreeMap<>();
901         Map<String, Counter<String>> regionsToScripts = new TreeMap<>();
902 
903         Map<String, Counter<Row.R2<String, String>>> containersToLanguage = new TreeMap<>();
904         Relation<String, Row.R4<Double, String, String, String>> containersToLangRegion = Relation.of(
905             new TreeMap<String, Set<Row.R4<Double, String, String, String>>>(), TreeSet.class);
906 
907         Relation<Row.R2<String, String>, Row.R2<Double, String>> languageScripts = Relation.of(
908             new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(),
909             TreeSet.class);
910         Relation<Row.R2<String, String>, Row.R2<Double, String>> scriptRegions = Relation.of(
911             new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(),
912             TreeSet.class);
913         Relation<Row.R2<String, String>, Row.R2<Double, String>> languageRegions = Relation.of(
914             new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(),
915             TreeSet.class);
916 
917         /**
918          * Add population information. "order" is the negative of the population (makes the first be the highest).
919          * @param language
920          * @param script
921          * @param region
922          * @param order
923          */
add(String language, String script, String region, Double order)924         void add(String language, String script, String region, Double order) {
925             if (SHOW_ADD && language.equals("mis")) {
926                 System.out.println(language + "\t" + script + "\t" + region + "\t" + -order);
927             }
928             languages.put(language, Row.of(order, script, region));
929             // addCounter(languagesToScripts, language, script, order);
930             // addCounter(languagesToRegions, language, region, order);
931 
932             scripts.put(script, Row.of(order, language, region));
933             // addCounter(scriptsToLanguages, script, language, order);
934             // addCounter(scriptsToRegions, script, region, order);
935 
936             regions.put(region, Row.of(order, language, script));
937             // addCounter(regionsToLanguages, region, language, order);
938             // addCounter(regionsToScripts, region, script, order);
939 
940             languageScripts.put(Row.of(language, script), Row.of(order, region));
941             scriptRegions.put(Row.of(script, region), Row.of(order, language));
942             languageRegions.put(Row.of(language, region), Row.of(order, script));
943 
944             Set<String> containerSet = Containment.leafToContainer(region);
945             if (containerSet != null) {
946                 for (String container : containerSet) {
947 
948                     containersToLangRegion.put(container, Row.of(order, language, script, region));
949                     Counter<R2<String, String>> data = containersToLanguage.get(container);
950                     if (data == null) {
951                         containersToLanguage.put(container, data = new Counter<>());
952                     }
953                     data.add(Row.of(language, script), (long) (double) order);
954 
955                 }
956             }
957 
958             if (SHOW_ADD) System.out.println("Data:\t" + language + "\t" + script + "\t" + region + "\t" + order);
959         }
960         // private void addCounter(Map<String, Counter<String>> map, String key, String key2, Double count) {
961         // Counter<String> counter = map.get(key);
962         // if (counter == null) {
963         // map.put(key, counter = new Counter<String>());
964         // }
965         // counter.add(key2, count.longValue());
966         // }
967     }
968 
    // Thresholds used by tryDifferentAlgorithm for languages whose official status is unknown.

    /** Absolute floor for the per-territory minimal literate population of an unofficial language. */
    private static final double MIN_UNOFFICIAL_LANGUAGE_SIZE = 10000000;
    /** Fraction of a territory's literate population used as the minimal size, before the floor is applied. */
    private static final double MIN_UNOFFICIAL_LANGUAGE_PROPORTION = 0.20;
    /** Lower size bar tested for unofficial languages whose language_region locale exists in CLDR. */
    private static final double MIN_UNOFFICIAL_CLDR_LANGUAGE_SIZE = 100000;
    /** Factor multiplied into the sort order of a language with unknown official status. */
    private static final double UNOFFICIAL_SCALE_DOWN = 0.2;

    // formatters used for the diagnostic output of tryDifferentAlgorithm
    private static NumberFormat percent = NumberFormat.getPercentInstance();
    private static NumberFormat number = NumberFormat.getIntegerInstance();
976 
tryDifferentAlgorithm(Map<String, String> toMaximized)977     private static void tryDifferentAlgorithm(Map<String, String> toMaximized) {
978         // we are going to try a different approach.
979         // first gather counts for maximized values
980         // Set<Row.R3<String,String,String>,Double> rowsToCounts = new TreeMap();
981         MaxData maxData = new MaxData();
982         Set<String> cldrLocales = factory.getAvailable();
983         Set<String> otherTerritories = new TreeSet<>(standardCodes.getGoodAvailableCodes("territory"));
984 
985         // process all the information to get the top values for each triple.
986         // each of the combinations of 1 or 2 components gets to be a key.
987         for (String region : supplementalData.getTerritoriesWithPopulationData()) {
988             otherTerritories.remove(region);
989             PopulationData regionData = supplementalData.getPopulationDataForTerritory(region);
990             final double literateTerritoryPopulation = regionData.getLiteratePopulation();
991             // we need any unofficial language to meet a certain absolute size requirement and proportion size
992             // requirement.
993             // so the bar is x percent of the population, reset up to y absolute size.
994             double minimalLiteratePopulation = literateTerritoryPopulation * MIN_UNOFFICIAL_LANGUAGE_PROPORTION;
995             if (minimalLiteratePopulation < MIN_UNOFFICIAL_LANGUAGE_SIZE) {
996                 minimalLiteratePopulation = MIN_UNOFFICIAL_LANGUAGE_SIZE;
997             }
998 
999             for (String writtenLanguage : supplementalData.getLanguagesForTerritoryWithPopulationData(region)) {
1000                 PopulationData data = supplementalData.getLanguageAndTerritoryPopulationData(writtenLanguage, region);
1001                 final double literatePopulation = getWritingPopulation(data); //data.getLiteratePopulation();
1002                 double order = -literatePopulation; // negative so we get the inverse order
1003 
1004                 if (data.getOfficialStatus() == OfficialStatus.unknown) {
1005                     final String locale = writtenLanguage + "_" + region;
1006                     if (literatePopulation >= minimalLiteratePopulation) {
1007                         // ok, skip
1008                     } else if (literatePopulation >= MIN_UNOFFICIAL_CLDR_LANGUAGE_SIZE && cldrLocales.contains(locale)) {
1009                         // ok, skip
1010                     } else {
1011                         // if (SHOW_ADD)
1012                         // System.out.println("Skipping:\t" + writtenLanguage + "\t" + region + "\t"
1013                         // + english.getName(locale)
1014                         // + "\t-- too small:\t" + number.format(literatePopulation));
1015                         // continue;
1016                     }
1017                     order *= UNOFFICIAL_SCALE_DOWN;
1018                     if (SHOW_ADD)
1019                         System.out.println("Retaining\t" + writtenLanguage + "\t" + region + "\t"
1020                             + english.getName(locale)
1021                             + "\t" + number.format(literatePopulation)
1022                             + "\t" + percent.format(literatePopulation / literateTerritoryPopulation)
1023                             + (cldrLocales.contains(locale) ? "\tin-CLDR" : ""));
1024                 }
1025                 String script;
1026                 String language = writtenLanguage;
1027                 final int pos = writtenLanguage.indexOf('_');
1028                 if (pos > 0) {
1029                     language = writtenLanguage.substring(0, pos);
1030                     script = writtenLanguage.substring(pos + 1);
1031                 } else {
1032                     script = getScriptForLocale2(language);
1033                 }
1034                 maxData.add(language, script, region, order);
1035             }
1036         }
1037 
1038         LanguageTagParser additionLtp = new LanguageTagParser();
1039 
1040         for (String addition : MAX_ADDITIONS) {
1041             additionLtp.set(addition);
1042             String lan = additionLtp.getLanguage();
1043             Set<R3<Double, String, String>> key = maxData.languages.get(lan);
1044             if (key == null) {
1045                 maxData.add(lan, additionLtp.getScript(), additionLtp.getRegion(), 1.0);
1046             } else {
1047                 int debug = 0;
1048             }
1049         }
1050 
1051         for (Entry<String, Collection<String>> entry : DeriveScripts.getLanguageToScript().asMap().entrySet()) {
1052             String language = entry.getKey();
1053             final Collection<String> values = entry.getValue();
1054             if (values.size() != 1) {
1055                 continue; // skip, no either way
1056             }
1057             Set<R3<Double, String, String>> old = maxData.languages.get(language);
1058             if (!maxData.languages.containsKey(language)) {
1059                 maxData.add(language, values.iterator().next(), TEMP_UNKNOWN_REGION, 1.0);
1060             }
1061         }
1062 
1063         // add others, with English default
1064         for (String region : otherTerritories) {
1065             if (region.length() == 3) continue; // FIX ONCE WE ADD REGIONS
1066             maxData.add("en", "Latn", region, 1.0);
1067         }
1068 
1069         // get a reverse mapping, so that we can add the aliases
1070 
1071         Map<String, R2<List<String>, String>> languageAliases = SupplementalDataInfo.getInstance().getLocaleAliasInfo()
1072             .get("language");
1073         for (Entry<String, R2<List<String>, String>> str : languageAliases.entrySet()) {
1074             String reason = str.getValue().get1();
1075             if ("overlong".equals(reason) || "bibliographic".equals(reason) || "macrolanguage".equals(reason)) {
1076                 continue;
1077             }
1078             List<String> replacements = str.getValue().get0();
1079             if (replacements == null) {
1080                 continue;
1081             }
1082             String goodLanguage = replacements.get(0);
1083 
1084             String badLanguage = str.getKey();
1085             if (badLanguage.contains("_")) {
1086                 continue;
1087             }
1088             if (deprecatedISONotInLST.contains(badLanguage)) {
1089                 continue;
1090             }
1091             Set<R3<Double, String, String>> goodLanguageData = maxData.languages.getAll(goodLanguage);
1092             if (goodLanguageData == null) {
1093                 continue;
1094             }
1095             R3<Double, String, String> value = goodLanguageData.iterator().next();
1096             final String script = value.get1();
1097             final String region = value.get2();
1098             maxData.add(badLanguage, script, region, 1.0);
1099             System.out.println("Adding aliases: " + badLanguage + ", " + script + ", " + region + ", " + reason);
1100         }
1101 
1102         // now, get the best for each one
1103         for (String language : maxData.languages.keySet()) {
1104             R3<Double, String, String> value = maxData.languages.getAll(language).iterator().next();
1105             final Comparable<String> script = value.get1();
1106             final Comparable<String> region = value.get2();
1107             add(language, language + "_" + script + "_" + region, toMaximized, "L->SR", LocaleOverride.REPLACE_EXISTING,
1108                 SHOW_ADD);
1109         }
1110         for (String language : maxData.languagesToScripts.keySet()) {
1111             String script = maxData.languagesToScripts.get(language).getKeysetSortedByCount(true).iterator().next();
1112             add(language, language + "_" + script, toMaximized, "L->S", LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
1113         }
1114         for (String language : maxData.languagesToRegions.keySet()) {
1115             String region = maxData.languagesToRegions.get(language).getKeysetSortedByCount(true).iterator().next();
1116             add(language, language + "_" + region, toMaximized, "L->R", LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
1117         }
1118 
1119         for (String script : maxData.scripts.keySet()) {
1120             R3<Double, String, String> value = maxData.scripts.getAll(script).iterator().next();
1121             final Comparable<String> language = value.get1();
1122             final Comparable<String> region = value.get2();
1123             add("und_" + script, language + "_" + script + "_" + region, toMaximized, "S->LR",
1124                 LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
1125         }
1126         for (String script : maxData.scriptsToLanguages.keySet()) {
1127             String language = maxData.scriptsToLanguages.get(script).getKeysetSortedByCount(true).iterator().next();
1128             add("und_" + script, language + "_" + script, toMaximized, "S->L", LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
1129         }
1130         for (String script : maxData.scriptsToRegions.keySet()) {
1131             String region = maxData.scriptsToRegions.get(script).getKeysetSortedByCount(true).iterator().next();
1132             add("und_" + script, "und_" + script + "_" + region, toMaximized, "S->R", LocaleOverride.REPLACE_EXISTING,
1133                 SHOW_ADD);
1134         }
1135 
1136         for (String region : maxData.regions.keySet()) {
1137             R3<Double, String, String> value = maxData.regions.getAll(region).iterator().next();
1138             final Comparable<String> language = value.get1();
1139             final Comparable<String> script = value.get2();
1140             add("und_" + region, language + "_" + script + "_" + region, toMaximized, "R->LS",
1141                 LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
1142         }
1143         for (String region : maxData.regionsToLanguages.keySet()) {
1144             String language = maxData.regionsToLanguages.get(region).getKeysetSortedByCount(true).iterator().next();
1145             add("und_" + region, language + "_" + region, toMaximized, "R->L", LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
1146         }
1147         for (String region : maxData.regionsToScripts.keySet()) {
1148             String script = maxData.regionsToScripts.get(region).getKeysetSortedByCount(true).iterator().next();
1149             add("und_" + region, "und_" + script + "_" + region, toMaximized, "R->S", LocaleOverride.REPLACE_EXISTING,
1150                 SHOW_ADD);
1151         }
1152 
1153         for (Entry<String, Counter<R2<String, String>>> containerAndInfo : maxData.containersToLanguage.entrySet()) {
1154             String region = containerAndInfo.getKey();
1155             if (region.equals("001")) {
1156                 continue;
1157             }
1158             Counter<R2<String, String>> data = containerAndInfo.getValue();
1159             Set<R2<String, String>> keysetSortedByCount = data.getKeysetSortedByCount(true);
1160             if (SHOW_CONTAINERS) { // debug
1161                 System.out.println("Container2L:\t" + region + "\t" + shorten(data.getEntrySetSortedByCount(true, null)));
1162                 System.out.println("Container2LR:\t" + region + "\t" + maxData.containersToLangRegion.get(region));
1163             }
1164             R2<String, String> value = keysetSortedByCount.iterator().next(); // will get most negative
1165             final Comparable<String> language = value.get0();
1166             final Comparable<String> script = value.get1();
1167 
1168             // fix special cases like es-419, where a locale exists.
1169             // for those cases, what we add as output is the container. Otherwise the region.
1170             Set<String> skipLanguages = cldrContainerToLanguages.get(region);
1171             if (skipLanguages != null
1172                 && skipLanguages.contains(language)) {
1173                 add("und_" + region, language + "_" + script + "_" + region, toMaximized, "R*->LS",
1174                     LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
1175                 continue;
1176             }
1177 
1178             // we now have the best language and script. Find the best region for that
1179             for (R4<Double, String, String, String> e : maxData.containersToLangRegion.get(region)) {
1180                 final Comparable<String> language2 = e.get1();
1181                 final Comparable<String> script2 = e.get2();
1182                 if (language2.equals(language) && script2.equals(script)) {
1183                     add("und_" + region, language + "_" + script + "_" + e.get3(), toMaximized, "R*->LS",
1184                         LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
1185                     break;
1186                 }
1187             }
1188         }
1189 
1190         for (R2<String, String> languageScript : maxData.languageScripts.keySet()) {
1191             R2<Double, String> value = maxData.languageScripts.getAll(languageScript).iterator().next();
1192             final Comparable<String> language = languageScript.get0();
1193             final Comparable<String> script = languageScript.get1();
1194             final Comparable<String> region = value.get1();
1195             add(language + "_" + script, language + "_" + script + "_" + region, toMaximized, "LS->R",
1196                 LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
1197         }
1198 
1199         for (R2<String, String> scriptRegion : maxData.scriptRegions.keySet()) {
1200             R2<Double, String> value = maxData.scriptRegions.getAll(scriptRegion).iterator().next();
1201             final Comparable<String> script = scriptRegion.get0();
1202             final Comparable<String> region = scriptRegion.get1();
1203             final Comparable<String> language = value.get1();
1204             add("und_" + script + "_" + region, language + "_" + script + "_" + region, toMaximized, "SR->L",
1205                 LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
1206         }
1207 
1208         for (R2<String, String> languageRegion : maxData.languageRegions.keySet()) {
1209             R2<Double, String> value = maxData.languageRegions.getAll(languageRegion).iterator().next();
1210             final Comparable<String> language = languageRegion.get0();
1211             final Comparable<String> region = languageRegion.get1();
1212             final Comparable<String> script = value.get1();
1213             add(language + "_" + region, language + "_" + script + "_" + region, toMaximized, "LR->S",
1214                 LocaleOverride.REPLACE_EXISTING, SHOW_ADD);
1215         }
1216 
1217         // get the script info from metadata as fallback
1218 
1219 
1220         TreeSet<String> sorted = new TreeSet<>(ScriptMetadata.getScripts());
1221         for (String script : sorted) {
1222             Info i = ScriptMetadata.getInfo(script);
1223             String likelyLanguage = i.likelyLanguage;
1224             if (LANGUAGE_CODE_TO_STATUS.get(likelyLanguage) == Status.special) {
1225                 likelyLanguage = "und";
1226             }
1227             String originCountry = i.originCountry;
1228             final String result = likelyLanguage + "_" + script + "_" + originCountry;
1229             add("und_" + script, result, toMaximized, "S->LR•",
1230                 LocaleOverride.KEEP_EXISTING, SHOW_ADD);
1231             add(likelyLanguage, result, toMaximized, "L->SR•",
1232                 LocaleOverride.KEEP_EXISTING, SHOW_ADD);
1233         }
1234 
1235         // add overrides
1236         for (String key : LANGUAGE_OVERRIDES.keySet()) {
1237             add(key, LANGUAGE_OVERRIDES.get(key), toMaximized, "OVERRIDE", LocaleOverride.REPLACE_EXISTING, true);
1238         }
1239 
1240         // Make sure that the mapping is Idempotent. If we have A ==> B, we must never have B ==> C
1241         // We run this check until we get no problems.
1242         Set<List<String>> problems = new HashSet<>();
1243 
1244         while (true) {
1245             problems.clear();
1246             for (Entry<String, String> entry : toMaximized.entrySet()) {
1247                 String source = entry.getKey();
1248                 String target = entry.getValue();
1249                 if (target.contains("_Zzzz") || target.contains("_ZZ")) { // these are special cases
1250                     continue;
1251                 }
1252                 String idempotentCandidate = LikelySubtags.maximize(target, toMaximized);
1253 
1254                 if (idempotentCandidate == null) {
1255                     System.out.println("Can't maximize " + target);
1256                 } else if (!idempotentCandidate.equals(target)) {
1257                     problems.add(ImmutableList.of(source, target, idempotentCandidate));
1258                 }
1259             }
1260             if (problems.isEmpty()) {
1261                 break;
1262             }
1263             for (List<String> row : problems) {
1264                 System.out.println("Idempotence: dropping mapping " + row.get(0) + " to " + row.get(1) + " since the target maps further to " + row.get(2));
1265                 toMaximized.remove(row.get(0));
1266             }
1267         }
1268     }
1269 
shorten(Object data)1270     public static String shorten(Object data) {
1271         String info = data.toString();
1272         if (info.length() > 255) {
1273             info = info.substring(0, 127) + "…";
1274         }
1275         return info;
1276     }
1277 
doAlt(Map<String, String> toMaximized)1278     private static void doAlt(Map<String, String> toMaximized) {
1279         // TODO Auto-generated method stub
1280         Map<String, String> temp = new TreeMap<>();
1281         for (String locale : toMaximized.keySet()) {
1282             String target = toMaximized.get(locale);
1283             temp.put(toAlt(locale, true), toAlt(target, true));
1284         }
1285         toMaximized.clear();
1286         toMaximized.putAll(temp);
1287     }
1288 
maximize(String languageTag, Map<String, String> toMaximized)1289     public static String maximize(String languageTag, Map<String, String> toMaximized) {
1290         LanguageTagParser ltp = new LanguageTagParser();
1291 
1292         // clean up the input by removing Zzzz, ZZ, and changing "" into und.
1293         ltp.set(languageTag);
1294         String language = ltp.getLanguage();
1295         String region = ltp.getRegion();
1296         String script = ltp.getScript();
1297         boolean changed = false;
1298         if (language.equals("")) {
1299             ltp.setLanguage(language = "und");
1300             changed = true;
1301         }
1302         if (region.equals(UNKNOWN_SCRIPT)) {
1303             ltp.setScript(script = "");
1304             changed = true;
1305         }
1306         if (ltp.getRegion().equals(UNKNOWN_REGION)) {
1307             ltp.setRegion(region = "");
1308             changed = true;
1309         }
1310         if (changed) {
1311             languageTag = ltp.toString();
1312         }
1313         // check whole
1314         String result = toMaximized.get(languageTag);
1315         if (result != null) {
1316             return result;
1317         }
1318         // try empty region
1319         if (region.length() != 0) {
1320             result = toMaximized.get(ltp.setRegion("").toString());
1321             if (result != null) {
1322                 return ltp.set(result).setRegion(region).toString();
1323             }
1324             ltp.setRegion(region); // restore
1325         }
1326         // try empty script
1327         if (script.length() != 0) {
1328             result = toMaximized.get(ltp.setScript("").toString());
1329             if (result != null) {
1330                 return ltp.set(result).setScript(script).toString();
1331             }
1332             // try empty script and region
1333             if (region.length() != 0) {
1334                 result = toMaximized.get(ltp.setRegion("").toString());
1335                 if (result != null) {
1336                     return ltp.set(result).setScript(script).setRegion(region).toString();
1337                 }
1338             }
1339         }
1340         if (!language.equals("und") && script.length() != 0 && region.length() != 0) {
1341             return languageTag; // it was ok, and we couldn't do anything with it
1342         }
1343         return null; // couldn't maximize
1344     }
1345 
minimize(String input, Map<String, String> toMaximized, boolean favorRegion)1346     public static String minimize(String input, Map<String, String> toMaximized, boolean favorRegion) {
1347         if (input.equals("nb_Latn_SJ")) {
1348             System.out.print(""); // debug
1349         }
1350         String maximized = maximize(input, toMaximized);
1351         if (maximized == null) {
1352             return null; // failed
1353         }
1354         LanguageTagParser ltp = new LanguageTagParser().set(maximized);
1355         String language = ltp.getLanguage();
1356         String region = ltp.getRegion();
1357         String script = ltp.getScript();
1358         // try building up from shorter to longer, and find the first that matches
1359         // could be more optimized, but for this code we want simplest
1360         String[] trials = { language,
1361             language + TAG_SEPARATOR + (favorRegion ? region : script),
1362             language + TAG_SEPARATOR + (!favorRegion ? region : script) };
1363         for (String trial : trials) {
1364             String newMaximized = maximize(trial, toMaximized);
1365             if (maximized.equals(newMaximized)) {
1366                 return trial;
1367             }
1368         }
1369         return maximized;
1370     }
1371 
1372     // /**
1373     // * Verify that we can map from each language, script, and country to something.
1374     // * @param toMaximized
1375     // */
1376     // private static void checkConsistency(Map<String, String> toMaximized) {
1377     // Map<String,String> needMappings = new TreeMap();
1378     // LanguageTagParser parser = new LanguageTagParser();
1379     // for (String maximized : new TreeSet<String>(toMaximized.values())) {
1380     // parser.set(maximized);
1381     // final String language = parser.getLanguage();
1382     // final String script = parser.getScript();
1383     // final String region = parser.getRegion();
1384     // if (language.length() == 0 || script.length() == 0 || region.length() == 0) {
1385     // failure("   { \"" + maximized + "\", \"" + maximized + "\" },   //     " + english.getName(maximized) +
1386     // "\t\tFailed-Consistency");
1387     // continue;
1388     // }
1389     // addIfNotIn(language, maximized, needMappings, toMaximized, "Consistency");
1390     // addIfNotIn(language + "_" + script, maximized, needMappings, toMaximized, "Consistency");
1391     // addIfNotIn(language + "_" + region, maximized, needMappings, toMaximized, "Consistency");
1392     // addIfNotIn("und_" + script, maximized, needMappings, toMaximized, "Consistency");
1393     // addIfNotIn("und_" + script + "_" + region, maximized, needMappings, toMaximized, "Consistency");
1394     // addIfNotIn("und_" + region, maximized, needMappings, toMaximized, "Consistency");
1395     // }
1396     // toMaximized.putAll(needMappings);
1397     // }
1398 
1399     // private static void failure(String string) {
1400     // System.out.println(string);
1401     // errorCount++;
1402     // }
1403 
1404     // private static void addIfNotIn(String key, String value, Map<String, String> toAdd, Map<String, String>
1405     // otherToCheck, String kind) {
1406     // addIfNotIn(key, value, toAdd, otherToCheck == null ? null : otherToCheck.keySet(), null, kind);
1407     // }
1408 
1409     // private static void addIfNotIn(String key, String value, Map<String, String> toAdd, Set<String> skipKey,
1410     // Set<String> skipValue, String kind) {
1411     // if (!key.equals(value)
1412     // && !toAdd.containsKey(key)
1413     // && (skipKey == null || !skipKey.contains(key))
1414     // && (skipValue == null || !skipValue.contains(value))) {
1415     // add(key, value, toAdd, kind);
1416     // }
1417     // }
1418 
1419     enum LocaleOverride {
1420         KEEP_EXISTING, REPLACE_EXISTING
1421     }
1422 
add(String key, String value, Map<String, String> toAdd, String kind, LocaleOverride override, boolean showAction)1423     private static void add(String key, String value, Map<String, String> toAdd, String kind, LocaleOverride override,
1424         boolean showAction) {
1425         if (SHOW_ADD && key.startsWith("mis")) {
1426             int debug = 1;
1427         }
1428         if (key.equals(DEBUG_ADD_KEY)) {
1429             System.out.println("*debug*");
1430         }
1431         String oldValue = toAdd.get(key);
1432         if (oldValue == null) {
1433             if (showAction) {
1434                 System.out.println("\tAdding:\t\t" + getName(key) + "\t=>\t" + getName(value) + "\t\t\t\t" + kind);
1435             }
1436         } else if (override == LocaleOverride.KEEP_EXISTING || value.equals(oldValue)) {
1437             // if (showAction) {
1438             // System.out.println("Skipping:\t" + key + "\t=>\t" + value + "\t\t\t\t" + kind);
1439             // }
1440             return;
1441         } else {
1442             if (showAction) {
1443                 System.out.println("\tReplacing:\t" + getName(key) + "\t=>\t" + getName(value) + "\t, was\t" + getName(oldValue) + "\t\t" + kind);
1444             }
1445         }
1446         toAdd.put(key, value);
1447     }
1448 
getName(String value)1449     private static String getName(String value) {
1450         return ConvertLanguageData.getLanguageCodeAndName(value);
1451     }
1452 
printLikelySubtags(Map<String, String> fluffup)1453     private static File printLikelySubtags(Map<String, String> fluffup) throws IOException {
1454         final File genDir = new File(CLDRPaths.GEN_DIRECTORY, "supplemental");
1455         final File genFile = new File(genDir, "likelySubtags" + (OUTPUT_STYLE == OutputStyle.XML ? ".xml" : ".txt"));
1456         System.out.println("Writing to " + genFile);
1457 
1458         try(PrintWriter out = FileUtilities.openUTF8Writer(genFile)) {
1459             String spacing = OUTPUT_STYLE == OutputStyle.PLAINTEXT ? "\t" : " ";
1460             String header = OUTPUT_STYLE != OutputStyle.XML ? "const MapToMaximalSubtags default_subtags[] = {"
1461                 : "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + CldrUtility.LINE_SEPARATOR
1462                     + "<!DOCTYPE supplementalData SYSTEM \"../../common/dtd/ldmlSupplemental.dtd\">"
1463                     + CldrUtility.LINE_SEPARATOR
1464                     + "<!--"
1465                     + CldrUtility.LINE_SEPARATOR
1466                     + CldrUtility.getCopyrightString()
1467                     + CldrUtility.LINE_SEPARATOR
1468                     + "-->"
1469                     + CldrUtility.LINE_SEPARATOR
1470                     + "<!--"
1471                     + CldrUtility.LINE_SEPARATOR
1472                     + "Likely subtags data is generated programatically from CLDR's language/territory/population" + CldrUtility.LINE_SEPARATOR
1473                     + "data using the GenerateMaximalLocales tool. Under normal circumstances, this file should" + CldrUtility.LINE_SEPARATOR
1474                     + "not be patched by hand, as any changes made in that fashion may be lost."
1475                     + CldrUtility.LINE_SEPARATOR
1476                     + "-->"
1477                     + CldrUtility.LINE_SEPARATOR
1478                     + "<supplementalData>" + CldrUtility.LINE_SEPARATOR
1479                     + "    <version number=\"$" +
1480                     "Revision$\"/>" + CldrUtility.LINE_SEPARATOR
1481                     + "    <likelySubtags>";
1482             String footer = OUTPUT_STYLE != OutputStyle.XML ? SEPARATOR + "};"
1483                 : "    </likelySubtags>" + CldrUtility.LINE_SEPARATOR
1484                     + "</supplementalData>";
1485             out.println(header);
1486             boolean first = true;
1487             Set<String> keys = new TreeSet<>(new LocaleStringComparator());
1488             keys.addAll(fluffup.keySet());
1489             for (String printingLocale : keys) {
1490                 String printingTarget = fluffup.get(printingLocale);
1491                 String comment = printingName(printingLocale, spacing) + spacing + "=>" + spacing
1492                     + printingName(printingTarget, spacing);
1493 
1494                 if (OUTPUT_STYLE == OutputStyle.XML) {
1495                     out.println("\t\t<likelySubtag from=\"" + printingLocale +
1496                         "\" to=\"" + printingTarget + "\"" +
1497                         "/>" + CldrUtility.LINE_SEPARATOR + "\t\t" + "<!--" + comment + "-->");
1498                 } else {
1499                     if (first) {
1500                         first = false;
1501                     } else {
1502                         out.print(",");
1503                     }
1504                     if (comment.length() > 70 && SEPARATOR.equals(CldrUtility.LINE_SEPARATOR)) {
1505                         comment = printingName(printingLocale, spacing) + SEPARATOR + "    // " + spacing + "=>" + spacing
1506                             + printingName(printingTarget, spacing);
1507                     }
1508                     out.print(
1509                         "  {"
1510                             + SEPARATOR + "    // " + comment
1511                             + SEPARATOR + "    \"" + printingLocale + "\","
1512                             + SEPARATOR + "    \"" + printingTarget + "\""
1513                             + CldrUtility.LINE_SEPARATOR + "  }");
1514                 }
1515             }
1516             out.println(footer);
1517             out.close();
1518         }
1519         return genFile;
1520     }
1521 
printingName(String locale, String spacing)1522     public static String printingName(String locale, String spacing) {
1523         if (locale == null) {
1524             return null;
1525         }
1526         LanguageTagParser parser = new LanguageTagParser().set(locale);
1527         String lang = parser.getLanguage();
1528         String script = parser.getScript();
1529         String region = parser.getRegion();
1530         return "{" + spacing +
1531             (lang.equals("und") ? "?" : english.getName(CLDRFile.LANGUAGE_NAME, lang)) + ";" + spacing +
1532             (script == null || script.equals("") ? "?" : english.getName(CLDRFile.SCRIPT_NAME, script)) + ";" + spacing
1533             +
1534             (region == null || region.equals("") ? "?" : english.getName(CLDRFile.TERRITORY_NAME, region)) + spacing
1535             + "}";
1536     }
1537 
1538     private static final String[][] ALT_REVERSAL = {
1539         //{ "no", "nb" },
1540         //{ "nb", "no" },
1541         { "he", "iw" },
1542         { "iw", "he" },
1543     };
1544 
toAlt(String locale, boolean change)1545     public static String toAlt(String locale, boolean change) {
1546         if (!change || locale == null) {
1547             return locale;
1548         }
1549         String firstTag = getFirstTag(locale);
1550         for (String[] pair : ALT_REVERSAL) {
1551             if (firstTag.equals(pair[0])) {
1552                 locale = pair[1] + locale.substring(pair[1].length());
1553                 break;
1554             }
1555         }
1556         locale = locale.replace("_", "-");
1557         return locale;
1558     }
1559 
getFirstTag(String locale)1560     private static String getFirstTag(String locale) {
1561         int pos = locale.indexOf('_');
1562         return pos < 0 ? locale : locale.substring(0, pos);
1563     }
1564 
1565     // private static Map<String, String> getBackMapping(Map<String, String> fluffup) {
1566     // Relation<String,String> backMap = new Relation(new TreeMap(), TreeSet.class, BEST_LANGUAGE_COMPARATOR);
1567     // for (String source : fluffup.keySet()) {
1568     // if (source.startsWith("und")) {
1569     // continue;
1570     // }
1571     // String maximized = fluffup.get(source);
1572     // backMap.put(maximized, source); // put in right order
1573     // }
1574     // Map<String,String> returnBackMap = new TreeMap();
1575     // for (String maximized : backMap.keySet()) {
1576     // final Set<String> all = backMap.getAll(maximized);
1577     // final String minimized = all.iterator().next();
1578     // returnBackMap.put(maximized, minimized);
1579     // }
1580     // return returnBackMap;
1581     // }
1582 
1583     /**
1584      * Language tags are presumed to share the first language, except possibly "und". Best is least
1585      */
1586     // private static Comparator BEST_LANGUAGE_COMPARATOR = new Comparator<String>() {
1587     // LanguageTagParser p1 = new LanguageTagParser();
1588     // LanguageTagParser p2 = new LanguageTagParser();
1589     // public int compare(String o1, String o2) {
1590     // if (o1.equals(o2)) return 0;
1591     // p1.set(o1);
1592     // p2.set(o2);
1593     // String lang1 = p1.getLanguage();
1594     // String lang2 = p2.getLanguage();
1595     //
1596     // // compare languages first
1597     // // put und at the end
1598     // int result = lang1.compareTo(lang2);
1599     // if (result != 0) {
1600     // if (lang1.equals("und")) return 1;
1601     // if (lang2.equals("und")) return -1;
1602     // return result;
1603     // }
1604     //
1605     // // now scripts and regions.
1606     // // if they have different numbers of fields, the shorter wins.
1607     // // If there are two fields, region is lowest.
1608     // // The simplest way is to just compare scripts first
1609     // // so zh-TW < zh-Hant, because we first compare "" to Hant
1610     // String script1 = p1.getScript();
1611     // String script2 = p2.getScript();
1612     // int scriptOrder = script1.compareTo(script2);
1613     // if (scriptOrder != 0) return scriptOrder;
1614     //
1615     // String region1 = p1.getRegion();
1616     // String region2 = p2.getRegion();
1617     // int regionOrder = region1.compareTo(region2);
1618     // if (regionOrder != 0) return regionOrder;
1619     //
1620     // return o1.compareTo(o2);
1621     // }
1622     //
1623     // };
1624 
minimize(Map<String, String> fluffup)1625     public static void minimize(Map<String, String> fluffup) {
1626         LanguageTagParser parser = new LanguageTagParser();
1627         LanguageTagParser targetParser = new LanguageTagParser();
1628         Set<String> removals = new TreeSet<>();
1629         while (true) {
1630             removals.clear();
1631             for (String locale : fluffup.keySet()) {
1632                 String target = fluffup.get(locale);
1633                 if (targetParser.set(target).getRegion().equals(UNKNOWN_REGION)) {
1634                     removals.add(locale);
1635                     if (SHOW_ADD)
1636                         System.out.println("Removing:\t" + getName(locale) + "\t=>\t" + getName(target)
1637                             + "\t\t - Unknown Region in target");
1638                     continue;
1639                 }
1640                 if (targetParser.getScript().equals(UNKNOWN_SCRIPT)) {
1641                     removals.add(locale);
1642                     if (SHOW_ADD)
1643                         System.out.println("Removing:\t" + getName(locale) + "\t=>\t" + getName(target)
1644                             + "\t\t - Unknown Script in target");
1645                     continue;
1646                 }
1647 
1648                 String region = parser.set(locale).getRegion();
1649                 if (region.length() != 0) {
1650                     if (region.equals(UNKNOWN_REGION)) {
1651                         removals.add(locale);
1652                         if (SHOW_ADD)
1653                             System.out.println("Removing:\t" + getName(locale) + "\t=>\t" + getName(target)
1654                                 + "\t\t - Unknown Region in source");
1655                         continue;
1656                     }
1657                     parser.setRegion("");
1658                     String newLocale = parser.toString();
1659                     String newTarget = fluffup.get(newLocale);
1660                     if (newTarget != null) {
1661                         newTarget = targetParser.set(newTarget).setRegion(region).toString();
1662                         if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) {
1663                             removals.add(locale);
1664                             if (SHOW_ADD)
1665                                 System.out.println("Removing:\t" + locale + "\t=>\t" + target + "\t\tRedundant with "
1666                                     + newLocale);
1667                             continue;
1668                         }
1669                     }
1670                 }
1671                 String script = parser.set(locale).getScript();
1672                 if (locale.equals(DEBUG_ADD_KEY)) {
1673                     System.out.println("*debug*");
1674                 }
1675                 if (script.length() != 0) {
1676                     if (script.equals(UNKNOWN_SCRIPT)) {
1677                         removals.add(locale);
1678                         if (SHOW_ADD)
1679                             System.out.println("Removing:\t" + locale + "\t=>\t" + target + "\t\t - Unknown Script");
1680                         continue;
1681                     }
1682                     parser.setScript("");
1683                     String newLocale = parser.toString();
1684                     String newTarget = fluffup.get(newLocale);
1685                     if (newTarget != null) {
1686                         newTarget = targetParser.set(newTarget).setScript(script).toString();
1687                         if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) {
1688                             removals.add(locale);
1689                             if (SHOW_ADD)
1690                                 System.out.println("Removing:\t" + locale + "\t=>\t" + target + "\t\tRedundant with "
1691                                     + newLocale);
1692                             continue;
1693                         }
1694                     }
1695                 }
1696             }
1697             if (removals.size() == 0) {
1698                 break;
1699             }
1700             for (String locale : removals) {
1701                 fluffup.remove(locale);
1702             }
1703         }
1704     }
1705 
1706     // private static void addLanguageScript(Map<String, String> fluffup, LanguageTagParser parser) {
1707     // // add script
1708     // Map<String, String> temp = new TreeMap<String, String>();
1709     // while (true) {
1710     // temp.clear();
1711     // for (String target : new TreeSet<String>(fluffup.values())) {
1712     // parser.set(target);
1713     // final String territory = parser.getRegion();
1714     // if (territory.length() == 0) {
1715     // continue;
1716     // }
1717     // parser.setRegion("");
1718     // String possibleSource = parser.toString();
1719     // if (fluffup.containsKey(possibleSource)) {
1720     // continue;
1721     // }
1722     // String other = temp.get(possibleSource);
1723     // if (other != null) {
1724     // if (!target.equals(other)) {
1725     // System.out.println("**Failure with multiple sources in addLanguageScript: "
1726     // + possibleSource + "\t=>\t" + target + ", " + other);
1727     // }
1728     // continue;
1729     // }
1730     // temp.put(possibleSource, target);
1731     // if (SHOW_ADD) System.out.println("Adding:\t" + possibleSource + "\t=>\t" + target + "\t\tLanguage-Script");
1732     // }
1733     // if (temp.size() == 0) {
1734     // break;
1735     // }
1736     // fluffup.putAll(temp);
1737     // }
1738     //
1739     // }
1740 
1741     // private static void addLanguageCountry(Map<String, String> fluffup, LanguageTagParser parser) {
1742     // // add script
1743     // Map<String, String> temp = new TreeMap<String, String>();
1744     // while (true) {
1745     // temp.clear();
1746     // for (String target : new TreeSet<String>(fluffup.values())) {
1747     // parser.set(target);
1748     // String script = parser.getScript();
1749     // if (script.length() == 0) {
1750     // continue;
1751     // }
1752     // parser.setScript("");
1753     // String possibleSource = parser.toString();
1754     // if (fluffup.containsKey(possibleSource)) {
1755     // continue;
1756     // }
1757     // String other = temp.get(possibleSource);
1758     //
1759     // if (other != null) {
1760     // if (!target.equals(other)) {
1761     // script = getScriptForLocale(possibleSource);
1762     // if (script == null) {
1763     // System.out.println("**Failure with multiple sources in addLanguageCountry: "
1764     // + possibleSource + "\t=>\t" + target + ", " + other);
1765     // continue; // error message in routine
1766     // }
1767     // parser.setScript(script);
1768     // target = parser.toString();
1769     // }
1770     // }
1771     //
1772     // temp.put(possibleSource, target);
1773     // if (SHOW_ADD) System.out.println("Adding:\t" + possibleSource + "\t=>\t" + target + "\t\tLanguageCountry");
1774     // }
1775     // if (temp.size() == 0) {
1776     // break;
1777     // }
1778     // fluffup.putAll(temp);
1779     // }
1780     //
1781     // }
1782 
1783     // private static void addScript(Map<String, String> fluffup, LanguageTagParser parser) {
1784     // // add script
1785     // Map<String, String> temp = new TreeMap<String, String>();
1786     // while (true) {
1787     // temp.clear();
1788     // Set skipTarget = fluffup.keySet();
1789     // for (String locale : fluffup.keySet()) {
1790     // String target = fluffup.get(locale);
1791     // parser.set(target);
1792     // if (parser.getScript().length() != 0) {
1793     // continue;
1794     // }
1795     // String script = getScriptForLocale(target);
1796     //
1797     // if (script == null) {
1798     // continue; // error message in routine
1799     // }
1800     // parser.setScript(script);
1801     // String furtherTarget = parser.toString();
1802     // addIfNotIn(target, furtherTarget, temp, fluffup, "Script");
1803     // }
1804     // if (temp.size() == 0) {
1805     // break;
1806     // }
1807     // fluffup.putAll(temp);
1808     // }
1809     // }
1810 
1811     // private static String getScriptForLocale(String locale) {
1812     // String result = getScriptForLocale2(locale);
1813     // if (result != null) return result;
1814     // int pos = locale.indexOf('_');
1815     // if (pos >= 0) {
1816     // result = getScriptForLocale2(locale.substring(0,pos));
1817     // }
1818     // return result;
1819     // }
1820 
1821     private static String UNKNOWN_SCRIPT = "Zzzz";
1822     private static String UNKNOWN_REGION = "ZZ";
1823 
getScriptForLocale2(String locale)1824     private static String getScriptForLocale2(String locale) {
1825         String result = localeToScriptCache.get(locale);
1826         if (result != null) {
1827             return result;
1828         }
1829         if (locale.equals("ky")) {
1830             int debug = 0;
1831         }
1832         try {
1833             Map<Type, BasicLanguageData> data = supplementalData.getBasicLanguageDataMap(locale);
1834             if (data != null) {
1835                 for (BasicLanguageData datum : data.values()) {
1836                     final Set<String> scripts = datum.getScripts();
1837                     boolean isPrimary = datum.getType() == BasicLanguageData.Type.primary;
1838                     if (scripts.size() != 1) {
1839                         if (scripts.size() > 1 && isPrimary) {
1840                             break;
1841                         }
1842                         continue;
1843                     }
1844                     String script = scripts.iterator().next();
1845                     if (isPrimary) {
1846                         return result = script;
1847                     } else if (result == null) {
1848                         result = script;
1849                     }
1850                 }
1851                 if (result != null) {
1852                     return result;
1853                 }
1854             }
1855             CLDRFile cldrFile;
1856             try {
1857                 cldrFile = factory.make(locale, true);
1858             } catch (RuntimeException e) {
1859                 result = FALLBACK_SCRIPTS.get(locale);
1860                 if (result == null) {
1861                     System.err.println("***Failed to find script in L-S-R or MAX_ADDITIONS for: " + locale + "\t" + english.getName(locale));
1862                     return result = UNKNOWN_SCRIPT;
1863                 } else {
1864                     return result;
1865                 }
1866             }
1867             UnicodeSet exemplars = getExemplarSet(cldrFile, "");
1868             Set<String> CLDRScripts = getScriptsFromUnicodeSet(exemplars);
1869             CLDRScripts.remove(UNKNOWN_SCRIPT);
1870             if (CLDRScripts.size() == 1) {
1871                 return result = CLDRScripts.iterator().next();
1872             } else if (CLDRScripts.size() == 0) {
1873                 System.out.println("**Failed to get script for:\t" + locale);
1874                 return result = UNKNOWN_SCRIPT;
1875             } else {
1876                 System.out.println("**Failed, too many scripts for:\t" + locale + ", " + CLDRScripts);
1877                 return result = UNKNOWN_SCRIPT;
1878             }
1879         } finally {
1880             if (result.equals(UNKNOWN_SCRIPT)) {
1881                 String temp = LANGUAGE_OVERRIDES.get(locale);
1882                 if (temp != null) {
1883                     result = new LanguageTagParser().set(temp).getScript();
1884                     System.err.println("***Warning, Getting script from LANGUAGE_OVERRIDES for " + locale + " => " + result);
1885                 }
1886             }
1887             localeToScriptCache.put(locale, result);
1888             if (SHOW_ADD)
1889                 System.out.println("Script:\t" + locale + "\t" + english.getName(locale) + "\t=>\t" + result + "\t"
1890                     + english.getName(CLDRFile.SCRIPT_NAME, result));
1891         }
1892     }
1893 
1894     // private static Map<String, String> closeMapping(Map<String, String> fluffup) {
1895     // if (SHOW_ADD) System.out.flush();
1896     // Map<String,String> temp = new TreeMap<String,String>();
1897     // while (true) {
1898     // temp.clear();
1899     // for (String locale : fluffup.keySet()) {
1900     // String target = fluffup.get(locale);
1901     // if (target.equals("si_Sinh") || target.equals("zh-Hani")) {
1902     // System.out.println("????");
1903     // }
1904     // String furtherTarget = fluffup.get(target);
1905     // if (furtherTarget == null) {
1906     // continue;
1907     // }
1908     // addIfNotIn(locale, furtherTarget, temp, null, "Close");
1909     // }
1910     // if (temp.size() == 0) {
1911     // break;
1912     // }
1913     // fluffup.putAll(temp);
1914     // }
1915     // if (SHOW_ADD) System.out.flush();
1916     // return temp;
1917     // }
1918 
getScriptsFromUnicodeSet(UnicodeSet exemplars)1919     public static Set<String> getScriptsFromUnicodeSet(UnicodeSet exemplars) {
1920         // use bits first, since that's faster
1921         BitSet scriptBits = new BitSet();
1922         boolean show = false;
1923         for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next();) {
1924             if (show)
1925                 System.out.println(Integer.toHexString(it.codepoint));
1926             if (it.codepoint != UnicodeSetIterator.IS_STRING) {
1927                 scriptBits.set(UScript.getScript(it.codepoint));
1928             } else {
1929                 int cp;
1930                 for (int i = 0; i < it.string.length(); i += UTF16.getCharCount(cp)) {
1931                     scriptBits.set(UScript.getScript(cp = UTF16.charAt(it.string, i)));
1932                 }
1933             }
1934         }
1935         scriptBits.clear(UScript.COMMON);
1936         scriptBits.clear(UScript.INHERITED);
1937         Set<String> scripts = new TreeSet<>();
1938         for (int j = 0; j < scriptBits.size(); ++j) {
1939             if (scriptBits.get(j)) {
1940                 scripts.add(UScript.getShortName(j));
1941             }
1942         }
1943         return scripts;
1944     }
1945 
getExemplarSet(CLDRFile cldrfile, String type)1946     public static UnicodeSet getExemplarSet(CLDRFile cldrfile, String type) {
1947         if (type.length() != 0)
1948             type = "[@type=\"" + type + "\"]";
1949         String v = cldrfile.getStringValue("//ldml/characters/exemplarCharacters"
1950             + type);
1951         if (v == null)
1952             return new UnicodeSet();
1953         return new UnicodeSet(v);
1954     }
1955 
1956     // private static String[][] SpecialCases = {
1957     // { "zh_Hani", "zh_Hans_CN"},
1958     // { "si_Sinh", "si_Sinh_LK"},
1959     // { "ii", "ii_CN"}, // Sichuan Yi (Yi)
1960     // { "iu", "iu_CA"}, // Inuktitut (Unified Canadian Aboriginal Syllabics)
1961     // { "und", "en"}, // English default
1962     // };
1963 
showDefaultContentDifferencesAndFix(Set<String> defaultLocaleContent)1964     static void showDefaultContentDifferencesAndFix(Set<String> defaultLocaleContent) {
1965         Set<String> errors = new LinkedHashSet<>();
1966         Map<String, String> oldDefaultContent = SupplementalDataInfo.makeLocaleToDefaultContents(
1967             ConvertLanguageData.supplementalData.getDefaultContentLocales(), new TreeMap<String, String>(), errors);
1968         if (!errors.isEmpty()) {
1969             System.out.println(Joiner.on("\n").join(errors));
1970             errors.clear();
1971         }
1972         Map<String, String> newDefaultContent = SupplementalDataInfo.makeLocaleToDefaultContents(defaultLocaleContent,
1973             new TreeMap<String, String>(), errors);
1974         if (!errors.isEmpty()) {
1975             System.out.println("Default Content errors: " + Joiner.on("\n").join(errors));
1976             errors.clear();
1977         }
1978         Set<String> changes = compareMapsAndFixNew("*WARNING* Default Content: ", oldDefaultContent, newDefaultContent,
1979             "ar", "ar_001");
1980         System.out.println(Joiner.on("\n").join(changes));
1981         defaultLocaleContent.clear();
1982         defaultLocaleContent.addAll(newDefaultContent.values());
1983         newDefaultContent = SupplementalDataInfo.makeLocaleToDefaultContents(defaultLocaleContent,
1984             new TreeMap<String, String>(), errors);
1985         if (!errors.isEmpty()) {
1986             System.out.println("***New Errors: " + Joiner.on("\n").join(errors));
1987         }
1988     }
1989 
compareMapsAndFixNew(String title, Map<String, String> oldContent, Map<String, String> newContent, String... allowedOverrideValues)1990     private static Set<String> compareMapsAndFixNew(String title,
1991         Map<String, String> oldContent,
1992         Map<String, String> newContent, String... allowedOverrideValues) {
1993         Map<String, String> allowedOverrideValuesTest = new HashMap<>();
1994         for (int i = 0; i < allowedOverrideValues.length; i += 2) {
1995             allowedOverrideValuesTest.put(allowedOverrideValues[i], allowedOverrideValues[i + 1]);
1996         }
1997         Set<String> changes = new TreeSet<>();
1998         for (String parent : Builder.with(new TreeSet<String>()).addAll(newContent.keySet())
1999             .addAll(oldContent.keySet()).get()) {
2000             String oldValue = oldContent.get(parent);
2001             String newValue = newContent.get(parent);
2002             String overrideValue = allowedOverrideValuesTest.get(parent);
2003             if (overrideValue != null) {
2004                 newContent.put(parent, overrideValue);
2005                 newValue = overrideValue;
2006             }
2007             if (CldrUtility.equals(oldValue, newValue)) {
2008                 continue;
2009             }
2010             String message;
2011             if (oldValue == null) {
2012                 message = "Adding " + ConvertLanguageData.getLanguageCodeAndName(parent) + " => "
2013                     + ConvertLanguageData.getLanguageCodeAndName(newValue);
2014                 newContent.put(parent, newValue);
2015             } else if (newValue == null) {
2016                 if (SUPPRESS_CHANGES) {
2017                     message = "Suppressing removal of "
2018                         + ConvertLanguageData.getLanguageCodeAndName(parent) + " => "
2019                         + ConvertLanguageData.getLanguageCodeAndName(oldValue);
2020                     newContent.put(parent, oldValue);
2021                 } else {
2022                     message = "Removing "
2023                         + ConvertLanguageData.getLanguageCodeAndName(parent) + " => "
2024                         + ConvertLanguageData.getLanguageCodeAndName(oldValue);
2025                     newContent.remove(oldValue);
2026                 }
2027             } else {
2028                 if (SUPPRESS_CHANGES) {
2029                     message = "Suppressing change of "
2030                         + ConvertLanguageData.getLanguageCodeAndName(parent) + " => "
2031                         + ConvertLanguageData.getLanguageCodeAndName(oldValue) + " to "
2032                         + ConvertLanguageData.getLanguageCodeAndName(newValue);
2033                     newContent.remove(newValue);
2034                     newContent.put(parent, oldValue);
2035                 } else {
2036                     message = "Changing "
2037                         + ConvertLanguageData.getLanguageCodeAndName(parent) + " => "
2038                         + ConvertLanguageData.getLanguageCodeAndName(oldValue) + " to "
2039                         + ConvertLanguageData.getLanguageCodeAndName(newValue);
2040                     newContent.remove(oldValue);
2041                     newContent.put(parent, newValue);
2042                 }
2043             }
2044             changes.add(title + message);
2045         }
2046         return changes;
2047     }
2048 
2049     public static class LocaleStringComparator implements Comparator<String> {
2050         LanguageTagParser ltp0 = new LanguageTagParser();
2051         LanguageTagParser ltp1 = new LanguageTagParser();
2052 
2053         @Override
compare(String arg0, String arg1)2054         public int compare(String arg0, String arg1) {
2055             ltp0.set(arg0);
2056             ltp1.set(arg1);
2057             String s0 = ltp0.getLanguage();
2058             String s1 = ltp1.getLanguage();
2059             int result = s0.compareTo(s1);
2060             if (result != 0) {
2061                 return s0.equals("und") ? 1
2062                     : s1.equals("und") ? -1
2063                         : result;
2064             }
2065             s0 = ltp0.getScript();
2066             s1 = ltp1.getScript();
2067             result = s0.compareTo(s1);
2068             if (result != 0) {
2069                 return result;
2070             }
2071             s0 = ltp0.getRegion();
2072             s1 = ltp1.getRegion();
2073             result = s0.compareTo(s1);
2074             if (result != 0) {
2075                 return result;
2076             }
2077             return arg0.compareTo(arg1); // just in case
2078         }
2079 
2080     }
2081 }
2082