• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.tool;
2 
3 import java.io.BufferedReader;
4 import java.io.File;
5 import java.io.IOException;
6 import java.io.PrintWriter;
7 import java.util.Arrays;
8 import java.util.BitSet;
9 import java.util.Collection;
10 import java.util.Comparator;
11 import java.util.HashMap;
12 import java.util.HashSet;
13 import java.util.LinkedHashSet;
14 import java.util.List;
15 import java.util.Map;
16 import java.util.Map.Entry;
17 import java.util.Set;
18 import java.util.TreeMap;
19 import java.util.TreeSet;
20 
21 import org.unicode.cldr.draft.FileUtilities;
22 import org.unicode.cldr.draft.ScriptMetadata;
23 import org.unicode.cldr.draft.ScriptMetadata.Info;
24 import org.unicode.cldr.util.Builder;
25 import org.unicode.cldr.util.CLDRFile;
26 import org.unicode.cldr.util.CLDRLocale;
27 import org.unicode.cldr.util.CLDRPaths;
28 import org.unicode.cldr.util.CldrUtility;
29 import org.unicode.cldr.util.Containment;
30 import org.unicode.cldr.util.Counter;
31 import org.unicode.cldr.util.Factory;
32 import org.unicode.cldr.util.Iso639Data;
33 import org.unicode.cldr.util.Iso639Data.Scope;
34 import org.unicode.cldr.util.LanguageTagParser;
35 import org.unicode.cldr.util.LocaleIDParser;
36 import org.unicode.cldr.util.Log;
37 import org.unicode.cldr.util.PatternCache;
38 import org.unicode.cldr.util.SimpleFactory;
39 import org.unicode.cldr.util.StandardCodes;
40 import org.unicode.cldr.util.SupplementalDataInfo;
41 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData;
42 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData.Type;
43 import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus;
44 import org.unicode.cldr.util.SupplementalDataInfo.PopulationData;
45 
46 import com.google.common.collect.ImmutableMap;
47 import com.google.common.collect.ImmutableSet;
48 import com.ibm.icu.dev.util.CollectionUtilities;
49 import com.ibm.icu.impl.Relation;
50 import com.ibm.icu.impl.Row;
51 import com.ibm.icu.impl.Row.R2;
52 import com.ibm.icu.impl.Row.R3;
53 import com.ibm.icu.impl.Row.R4;
54 import com.ibm.icu.lang.UScript;
55 import com.ibm.icu.text.Collator;
56 import com.ibm.icu.text.NumberFormat;
57 import com.ibm.icu.text.UTF16;
58 import com.ibm.icu.text.UnicodeSet;
59 import com.ibm.icu.text.UnicodeSetIterator;
60 import com.ibm.icu.util.ULocale;
61 
62 /**
63  * Problems:
64  * "und_Hani", "zh_Hani"
65  * "und_Sinh", "si_Sinh"
66  *
67  * @author markdavis
68  *
69  */
70 public class GenerateMaximalLocales {
71 
72     private static final String TEMP_UNKNOWN_REGION = "XZ";
73 
74     private static final String DEBUG_ADD_KEY = "und_Latn_ZA";
75 
76     private static final boolean SHOW_ADD = CldrUtility.getProperty("GenerateMaximalLocalesDebug", false);
77     private static final boolean SUPPRESS_CHANGES = CldrUtility.getProperty("GenerateMaximalLocalesSuppress", false);
78     private static final boolean SHOW_CONTAINERS = false;
79 
/** Output formats this tool can produce; the active one is looked up by name from the "OutputStyle" property. */
enum OutputStyle {
    PLAINTEXT,
    C,
    C_ALT,
    XML
}
83 
84     private static OutputStyle OUTPUT_STYLE = OutputStyle.valueOf(CldrUtility.getProperty("OutputStyle", "XML", "XML")
85         .toUpperCase());
86 
87     // set based on above
88     private static final String SEPARATOR = OUTPUT_STYLE == OutputStyle.C || OUTPUT_STYLE == OutputStyle.C_ALT ? CldrUtility.LINE_SEPARATOR
89         : "\t";
90     private static final String TAG_SEPARATOR = OUTPUT_STYLE == OutputStyle.C_ALT ? "-" : "_";
91     // private static final boolean FAVOR_REGION = true; // OUTPUT_STYLE == OutputStyle.C_ALT;
92 
93     private static final boolean tryDifferent = true;
94 
95     private static final File list[] = {
96         new File(CLDRPaths.MAIN_DIRECTORY),
97         new File(CLDRPaths.SEED_DIRECTORY),
98         new File(CLDRPaths.EXEMPLARS_DIRECTORY) };
99 
100     private static Factory factory = SimpleFactory.make(list, ".*");
101     private static SupplementalDataInfo supplementalData = SupplementalDataInfo
102         .getInstance(CLDRPaths.SUPPLEMENTAL_DIRECTORY);
103     private static StandardCodes standardCodes = StandardCodes.make();
104     private static CLDRFile english = factory.make("en", false);
105     static Relation<String, String> cldrContainerToLanguages = Relation.of(new HashMap<String, Set<String>>(), HashSet.class);
106     static {
107         for (CLDRLocale locale : ToolConfig.getToolInstance().getCldrFactory().getAvailableCLDRLocales()) {
108             String region = locale.getCountry();
109             if (region == null || region.isEmpty() || Containment.isLeaf(region)) {
110                 continue;
111             }
cldrContainerToLanguages.put(region, locale.getLanguage())112             cldrContainerToLanguages.put(region, locale.getLanguage());
113         }
cldrContainerToLanguages.freeze()114         cldrContainerToLanguages.freeze();
115         System.out.println("Keep containers " + cldrContainerToLanguages);
116     }
117 
118     private static final List<String> KEEP_TARGETS = Arrays.asList("und_Arab_PK", "und_Latn_ET");
119     private static final ImmutableSet<String> deprecatedISONotInLST = ImmutableSet.of("scc", "scr");
120 
121     /**
122      * This is the simplest way to override, by supplying the max value.
123      * It gets a very low weight, so doesn't override any stronger value.
124      */
125     private static final String[] MAX_ADDITIONS = new String[] {
126         "bss_Latn_CM",
127         "gez_Ethi_ET",
128         "ken_Latn_CM",
129         "und_Arab_PK",
130         "wa_Latn_BE",
131 
132         "fub_Arab_CM",
133         "fuf_Latn_GN",
134         "kby_Arab_NE",
135         "kdh_Arab_TG",
136         "apd_Arab_TG",
137         "zlm_Latn_TG",
138 
139         "cr_Cans_CA",
140         "hif_Latn_FJ",
141         "gon_Telu_IN",
142         "lzz_Latn_TR",
143         "lif_Deva_NP",
144         "unx_Beng_IN",
145         "unr_Beng_IN",
146         "ttt_Latn_AZ",
147         "pnt_Grek_GR",
148         "tly_Latn_AZ",
149         "tkr_Latn_AZ",
150         "bsq_Bass_LR",
151         "ccp_Cakm_BD",
152         "blt_Tavt_VN",
153         "rhg_Arab_MM",
154         "rhg_Rohg_MM",
155     };
156 
157     /**
158      * The following overrides MASH the final values, so they may not result in consistent results. Safer is to add to MAX_ADDITIONS.
159      * However, if you add, add both the language and language+script mappings.
160      */
161     // Many of the overrides below can be removed once the language/pop/country data is updated.
162     private static final Map<String, String> LANGUAGE_OVERRIDES = CldrUtility.asMap(new String[][] {
163         { "eo", "eo_Latn_001" },
164         { "eo_Latn", "eo_Latn_001" },
165         { "es", "es_Latn_ES" },
166         { "es_Latn", "es_Latn_ES" },
167         { "ff_BF", "ff_Latn_BF" },
168         { "ff_GM", "ff_Latn_GM" },
169         { "ff_GH", "ff_Latn_GH" },
170         { "ff_GW", "ff_Latn_GW" },
171         { "ff_LR", "ff_Latn_LR" },
172         { "ff_NE", "ff_Latn_NE" },
173         { "ff_NG", "ff_Latn_NG" },
174         { "ff_SL", "ff_Latn_SL" },
175         { "ff_Adlm", "ff_Adlm_GN" },
176         { "ia", "ia_Latn_001" },
177         { "ia_Latn", "ia_Latn_001" },
178         { "io", "io_Latn_001" },
179         { "io_Latn", "io_Latn_001" },
180         { "jbo", "jbo_Latn_001" },
181         { "jbo_Latn", "jbo_Latn_001" },
182         { "ku_Arab", "ku_Arab_IQ" },
183         { "lrc", "lrc_Arab_IR" },
184         { "lrc_Arab", "lrc_Arab_IR" },
185         { "man", "man_Latn_GM" },
186         { "man_Latn", "man_Latn_GM" },
187         { "mas", "mas_Latn_KE" },
188         { "mas_Latn", "mas_Latn_KE" },
189         { "mn", "mn_Cyrl_MN" },
190         { "mn_Cyrl", "mn_Cyrl_MN" },
191         { "mro", "mro_Mroo_BD" },
192         { "mro_BD", "mro_Mroo_BD" },
193         { "ms_Arab", "ms_Arab_MY" },
194         { "pap", "pap_Latn_AW" },
195         { "pap_Latn", "pap_Latn_AW" },
196         { "prg", "prg_Latn_001" },
197         { "prg_Latn", "prg_Latn_001" },
198         { "rif", "rif_Tfng_MA" },
199         { "rif_Latn", "rif_Latn_MA" },
200         { "rif_Tfng", "rif_Tfng_MA" },
201         { "rif_MA", "rif_Tfng_MA" },
202         { "shi", "shi_Tfng_MA" },
203         { "shi_Tfng", "shi_Tfng_MA" },
204         { "shi_MA", "shi_Tfng_MA" },
205         { "sr_Latn", "sr_Latn_RS" },
206         { "ss", "ss_Latn_ZA" },
207         { "ss_Latn", "ss_Latn_ZA" },
208         { "swc", "swc_Latn_CD" },
209         { "ti", "ti_Ethi_ET" },
210         { "ti_Ethi", "ti_Ethi_ET" },
211         { "und", "en_Latn_US" },
212         { "und_Adlm", "ff_Adlm_GN" },
213         { "und_Adlm_GN", "ff_Adlm_GN" },
214         { "und_Arab", "ar_Arab_EG" },
215         { "und_Arab_PK", "ur_Arab_PK" },
216         { "und_Bopo", "zh_Bopo_TW" },
217         { "und_Deva_FJ", "hif_Deva_FJ" },
218         { "und_EZ", "de_Latn_EZ" },
219         { "und_Hani", "zh_Hani_CN" },
220         { "und_Hani_CN", "zh_Hani_CN" },
221         { "und_Kana", "ja_Kana_JP" },
222         { "und_Kana_JP", "ja_Kana_JP" },
223         { "und_Latn", "en_Latn_US" },
224         { "und_Latn_ET", "en_Latn_ET" },
225         { "und_Latn_NE", "ha_Latn_NE" },
226         { "und_Latn_PH", "fil_Latn_PH" },
227         { "und_ML", "bm_Latn_ML" },
228         { "und_Latn_ML", "bm_Latn_ML" },
229         { "und_MU", "mfe_Latn_MU" },
230         { "und_NE", "ha_Latn_NE" },
231         { "und_PH", "fil_Latn_PH" },
232         { "und_PK", "ur_Arab_PK" },
233         { "und_SO", "so_Latn_SO" },
234         { "und_SS", "en_Latn_SS" },
235         { "und_TK", "tkl_Latn_TK" },
236         { "und_UN", "en_Latn_UN" },
237         { "vo", "vo_Latn_001" },
238         { "vo_Latn", "vo_Latn_001" },
239         { "yi", "yi_Hebr_001" },
240         { "yi_Hebr", "yi_Hebr_001" },
241         { "yue", "yue_Hant_HK" },
242         { "yue_Hant", "yue_Hant_HK" },
243         { "yue_Hans", "yue_Hans_CN" },
244         { "yue_CN", "yue_Hans_CN" },
245         { "zh_Hani", "zh_Hani_CN" },
246 
247         { "zh_Bopo", "zh_Bopo_TW" },
248         { "ccp", "ccp_Cakm_BD" },
249         { "ccp_Cakm", "ccp_Cakm_BD" },
250         { "und_Cakm", "ccp_Cakm_BD" },
251         { "cu_Glag", "cu_Glag_BG" },
252         { "sd_Khoj", "sd_Khoj_IN" },
253         { "lif_Limb", "lif_Limb_IN" },
254         { "grc_Linb", "grc_Linb_GR" },
255         { "arc_Nbat", "arc_Nbat_JO" },
256         { "arc_Palm", "arc_Palm_SY" },
257         { "pal_Phlp", "pal_Phlp_CN" },
258         { "en_Shaw", "en_Shaw_GB" },
259         { "sd_Sind", "sd_Sind_IN" },
260         { "und_Brai", "fr_Brai_FR" }, // hack
261         { "und_Hanb", "zh_Hanb_TW" }, // Special script code
262         { "zh_Hanb", "zh_Hanb_TW" }, // Special script code
263         { "und_Jamo", "ko_Jamo_KR" }, // Special script code
264 
265         //{"und_Cyrl_PL", "be_Cyrl_PL"},
266 
267 //        {"cr", "cr_Cans_CA"},
268 //        {"hif", "hif_Latn_FJ"},
269 //        {"gon", "gon_Telu_IN"},
270 //        {"lzz", "lzz_Latn_TR"},
271 //        {"lif", "lif_Deva_NP"},
272 //        {"unx", "unx_Beng_IN"},
273 //        {"unr", "unr_Beng_IN"},
274 //        {"ttt", "ttt_Latn_AZ"},
275 //        {"pnt", "pnt_Grek_GR"},
276 //        {"tly", "tly_Latn_AZ"},
277 //        {"tkr", "tkr_Latn_AZ"},
278 //        {"bsq", "bsq_Bass_LR"},
279 //        {"ccp", "ccp_Cakm_BD"},
280 //        {"blt", "blt_Tavt_VN"},
281         { "mis_Medf", "mis_Medf_NG" },
282     });
283 
284     /**
285      * The following supplements the suppress-script. It overrides info from exemplars and the locale info.
286      */
287     private static String[][] SpecialScripts = {
288         { "zh", "Hans" }, // Hans (not Hani)
289         { "yue", "Hant" }, // Hans (not Hani)
290         { "chk", "Latn" }, // Chuukese (Micronesia)
291         { "fil", "Latn" }, // Filipino (Philippines)"
292         { "ko", "Kore" }, // Korean (North Korea)
293         { "ko_KR", "Kore" }, // Korean (North Korea)
294         { "pap", "Latn" }, // Papiamento (Netherlands Antilles)
295         { "pau", "Latn" }, // Palauan (Palau)
296         { "su", "Latn" }, // Sundanese (Indonesia)
297         { "tet", "Latn" }, // Tetum (East Timor)
298         { "tk", "Latn" }, // Turkmen (Turkmenistan)
299         { "ty", "Latn" }, // Tahitian (French Polynesia)
300         { "ja", "Jpan" }, // Special script for japan
301         { "und", "Latn" }, // Ultimate fallback
302     };
303 
304     private static Map<String, String> localeToScriptCache = new TreeMap<String, String>();
305     static {
306         for (String language : standardCodes.getAvailableCodes("language")) {
307             Map<String, String> info = standardCodes.getLangData("language", language);
308             String script = info.get("Suppress-Script");
309             if (script != null) {
localeToScriptCache.put(language, script)310                 localeToScriptCache.put(language, script);
311             }
312         }
313         for (String[] pair : SpecialScripts) {
localeToScriptCache.put(pair[0], pair[1])314             localeToScriptCache.put(pair[0], pair[1]);
315         }
316     }
317 
318     private static Map<String, String> FALLBACK_SCRIPTS;
319     static {
320         LanguageTagParser additionLtp = new LanguageTagParser();
321         Map<String, String> _FALLBACK_SCRIPTS = new TreeMap<>();
322         for (String addition : MAX_ADDITIONS) {
323             additionLtp.set(addition);
324             String lan = additionLtp.getLanguage();
_FALLBACK_SCRIPTS.put(lan, additionLtp.getScript())325             _FALLBACK_SCRIPTS.put(lan, additionLtp.getScript());
326         }
327         FALLBACK_SCRIPTS = ImmutableMap.copyOf(_FALLBACK_SCRIPTS);
328     }
329 
330     private static int errorCount;
331 
main(String[] args)332     public static void main(String[] args) throws IOException {
333 
334         printDefaultLanguagesAndScripts();
335 
336         Map<String, String> toMaximized = new TreeMap<String, String>();
337 
338         tryDifferentAlgorithm(toMaximized);
339 
340         minimize(toMaximized);
341 
342         // HACK TEMP_UNKNOWN_REGION
343         // this is to get around the removal of items with ZZ in minimize.
344         // probably cleaner way to do it, but this provides control over just those we want to retain.
345         Set<String> toRemove = new TreeSet<>();
346         Map<String, String> toFix = new TreeMap<>();
347         for (Entry<String, String> entry : toMaximized.entrySet()) {
348             String key = entry.getKey();
349             String value = entry.getValue();
350             if (key.contains(TEMP_UNKNOWN_REGION)) {
351                 toRemove.add(key);
352             } else if (value.contains(TEMP_UNKNOWN_REGION)) {
353                 toFix.put(key, value.replace(TEMP_UNKNOWN_REGION, UNKNOWN_REGION));
354             }
355         }
356         for (String key : toRemove) {
357             toMaximized.remove(key);
358         }
359         toMaximized.putAll(toFix);
360 
361         Map<String, String> oldLikely = SupplementalDataInfo.getInstance().getLikelySubtags();
362         Set<String> changes = compareMapsAndFixNew("*WARNING* Likely Subtags: ", oldLikely, toMaximized, "ms_Arab",
363             "ms_Arab_ID");
364         System.out.println(CollectionUtilities.join(changes, "\n"));
365 
366         if (OUTPUT_STYLE == OutputStyle.C_ALT) {
367             doAlt(toMaximized);
368         }
369 
370         if (SHOW_ADD)
371             System.out
372                 .println("/*"
373                     + CldrUtility.LINE_SEPARATOR
374                     + " To Maximize:"
375                     +
376                     CldrUtility.LINE_SEPARATOR
377                     + " If using raw strings, make sure the input language/locale uses the right separator, and has the right casing."
378                     +
379                     CldrUtility.LINE_SEPARATOR
380                     + " Remove the script Zzzz and the region ZZ if they occur; change an empty language subtag to 'und'."
381                     +
382                     CldrUtility.LINE_SEPARATOR
383                     + " Get the language, region, and script from the cleaned-up tag, plus any variants/extensions"
384                     +
385                     CldrUtility.LINE_SEPARATOR
386                     + " Try each of the following in order (where the field exists)"
387                     +
388                     CldrUtility.LINE_SEPARATOR
389                     + "   Lookup language-script-region. If in the table, return the result + variants"
390                     +
391                     CldrUtility.LINE_SEPARATOR
392                     + "   Lookup language-script. If in the table, return the result (substituting the original region if it exists) + variants"
393                     +
394                     CldrUtility.LINE_SEPARATOR
395                     + "   Lookup language-region. If in the table, return the result (substituting the original script if it exists) + variants"
396                     +
397                     CldrUtility.LINE_SEPARATOR
398                     + "   Lookup language. If in the table, return the result (substituting the original region and script if either or both exist) + variants"
399                     +
400                     CldrUtility.LINE_SEPARATOR
401                     +
402                     CldrUtility.LINE_SEPARATOR
403                     + " Example: Input is zh-ZZZZ-SG."
404                     +
405                     CldrUtility.LINE_SEPARATOR
406                     + " Normalize to zh-SG. Lookup in table. No match."
407                     +
408                     CldrUtility.LINE_SEPARATOR
409                     + " Remove SG, but remember it. Lookup zh, and get the match (zh-Hans-CN). Substitute SG, and return zh-Hans-SG."
410                     +
411                     CldrUtility.LINE_SEPARATOR
412                     +
413                     CldrUtility.LINE_SEPARATOR
414                     + " To Minimize:"
415                     +
416                     CldrUtility.LINE_SEPARATOR
417                     + " First get max = maximize(input)."
418                     +
419                     CldrUtility.LINE_SEPARATOR
420                     + " Then for trial in {language, language-region, language-script}"
421                     +
422                     CldrUtility.LINE_SEPARATOR
423                     + "     If maximize(trial) == max, then return trial."
424                     +
425                     CldrUtility.LINE_SEPARATOR
426                     + " If you don't get a match, return max."
427                     +
428                     CldrUtility.LINE_SEPARATOR
429                     +
430                     CldrUtility.LINE_SEPARATOR
431                     + " Example: Input is zh-Hant. Maximize to get zh-Hant-TW."
432                     +
433                     CldrUtility.LINE_SEPARATOR
434                     + " zh => zh-Hans-CN. No match, so continue."
435                     +
436                     CldrUtility.LINE_SEPARATOR
437                     + " zh-TW => zh-Hans-TW. Match, so return zh-TW."
438                     +
439                     CldrUtility.LINE_SEPARATOR
440                     +
441                     CldrUtility.LINE_SEPARATOR
442                     + " (A variant of this uses {language, language-script, language-region}): that is, tries script before language."
443                     +
444                     CldrUtility.LINE_SEPARATOR + " toMaximal size:\t" + toMaximized.size() +
445                     CldrUtility.LINE_SEPARATOR + "*/");
446 
447         printLikelySubtags(toMaximized);
448 
449         // if (OUTPUT_STYLE != OutputStyle.XML) {
450         // printMap("const MapToMinimalSubtags default_subtags[]", toMinimized, null);
451         // }
452 
453         printDefaultContent(toMaximized);
454 
455         System.out.println(CldrUtility.LINE_SEPARATOR + "ERRORS:\t" + errorCount + CldrUtility.LINE_SEPARATOR);
456 
457     }
458 
459     static class RowData implements Comparable<RowData> {
460         OfficialStatus os;
461         String name;
462         Long pop;
463 
RowData(OfficialStatus os, String name, Long pop)464         public RowData(OfficialStatus os, String name, Long pop) {
465             this.os = os;
466             this.name = name;
467             this.pop = pop;
468         }
469 
getStatus()470         public OfficialStatus getStatus() {
471             // TODO Auto-generated method stub
472             return os;
473         }
474 
getName()475         public CharSequence getName() {
476             // TODO Auto-generated method stub
477             return name;
478         }
479 
getLiteratePopulation()480         public Long getLiteratePopulation() {
481             // TODO Auto-generated method stub
482             return pop;
483         }
484 
compareTo(RowData o)485         public int compareTo(RowData o) {
486             // TODO Auto-generated method stub
487             int result = os.compareTo(o.os);
488             if (result != 0) return -result;
489             long result2 = pop - o.pop;
490             if (result2 != 0) return result2 < 0 ? 1 : -1;
491             return name.compareTo(o.name);
492         }
493 
equals(Object o)494         public boolean equals(Object o) {
495             return 0 == compareTo((RowData) o);
496         }
497 
hashCode()498         public int hashCode() {
499             throw new UnsupportedOperationException();
500         }
501     }
502 
printDefaultLanguagesAndScripts()503     private static void printDefaultLanguagesAndScripts() {
504 
505         final int minTotalPopulation = 10000000;
506         final int minTerritoryPopulation = 1000000;
507         final double minTerritoryPercent = 1.0 / 3;
508         Map<String, Set<RowData>> languageToReason = new TreeMap<String, Set<RowData>>();
509         Counter<String> languageToLiteratePopulation = new Counter<String>();
510         NumberFormat nf = NumberFormat.getIntegerInstance(ULocale.ENGLISH);
511         nf.setGroupingUsed(true);
512         LanguageTagParser ltp = new LanguageTagParser();
513         LikelySubtags likelySubtags = new LikelySubtags();
514         /*
515          * A. X is a qualified language**, and at least one of the following is true:
516          *
517          * 1. X is has official status* in any country
518          * 2. X exceeds a threshold population† of literate users worldwide: 1M
519          * 3. X exceeds a threshold population† in some country Z: 100K and 20% of Z's population†.
520          *
521          * B. X is an exception explicitly approved by the committee or X has minimal
522          * language coverage‡ in CLDR itself.
523          */
524         OfficialStatus minimalStatus = OfficialStatus.official_regional; // OfficialStatus.de_facto_official;
525         Map<String, String> languages = new TreeMap<String, String>();
526         for (String language : standardCodes.getAvailableCodes("language")) {
527             String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, language);
528             String result = english.getStringValue(path);
529             if (result != null) {
530                 languages.put(language, result);
531             }
532         }
533         for (String language : languages.keySet()) {
534             System.out.println(language + "\t" + languages.get(language));
535         }
536 
537         for (String territory : supplementalData.getTerritoriesWithPopulationData()) {
538             PopulationData territoryPop = supplementalData.getPopulationDataForTerritory(territory);
539             double territoryPopulation = territoryPop.getLiteratePopulation();
540             for (String languageScript : supplementalData.getLanguagesForTerritoryWithPopulationData(territory)) {
541                 PopulationData popData = supplementalData.getLanguageAndTerritoryPopulationData(languageScript,
542                     territory);
543                 ltp.set(languageScript);
544                 String language = ltp.getLanguage();
545 //                if (ltp.getScript().isEmpty()) {
546 //                    String max = likelySubtags.maximize(languageScript);
547 //                    if (max != null) {
548 //                        ltp.set(max).setRegion("");
549 //                        languageScript = ltp.toString();
550 //                    }
551 //                }
552                 boolean add = false;
553                 // #1
554                 OfficialStatus status = popData.getOfficialStatus();
555                 if (status.compareTo(minimalStatus) >= 0) {
556                     add = true;
557                 }
558                 long literatePopulation = getWritingPopulation(popData);
559                 // #2
560                 languageToLiteratePopulation.add(language, literatePopulation);
561                 // #3
562                 if (literatePopulation > minTerritoryPopulation
563                     && literatePopulation > minTerritoryPercent * territoryPopulation) {
564                     add = true;
565                 }
566                 if (add) {
567                     add(languageToReason, language, territory, status, literatePopulation);
568                     // Add the containing regions
569                     for (String container : Containment.leafToContainer(territory)) {
570                         add(languageToReason, language, container, OfficialStatus.unknown, literatePopulation);
571                     }
572                 }
573             }
574         }
575         // #2, now that we have the data
576         for (String language : languageToLiteratePopulation.keySet()) {
577             long totalPop = languageToLiteratePopulation.getCount(language);
578             if (totalPop > minTotalPopulation) {
579                 add(languageToReason, language, "001", OfficialStatus.unknown, totalPop);
580             }
581         }
582 
583         // Specials
584         add(languageToReason, "und", "001", OfficialStatus.unknown, 0);
585 
586         // for (String language : Iso639Data.getAvailable()) {
587         // Scope scope = Iso639Data.getScope(language);
588         // Type type = Iso639Data.getType(language);
589         // if (scope == Scope.Special) {
590         // add(languageToReason, language, "001", OfficialStatus.unknown, -1);
591         // }
592         // }
593         // print them
594 
595         System.out.println("Detailed - Including:\t" + languageToReason.size());
596 
597         for (String language : languageToReason.keySet()) {
598             Set<RowData> reasons = languageToReason.get(language);
599 
600             RowData lastReason = reasons.iterator().next();
601 
602             System.out.append(language)
603                 .append("\t")
604                 .append(english.getName(language))
605                 .append("\t")
606                 .append(lastReason.getStatus().toShortString())
607                 .append("\t")
608                 .append(nf.format(languageToLiteratePopulation.getCount(language)));
609             for (RowData reason : reasons) {
610                 String status = reason.getStatus().toShortString();
611                 System.out.append("\t")
612                     .append(status)
613                     .append("-")
614                     .append(reason.getName())
615                     .append("-")
616                     .append(nf.format(reason.getLiteratePopulation()));
617             }
618             System.out.append("\n");
619         }
620 
621         // now list them
622 
623         Set<String> others = new TreeSet<String>();
624         others.addAll(standardCodes.getGoodAvailableCodes("language"));
625         others.removeAll(languageToReason.keySet());
626         System.out.println("\nIncluded Languages:\t" + languageToReason.keySet().size());
627         showLanguages(languageToReason.keySet(), languageToReason);
628         System.out.println("\nExcluded Languages:\t" + others.size());
629         showLanguages(others, languageToReason);
630     }
631 
getWritingPopulation(PopulationData popData)632     private static long getWritingPopulation(PopulationData popData) {
633         final double writingPopulation = popData.getWritingPopulation();
634         if (!Double.isNaN(writingPopulation)) {
635             return (long) writingPopulation;
636         }
637         return (long) popData.getLiteratePopulation();
638     }
639 
showLanguages(Set<String> others, Map<String, Set<RowData>> languageToReason)640     private static void showLanguages(Set<String> others, Map<String, Set<RowData>> languageToReason) {
641         Set<String> sorted = new TreeSet<String>(Collator.getInstance(ULocale.ENGLISH));
642         for (String language : others) {
643             sorted.add(getLanguageName(language, languageToReason));
644         }
645         char last = 0;
646         for (String language : sorted) {
647             final char curr = language.charAt(0);
648             if (last != curr) {
649                 System.out.println();
650             } else if (last != '\u0000') {
651                 System.out.print(", ");
652             }
653             System.out.print(language);
654             last = curr;
655         }
656         System.out.println();
657     }
658 
getLanguageName(String language, Map<String, Set<RowData>> languageToReason)659     private static String getLanguageName(String language,
660         Map<String, Set<RowData>> languageToReason) {
661         OfficialStatus best = OfficialStatus.unknown;
662         Set<RowData> reasons = languageToReason.get(language);
663         if (reasons != null) {
664             for (RowData reason : reasons) {
665                 final OfficialStatus currentStatus = reason.getStatus();
666                 if (best.compareTo(currentStatus) < 0) {
667                     best = currentStatus;
668                 }
669             }
670         }
671         String status = best.toShortString();
672         Scope scope = Iso639Data.getScope(language);
673         if (scope == Scope.Special) {
674             status = "S";
675         }
676         String languageFormatted = english.getName(language) + " [" + language + "]-" + status;
677         return languageFormatted;
678     }
679 
add(Map<String, Set<RowData>> languageToReason, String language, String territoryRaw, OfficialStatus status, long population)680     private static void add(Map<String, Set<RowData>> languageToReason, String language,
681         String territoryRaw, OfficialStatus status, long population) {
682         String territory = english.getName("territory", territoryRaw) + " [" + territoryRaw + "]";
683         Set<RowData> set = languageToReason.get(language);
684         if (set == null) {
685             languageToReason.put(language, set = new TreeSet<RowData>());
686         }
687         set.add(new RowData(status, territory, population));
688     }
689 
printDefaultContent(Map<String, String> toMaximized)690     private static void printDefaultContent(Map<String, String> toMaximized) throws IOException {
691 
692         Set<String> defaultLocaleContent = new TreeSet<String>();
693 
694         // go through all the cldr locales, and add default contents
695         // now computed from toMaximized
696         Set<String> available = factory.getAvailable();
697         Relation<String, String> toChildren = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
698         LanguageTagParser ltp = new LanguageTagParser();
699 
700         // System.out.println(maximize("az_Latn_AZ", toMaximized));
701         Set<String> hasScript = new TreeSet<String>();
702 
703         // first get a mapping to children
704         for (String locale : available) {
705             if (locale.equals("root")) {
706                 continue;
707             }
708             if (ltp.set(locale).getVariants().size() != 0) {
709                 continue;
710             }
711             String parent = LocaleIDParser.getSimpleParent(locale);
712             if (ltp.getScript().length() != 0) {
713                 hasScript.add(parent);
714             }
715             if (parent.equals("root")) {
716                 continue;
717             }
718             toChildren.put(parent, locale);
719         }
720 
721         // Suppress script for locales for which we only have one locale in common/main. See ticket #7834.
722         Set<String> suppressScriptLocales = new HashSet<String>(Arrays.asList(
723             "bm_ML", "en_US", "ha_NG", "iu_CA", "ms_MY", "mn_MN",
724             "byn_ER", "ff_SN", "dyo_SN", "kk_KZ", "ku_TR", "ky_KG", "ml_IN", "so_SO", "sw_TZ", "wo_SN", "yo_NG", "dje_NE",
725             "blt_VN"));
726 
727         // if any have a script, then throw out any that don't have a script (unless they're specifically included.)
728         Set<String> toRemove = new TreeSet<String>();
729         for (String locale : hasScript) {
730             toRemove.clear();
731             Set<String> children = toChildren.getAll(locale);
732             for (String child : children) {
733                 if (ltp.set(child).getScript().length() == 0 && !suppressScriptLocales.contains(child)) {
734                     toRemove.add(child);
735                 }
736             }
737             if (toRemove.size() != 0) {
738                 System.out.println("Removing:\t" + locale + "\t" + toRemove + "\tfrom\t" + children);
739                 toChildren.removeAll(locale, toRemove);
740             }
741         }
742 
743         // we add a child as a default locale if it has the same maximization
744         main: for (String locale : toChildren.keySet()) {
745             String maximized = maximize(locale, toMaximized);
746             if (maximized == null) {
747                 if (SHOW_ADD) System.out.println("Missing maximized:\t" + locale);
748                 continue;
749             }
750             Set<String> children = toChildren.getAll(locale);
751             Map<String, String> debugStuff = new TreeMap<String, String>();
752             for (String child : children) {
753                 String maximizedChild = maximize(child, toMaximized);
754                 if (maximized.equals(maximizedChild)) {
755                     defaultLocaleContent.add(child);
756                     continue main;
757                 }
758                 debugStuff.put(child, maximizedChild);
759             }
760             if (SHOW_ADD) System.out.println("Can't find maximized: " + locale + "=" + maximized
761                 + "\tin\t" + debugStuff);
762         }
763 
764         defaultLocaleContent.remove("und_ZZ"); // und_ZZ isn't ever a real locale.
765 
766         showDefaultContentDifferencesAndFix(defaultLocaleContent);
767 
768         Log.setLogNoBOM(CLDRPaths.GEN_DIRECTORY + "/supplemental", "supplementalMetadata.xml");
769         BufferedReader oldFile = FileUtilities.openUTF8Reader(CLDRPaths.SUPPLEMENTAL_DIRECTORY, "supplementalMetadata.xml");
770         CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*<defaultContent locales=\"\\s*"), Log.getLog(), false);
771 
772         String sep = CldrUtility.LINE_SEPARATOR + "\t\t\t";
773         String broken = CldrUtility.breakLines(CldrUtility.join(defaultLocaleContent, " "), sep,
774             PatternCache.get("(\\S)\\S*").matcher(""), 80);
775 
776         Log.println("\t\t<defaultContent locales=\"" + broken + "\"");
777         Log.println("\t\t/>");
778 
779         // Log.println("</supplementalData>");
780         CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*/>\\s*(<!--.*)?"), null, true); // skip to matching >
781         CldrUtility.copyUpTo(oldFile, null, Log.getLog(), true); // copy the rest
782 
783         Log.close();
784         oldFile.close();
785     }
786 
787     // private static void oldAlgorithm(Map<String,String> toMaximized) {
788     // Set<String> defaultContentLocales = supplementalData.getDefaultContentLocales();
789     // LanguageTagParser parser = new LanguageTagParser();
790     // for (String locale : defaultContentLocales) {
791     // String parent = parser.getParent(locale);
792     // toMaximized.put(parent, locale);
793     // if (SHOW_ADD) System.out.println("Adding:\t" + parent + "\t=>\t" + locale + "\t\tDefaultContent");
794     // }
795     //
796     // for (String[] specialCase : SpecialCases) {
797     // toMaximized.put(specialCase[0], specialCase[1]);
798     // if (SHOW_ADD) System.out.println("Adding:\t" + specialCase[0] + "\t=>\t" + specialCase[1] + "\t\tSpecial");
799     // }
800     //
801     // // recurse and close
802     // closeMapping(toMaximized);
803     //
804     // addScript(toMaximized, parser);
805     //
806     // closeMapping(toMaximized);
807     //
808     // addLanguageScript(toMaximized, parser);
809     //
810     // closeMapping(toMaximized);
811     //
812     // addLanguageCountry(toMaximized, parser);
813     //
814     // closeMapping(toMaximized);
815     //
816     // addCountries(toMaximized);
817     // addScript(toMaximized, parser);
818     // closeMapping(toMaximized);
819     // closeUnd(toMaximized);
820     //
821     // addDeprecated(toMaximized);
822     //
823     // closeMapping(toMaximized);
824     //
825     // checkConsistency(toMaximized);
826     // }
827 
828     private static class MaxData {
829         Relation<String, Row.R3<Double, String, String>> languages = Relation.of(new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class);
830         Map<String, Counter<String>> languagesToScripts = new TreeMap<String, Counter<String>>();
831         Map<String, Counter<String>> languagesToRegions = new TreeMap<String, Counter<String>>();
832 
833         Relation<String, Row.R3<Double, String, String>> scripts = Relation.of(new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class);
834         Map<String, Counter<String>> scriptsToLanguages = new TreeMap<String, Counter<String>>();
835         Map<String, Counter<String>> scriptsToRegions = new TreeMap<String, Counter<String>>();
836 
837         Relation<String, Row.R3<Double, String, String>> regions = Relation.of(new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class);
838         Map<String, Counter<String>> regionsToLanguages = new TreeMap<String, Counter<String>>();
839         Map<String, Counter<String>> regionsToScripts = new TreeMap<String, Counter<String>>();
840 
841         Map<String, Counter<Row.R2<String, String>>> containersToLanguage = new TreeMap<String, Counter<Row.R2<String, String>>>();
842         Relation<String, Row.R4<Double, String, String, String>> containersToLangRegion = Relation.of(
843             new TreeMap<String, Set<Row.R4<Double, String, String, String>>>(), TreeSet.class);
844 
845         Relation<Row.R2<String, String>, Row.R2<Double, String>> languageScripts = Relation.of(
846             new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(),
847             TreeSet.class);
848         Relation<Row.R2<String, String>, Row.R2<Double, String>> scriptRegions = Relation.of(
849             new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(),
850             TreeSet.class);
851         Relation<Row.R2<String, String>, Row.R2<Double, String>> languageRegions = Relation.of(
852             new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(),
853             TreeSet.class);
854 
855         /**
856          * Add population information. "order" is the negative of the population (makes the first be the highest).
857          * @param language
858          * @param script
859          * @param region
860          * @param order
861          */
add(String language, String script, String region, Double order)862         void add(String language, String script, String region, Double order) {
863             if (language.equals("cpp")) {
864                 System.out.println(language + "\t" + script + "\t" + region + "\t" + -order);
865             }
866             languages.put(language, Row.of(order, script, region));
867             // addCounter(languagesToScripts, language, script, order);
868             // addCounter(languagesToRegions, language, region, order);
869 
870             scripts.put(script, Row.of(order, language, region));
871             // addCounter(scriptsToLanguages, script, language, order);
872             // addCounter(scriptsToRegions, script, region, order);
873 
874             regions.put(region, Row.of(order, language, script));
875             // addCounter(regionsToLanguages, region, language, order);
876             // addCounter(regionsToScripts, region, script, order);
877 
878             languageScripts.put(Row.of(language, script), Row.of(order, region));
879             scriptRegions.put(Row.of(script, region), Row.of(order, language));
880             languageRegions.put(Row.of(language, region), Row.of(order, script));
881 
882             Set<String> containerSet = Containment.leafToContainer(region);
883             if (containerSet != null) {
884                 for (String container : containerSet) {
885 
886                     containersToLangRegion.put(container, Row.of(order, language, script, region));
887                     Counter<R2<String, String>> data = containersToLanguage.get(container);
888                     if (data == null) {
889                         containersToLanguage.put(container, data = new Counter<R2<String, String>>());
890                     }
891                     data.add(Row.of(language, script), (long) (double) order);
892 
893                 }
894             }
895 
896             if (SHOW_ADD) System.out.println("Data:\t" + language + "\t" + script + "\t" + region + "\t" + order);
897         }
898         // private void addCounter(Map<String, Counter<String>> map, String key, String key2, Double count) {
899         // Counter<String> counter = map.get(key);
900         // if (counter == null) {
901         // map.put(key, counter = new Counter<String>());
902         // }
903         // counter.add(key2, count.longValue());
904         // }
905     }
906 
    // Thresholds used by tryDifferentAlgorithm for languages whose official status is unknown.
    // Absolute floor for the "minimal literate population" bar of a territory.
    private static final double MIN_UNOFFICIAL_LANGUAGE_SIZE = 10000000;
    // Fraction of a territory's literate population used to compute that bar before the floor is applied.
    private static final double MIN_UNOFFICIAL_LANGUAGE_PROPORTION = 0.20;
    // Lower size bar that is compared when the language_region locale is among the available CLDR locales.
    private static final double MIN_UNOFFICIAL_CLDR_LANGUAGE_SIZE = 100000;
    // Factor multiplied into the order value of a language whose official status is unknown.
    private static final double UNOFFICIAL_SCALE_DOWN = 0.2;

    // Formatters for the diagnostic output printed by tryDifferentAlgorithm when SHOW_ADD is set.
    private static NumberFormat percent = NumberFormat.getPercentInstance();
    private static NumberFormat number = NumberFormat.getIntegerInstance();
914 
tryDifferentAlgorithm(Map<String, String> toMaximized)915     private static void tryDifferentAlgorithm(Map<String, String> toMaximized) {
916         // we are going to try a different approach.
917         // first gather counts for maximized values
918         // Set<Row.R3<String,String,String>,Double> rowsToCounts = new TreeMap();
919         MaxData maxData = new MaxData();
920         Set<String> cldrLocales = factory.getAvailable();
921         Set<String> otherTerritories = new TreeSet<String>(standardCodes.getGoodAvailableCodes("territory"));
922 
923         // process all the information to get the top values for each triple.
924         // each of the combinations of 1 or 2 components gets to be a key.
925         for (String region : supplementalData.getTerritoriesWithPopulationData()) {
926             otherTerritories.remove(region);
927             PopulationData regionData = supplementalData.getPopulationDataForTerritory(region);
928             final double literateTerritoryPopulation = regionData.getLiteratePopulation();
929             // we need any unofficial language to meet a certain absolute size requirement and proportion size
930             // requirement.
931             // so the bar is x percent of the population, reset up to y absolute size.
932             double minimalLiteratePopulation = literateTerritoryPopulation * MIN_UNOFFICIAL_LANGUAGE_PROPORTION;
933             if (minimalLiteratePopulation < MIN_UNOFFICIAL_LANGUAGE_SIZE) {
934                 minimalLiteratePopulation = MIN_UNOFFICIAL_LANGUAGE_SIZE;
935             }
936 
937             for (String writtenLanguage : supplementalData.getLanguagesForTerritoryWithPopulationData(region)) {
938                 PopulationData data = supplementalData.getLanguageAndTerritoryPopulationData(writtenLanguage, region);
939                 final double literatePopulation = getWritingPopulation(data); //data.getLiteratePopulation();
940                 double order = -literatePopulation; // negative so we get the inverse order
941 
942                 if (data.getOfficialStatus() == OfficialStatus.unknown) {
943                     final String locale = writtenLanguage + "_" + region;
944                     if (literatePopulation >= minimalLiteratePopulation) {
945                         // ok, skip
946                     } else if (literatePopulation >= MIN_UNOFFICIAL_CLDR_LANGUAGE_SIZE && cldrLocales.contains(locale)) {
947                         // ok, skip
948                     } else {
949                         // if (SHOW_ADD)
950                         // System.out.println("Skipping:\t" + writtenLanguage + "\t" + region + "\t"
951                         // + english.getName(locale)
952                         // + "\t-- too small:\t" + number.format(literatePopulation));
953                         // continue;
954                     }
955                     order *= UNOFFICIAL_SCALE_DOWN;
956                     if (SHOW_ADD)
957                         System.out.println("Retaining\t" + writtenLanguage + "\t" + region + "\t"
958                             + english.getName(locale)
959                             + "\t" + number.format(literatePopulation)
960                             + "\t" + percent.format(literatePopulation / literateTerritoryPopulation)
961                             + (cldrLocales.contains(locale) ? "\tin-CLDR" : ""));
962                 }
963                 String script;
964                 String language = writtenLanguage;
965                 final int pos = writtenLanguage.indexOf('_');
966                 if (pos > 0) {
967                     language = writtenLanguage.substring(0, pos);
968                     script = writtenLanguage.substring(pos + 1);
969                 } else {
970                     script = getScriptForLocale2(language);
971                 }
972                 maxData.add(language, script, region, order);
973             }
974         }
975 
976         LanguageTagParser additionLtp = new LanguageTagParser();
977 
978         for (String addition : MAX_ADDITIONS) {
979             additionLtp.set(addition);
980             String lan = additionLtp.getLanguage();
981             Set<R3<Double, String, String>> key = maxData.languages.get(lan);
982             if (key == null) {
983                 maxData.add(lan, additionLtp.getScript(), additionLtp.getRegion(), 1.0);
984             } else {
985                 int debug = 0;
986             }
987         }
988 
989         for (Entry<String, Collection<String>> entry : DeriveScripts.getLanguageToScript().asMap().entrySet()) {
990             String language = entry.getKey();
991             final Collection<String> values = entry.getValue();
992             if (values.size() != 1) {
993                 continue; // skip, no either way
994             }
995             Set<R3<Double, String, String>> old = maxData.languages.get(language);
996             if (!maxData.languages.containsKey(language)) {
997                 maxData.add(language, values.iterator().next(), TEMP_UNKNOWN_REGION, 1.0);
998             }
999         }
1000 
1001         // add others, with English default
1002         for (String region : otherTerritories) {
1003             if (region.length() == 3) continue; // FIX ONCE WE ADD REGIONS
1004             maxData.add("en", "Latn", region, 1.0);
1005         }
1006 
1007         // get a reverse mapping, so that we can add the aliases
1008 
1009         Map<String, R2<List<String>, String>> languageAliases = SupplementalDataInfo.getInstance().getLocaleAliasInfo()
1010             .get("language");
1011         for (Entry<String, R2<List<String>, String>> str : languageAliases.entrySet()) {
1012             String reason = str.getValue().get1();
1013             if ("overlong".equals(reason) || "bibliographic".equals(reason) || "macrolanguage".equals(reason)) {
1014                 continue;
1015             }
1016             List<String> replacements = str.getValue().get0();
1017             if (replacements == null) {
1018                 continue;
1019             }
1020             String goodLanguage = replacements.get(0);
1021 
1022             String badLanguage = str.getKey();
1023             if (badLanguage.contains("_")) {
1024                 continue;
1025             }
1026             if (deprecatedISONotInLST.contains(badLanguage)) {
1027                 continue;
1028             }
1029             Set<R3<Double, String, String>> goodLanguageData = maxData.languages.getAll(goodLanguage);
1030             if (goodLanguageData == null) {
1031                 continue;
1032             }
1033             R3<Double, String, String> value = goodLanguageData.iterator().next();
1034             final String script = value.get1();
1035             final String region = value.get2();
1036             maxData.add(badLanguage, script, region, 1.0);
1037             System.out.println("Adding aliases: " + badLanguage + ", " + script + ", " + region + ", " + reason);
1038         }
1039 
1040         // now, get the best for each one
1041         for (String language : maxData.languages.keySet()) {
1042             R3<Double, String, String> value = maxData.languages.getAll(language).iterator().next();
1043             final Comparable<String> script = value.get1();
1044             final Comparable<String> region = value.get2();
1045             add(language, language + "_" + script + "_" + region, toMaximized, "L->SR", Override.REPLACE_EXISTING,
1046                 SHOW_ADD);
1047         }
1048         for (String language : maxData.languagesToScripts.keySet()) {
1049             String script = maxData.languagesToScripts.get(language).getKeysetSortedByCount(true).iterator().next();
1050             add(language, language + "_" + script, toMaximized, "L->S", Override.REPLACE_EXISTING, SHOW_ADD);
1051         }
1052         for (String language : maxData.languagesToRegions.keySet()) {
1053             String region = maxData.languagesToRegions.get(language).getKeysetSortedByCount(true).iterator().next();
1054             add(language, language + "_" + region, toMaximized, "L->R", Override.REPLACE_EXISTING, SHOW_ADD);
1055         }
1056 
1057         for (String script : maxData.scripts.keySet()) {
1058             R3<Double, String, String> value = maxData.scripts.getAll(script).iterator().next();
1059             final Comparable<String> language = value.get1();
1060             final Comparable<String> region = value.get2();
1061             add("und_" + script, language + "_" + script + "_" + region, toMaximized, "S->LR",
1062                 Override.REPLACE_EXISTING, SHOW_ADD);
1063         }
1064         for (String script : maxData.scriptsToLanguages.keySet()) {
1065             String language = maxData.scriptsToLanguages.get(script).getKeysetSortedByCount(true).iterator().next();
1066             add("und_" + script, language + "_" + script, toMaximized, "S->L", Override.REPLACE_EXISTING, SHOW_ADD);
1067         }
1068         for (String script : maxData.scriptsToRegions.keySet()) {
1069             String region = maxData.scriptsToRegions.get(script).getKeysetSortedByCount(true).iterator().next();
1070             add("und_" + script, "und_" + script + "_" + region, toMaximized, "S->R", Override.REPLACE_EXISTING,
1071                 SHOW_ADD);
1072         }
1073 
1074         for (String region : maxData.regions.keySet()) {
1075             R3<Double, String, String> value = maxData.regions.getAll(region).iterator().next();
1076             final Comparable<String> language = value.get1();
1077             final Comparable<String> script = value.get2();
1078             add("und_" + region, language + "_" + script + "_" + region, toMaximized, "R->LS",
1079                 Override.REPLACE_EXISTING, SHOW_ADD);
1080         }
1081         for (String region : maxData.regionsToLanguages.keySet()) {
1082             String language = maxData.regionsToLanguages.get(region).getKeysetSortedByCount(true).iterator().next();
1083             add("und_" + region, language + "_" + region, toMaximized, "R->L", Override.REPLACE_EXISTING, SHOW_ADD);
1084         }
1085         for (String region : maxData.regionsToScripts.keySet()) {
1086             String script = maxData.regionsToScripts.get(region).getKeysetSortedByCount(true).iterator().next();
1087             add("und_" + region, "und_" + script + "_" + region, toMaximized, "R->S", Override.REPLACE_EXISTING,
1088                 SHOW_ADD);
1089         }
1090 
1091         for (Entry<String, Counter<R2<String, String>>> containerAndInfo : maxData.containersToLanguage.entrySet()) {
1092             String region = containerAndInfo.getKey();
1093             if (region.equals("001")) {
1094                 continue;
1095             }
1096             Counter<R2<String, String>> data = containerAndInfo.getValue();
1097             Set<R2<String, String>> keysetSortedByCount = data.getKeysetSortedByCount(true);
1098             if (SHOW_CONTAINERS) { // debug
1099                 System.out.println("Container2L:\t" + region + "\t" + shorten(data.getEntrySetSortedByCount(true, null)));
1100                 System.out.println("Container2LR:\t" + region + "\t" + maxData.containersToLangRegion.get(region));
1101             }
1102             R2<String, String> value = keysetSortedByCount.iterator().next(); // will get most negative
1103             final Comparable<String> language = value.get0();
1104             final Comparable<String> script = value.get1();
1105 
1106             // fix special cases like es-419, where a locale exists.
1107             // for those cases, what we add as output is the container. Otherwise the region.
1108             Set<String> skipLanguages = cldrContainerToLanguages.get(region);
1109             if (skipLanguages != null
1110                 && skipLanguages.contains(language)) {
1111                 add("und_" + region, language + "_" + script + "_" + region, toMaximized, "R*->LS",
1112                     Override.REPLACE_EXISTING, SHOW_ADD);
1113                 continue;
1114             }
1115 
1116             // we now have the best language and script. Find the best region for that
1117             for (R4<Double, String, String, String> e : maxData.containersToLangRegion.get(region)) {
1118                 final Comparable<String> language2 = e.get1();
1119                 final Comparable<String> script2 = e.get2();
1120                 if (language2.equals(language) && script2.equals(script)) {
1121                     add("und_" + region, language + "_" + script + "_" + e.get3(), toMaximized, "R*->LS",
1122                         Override.REPLACE_EXISTING, SHOW_ADD);
1123                     break;
1124                 }
1125             }
1126         }
1127 
1128         for (R2<String, String> languageScript : maxData.languageScripts.keySet()) {
1129             R2<Double, String> value = maxData.languageScripts.getAll(languageScript).iterator().next();
1130             final Comparable<String> language = languageScript.get0();
1131             final Comparable<String> script = languageScript.get1();
1132             final Comparable<String> region = value.get1();
1133             add(language + "_" + script, language + "_" + script + "_" + region, toMaximized, "LS->R",
1134                 Override.REPLACE_EXISTING, SHOW_ADD);
1135         }
1136 
1137         for (R2<String, String> scriptRegion : maxData.scriptRegions.keySet()) {
1138             R2<Double, String> value = maxData.scriptRegions.getAll(scriptRegion).iterator().next();
1139             final Comparable<String> script = scriptRegion.get0();
1140             final Comparable<String> region = scriptRegion.get1();
1141             final Comparable<String> language = value.get1();
1142             add("und_" + script + "_" + region, language + "_" + script + "_" + region, toMaximized, "SR->L",
1143                 Override.REPLACE_EXISTING, SHOW_ADD);
1144         }
1145 
1146         for (R2<String, String> languageRegion : maxData.languageRegions.keySet()) {
1147             R2<Double, String> value = maxData.languageRegions.getAll(languageRegion).iterator().next();
1148             final Comparable<String> language = languageRegion.get0();
1149             final Comparable<String> region = languageRegion.get1();
1150             final Comparable<String> script = value.get1();
1151             add(language + "_" + region, language + "_" + script + "_" + region, toMaximized, "LR->S",
1152                 Override.REPLACE_EXISTING, SHOW_ADD);
1153         }
1154 
1155         // get the script info from metadata as fallback
1156 
1157         TreeSet<String> sorted = new TreeSet<String>(ScriptMetadata.getScripts());
1158         for (String script : sorted) {
1159             Info i = ScriptMetadata.getInfo(script);
1160             String likelyLanguage = i.likelyLanguage;
1161             String originCountry = i.originCountry;
1162             final String result = likelyLanguage + "_" + script + "_" + originCountry;
1163             add("und_" + script, result, toMaximized, "S->LR•",
1164                 Override.KEEP_EXISTING, SHOW_ADD);
1165             add(likelyLanguage, result, toMaximized, "L->SR•",
1166                 Override.KEEP_EXISTING, SHOW_ADD);
1167         }
1168 
1169         // add overrides
1170         for (String key : LANGUAGE_OVERRIDES.keySet()) {
1171             add(key, LANGUAGE_OVERRIDES.get(key), toMaximized, "OVERRIDE", Override.REPLACE_EXISTING, true);
1172         }
1173     }
1174 
shorten(Object data)1175     public static String shorten(Object data) {
1176         String info = data.toString();
1177         if (info.length() > 255) {
1178             info = info.substring(0, 127) + "…";
1179         }
1180         return info;
1181     }
1182 
doAlt(Map<String, String> toMaximized)1183     private static void doAlt(Map<String, String> toMaximized) {
1184         // TODO Auto-generated method stub
1185         Map<String, String> temp = new TreeMap<String, String>();
1186         for (String locale : toMaximized.keySet()) {
1187             String target = toMaximized.get(locale);
1188             temp.put(toAlt(locale, true), toAlt(target, true));
1189         }
1190         toMaximized.clear();
1191         toMaximized.putAll(temp);
1192     }
1193 
maximize(String languageTag, Map<String, String> toMaximized)1194     public static String maximize(String languageTag, Map<String, String> toMaximized) {
1195         LanguageTagParser ltp = new LanguageTagParser();
1196 
1197         // clean up the input by removing Zzzz, ZZ, and changing "" into und.
1198         ltp.set(languageTag);
1199         String language = ltp.getLanguage();
1200         String region = ltp.getRegion();
1201         String script = ltp.getScript();
1202         boolean changed = false;
1203         if (language.equals("")) {
1204             ltp.setLanguage(language = "und");
1205             changed = true;
1206         }
1207         if (region.equals(UNKNOWN_SCRIPT)) {
1208             ltp.setScript(script = "");
1209             changed = true;
1210         }
1211         if (ltp.getRegion().equals(UNKNOWN_REGION)) {
1212             ltp.setRegion(region = "");
1213             changed = true;
1214         }
1215         if (changed) {
1216             languageTag = ltp.toString();
1217         }
1218         // check whole
1219         String result = toMaximized.get(languageTag);
1220         if (result != null) {
1221             return result;
1222         }
1223         // try empty region
1224         if (region.length() != 0) {
1225             result = toMaximized.get(ltp.setRegion("").toString());
1226             if (result != null) {
1227                 return ltp.set(result).setRegion(region).toString();
1228             }
1229             ltp.setRegion(region); // restore
1230         }
1231         // try empty script
1232         if (script.length() != 0) {
1233             result = toMaximized.get(ltp.setScript("").toString());
1234             if (result != null) {
1235                 return ltp.set(result).setScript(script).toString();
1236             }
1237             // try empty script and region
1238             if (region.length() != 0) {
1239                 result = toMaximized.get(ltp.setRegion("").toString());
1240                 if (result != null) {
1241                     return ltp.set(result).setScript(script).setRegion(region).toString();
1242                 }
1243             }
1244         }
1245         if (!language.equals("und") && script.length() != 0 && region.length() != 0) {
1246             return languageTag; // it was ok, and we couldn't do anything with it
1247         }
1248         return null; // couldn't maximize
1249     }
1250 
minimize(String input, Map<String, String> toMaximized, boolean favorRegion)1251     public static String minimize(String input, Map<String, String> toMaximized, boolean favorRegion) {
1252         if (input.equals("nb_Latn_SJ")) {
1253             System.out.print(""); // debug
1254         }
1255         String maximized = maximize(input, toMaximized);
1256         if (maximized == null) {
1257             return null; // failed
1258         }
1259         LanguageTagParser ltp = new LanguageTagParser().set(maximized);
1260         String language = ltp.getLanguage();
1261         String region = ltp.getRegion();
1262         String script = ltp.getScript();
1263         // try building up from shorter to longer, and find the first that matches
1264         // could be more optimized, but for this code we want simplest
1265         String[] trials = { language,
1266             language + TAG_SEPARATOR + (favorRegion ? region : script),
1267             language + TAG_SEPARATOR + (!favorRegion ? region : script) };
1268         for (String trial : trials) {
1269             String newMaximized = maximize(trial, toMaximized);
1270             if (maximized.equals(newMaximized)) {
1271                 return trial;
1272             }
1273         }
1274         return maximized;
1275     }
1276 
1277     // /**
1278     // * Verify that we can map from each language, script, and country to something.
1279     // * @param toMaximized
1280     // */
1281     // private static void checkConsistency(Map<String, String> toMaximized) {
1282     // Map<String,String> needMappings = new TreeMap();
1283     // LanguageTagParser parser = new LanguageTagParser();
1284     // for (String maximized : new TreeSet<String>(toMaximized.values())) {
1285     // parser.set(maximized);
1286     // final String language = parser.getLanguage();
1287     // final String script = parser.getScript();
1288     // final String region = parser.getRegion();
1289     // if (language.length() == 0 || script.length() == 0 || region.length() == 0) {
1290     // failure("   { \"" + maximized + "\", \"" + maximized + "\" },   //     " + english.getName(maximized) +
1291     // "\t\tFailed-Consistency");
1292     // continue;
1293     // }
1294     // addIfNotIn(language, maximized, needMappings, toMaximized, "Consistency");
1295     // addIfNotIn(language + "_" + script, maximized, needMappings, toMaximized, "Consistency");
1296     // addIfNotIn(language + "_" + region, maximized, needMappings, toMaximized, "Consistency");
1297     // addIfNotIn("und_" + script, maximized, needMappings, toMaximized, "Consistency");
1298     // addIfNotIn("und_" + script + "_" + region, maximized, needMappings, toMaximized, "Consistency");
1299     // addIfNotIn("und_" + region, maximized, needMappings, toMaximized, "Consistency");
1300     // }
1301     // toMaximized.putAll(needMappings);
1302     // }
1303 
1304     // private static void failure(String string) {
1305     // System.out.println(string);
1306     // errorCount++;
1307     // }
1308 
1309     // private static void addIfNotIn(String key, String value, Map<String, String> toAdd, Map<String, String>
1310     // otherToCheck, String kind) {
1311     // addIfNotIn(key, value, toAdd, otherToCheck == null ? null : otherToCheck.keySet(), null, kind);
1312     // }
1313 
1314     // private static void addIfNotIn(String key, String value, Map<String, String> toAdd, Set<String> skipKey,
1315     // Set<String> skipValue, String kind) {
1316     // if (!key.equals(value)
1317     // && !toAdd.containsKey(key)
1318     // && (skipKey == null || !skipKey.contains(key))
1319     // && (skipValue == null || !skipValue.contains(value))) {
1320     // add(key, value, toAdd, kind);
1321     // }
1322     // }
1323 
1324     enum Override {
1325         KEEP_EXISTING, REPLACE_EXISTING
1326     }
1327 
add(String key, String value, Map<String, String> toAdd, String kind, Override override, boolean showAction)1328     private static void add(String key, String value, Map<String, String> toAdd, String kind, Override override,
1329         boolean showAction) {
1330         if (key.equals(DEBUG_ADD_KEY)) {
1331             System.out.println("*debug*");
1332         }
1333         String oldValue = toAdd.get(key);
1334         if (oldValue == null) {
1335             if (showAction) {
1336                 System.out.println("Adding:\t\t" + getName(key) + "\t=>\t" + getName(value) + "\t\t\t\t" + kind);
1337             }
1338         } else if (override == Override.KEEP_EXISTING || value.equals(oldValue)) {
1339             // if (showAction) {
1340             // System.out.println("Skipping:\t" + key + "\t=>\t" + value + "\t\t\t\t" + kind);
1341             // }
1342             return;
1343         } else {
1344             if (showAction) {
1345                 System.out.println("Replacing:\t" + getName(key) + "\t=>\t" + getName(value) + "\t, was\t" + getName(oldValue) + "\t\t" + kind);
1346             }
1347         }
1348         toAdd.put(key, value);
1349     }
1350 
getName(String value)1351     private static String getName(String value) {
1352         return ConvertLanguageData.getLanguageCodeAndName(value);
1353     }
1354 
1355     // private static void addCountries(Map<String, String> toMaximized) {
1356     // Map <String, Map<String, Double>> scriptToLanguageToSize = new TreeMap();
1357     //
1358     // for (String territory : supplementalData.getTerritoriesWithPopulationData()) {
1359     // Set<String> languages = supplementalData.getLanguagesForTerritoryWithPopulationData(territory);
1360     // String biggestOfficial = null;
1361     // double biggest = -1;
1362     // for (String language : languages) {
1363     // PopulationData info = supplementalData.getLanguageAndTerritoryPopulationData(language, territory);
1364     // // add to info about script
1365     //
1366     // String script = getScriptForLocale(language);
1367     // if (script != null) {
1368     // Map<String, Double> languageInfo = scriptToLanguageToSize.get(script);
1369     // if (languageInfo == null) scriptToLanguageToSize.put(script, languageInfo = new TreeMap());
1370     // String baseLanguage = language;
1371     // int pos = baseLanguage.indexOf('_');
1372     // if (pos >= 0) {
1373     // baseLanguage = baseLanguage.substring(0,pos);
1374     // }
1375     // Double size = languageInfo.get(baseLanguage);
1376     // languageInfo.put(baseLanguage, (size == null ? 0 : size) + info.getLiteratePopulation());
1377     // }
1378     //
1379     //
1380     // final OfficialStatus officialStatus = info.getOfficialStatus();
1381     // if (officialStatus == OfficialStatus.de_facto_official || officialStatus == OfficialStatus.official) {
1382     // double size2 = info.getLiteratePopulation();
1383     // if (biggest < size2) {
1384     // biggest = size2;
1385     // biggestOfficial = language;
1386     // }
1387     // }
1388     // }
1389     // if (biggestOfficial != null) {
1390     // final String replacementTag = "und_" + territory;
1391     // String maximized = biggestOfficial + "_" + territory;
1392     // toMaximized.put(replacementTag, maximized);
1393     // if (SHOW_ADD) System.out.println("Adding:\t" + replacementTag + "\t=>\t" + maximized + "\t\tLanguage-Territory");
1394     // }
1395     // }
1396     //
1397     // for (String script : scriptToLanguageToSize.keySet()) {
1398     // String biggestOfficial = null;
1399     // double biggest = -1;
1400     //
1401     // final Map<String, Double> languageToSize = scriptToLanguageToSize.get(script);
1402     // for (String language : languageToSize.keySet()) {
1403     // double size = languageToSize.get(language);
1404     // if (biggest < size) {
1405     // biggest = size;
1406     // biggestOfficial = language;
1407     // }
1408     // }
1409     // if (biggestOfficial != null) {
1410     // final String replacementTag = "und_" + script;
1411     // String maximized = biggestOfficial + "_" + script;
1412     // toMaximized.put(replacementTag, maximized);
1413     // if (SHOW_ADD) System.out.println("Adding:\t" + replacementTag + "\t=>\t" + maximized + "\t\tUnd-Script");
1414     // }
1415     // }
1416     // }
1417 
1418     // private static void closeUnd(Map<String, String> toMaximized) {
1419     // Map<String,String> toAdd = new TreeMap<String,String>();
1420     // for (String oldSource : toMaximized.keySet()) {
1421     // String maximized = toMaximized.get(oldSource);
1422     // if (!maximized.startsWith("und")) {
1423     // int pos = maximized.indexOf("_");
1424     // if (pos >= 0) {
1425     // addIfNotIn( "und" + maximized.substring(pos), maximized, toAdd, toMaximized, "CloseUnd");
1426     // }
1427     // }
1428     // }
1429     // toMaximized.putAll(toAdd);
1430     // }
1431 
1432     /**
1433      * Generate tags where the deprecated values map to the expanded values
1434      *
1435      * @param toMaximized
1436      */
1437     // private static void addDeprecated(Map<String, String> toMaximized) {
1438     // Map<String, Map<String, List<String>>> typeToTagToReplacement = supplementalData.getLocaleAliasInfo();
1439     // LanguageTagParser temp = new LanguageTagParser();
1440     // LanguageTagParser tagParsed = new LanguageTagParser();
1441     // LanguageTagParser replacementParsed = new LanguageTagParser();
1442     // Map<String,String> toAdd = new TreeMap<String,String>();
1443     // while (true) {
1444     // toAdd.clear();
1445     // for (String type : typeToTagToReplacement.keySet()) {
1446     // if (type.equals("variant") || type.equals("zone")) continue;
1447     // boolean addUnd = !type.equals("language");
1448     //
1449     // Map<String, List<String>> tagToReplacement = typeToTagToReplacement.get(type);
1450     // System.out.println("*" + type + " = " + tagToReplacement);
1451     //
1452     // for (String tag: tagToReplacement.keySet()) {
1453     //
1454     // final List<String> list = tagToReplacement.get(tag);
1455     // if (list == null) continue; // we don't have any information
1456     // String replacement = list.get(0);
1457     //
1458     // // only do multiples
1459     // if (tag.contains("_") || !replacement.contains("_")) {
1460     // continue;
1461     // }
1462     //
1463     // // we now have a tag and a replacement value
1464     // // make parsers that we can use
1465     // try {
1466     // tagParsed.set(addUnd ? "und-" + tag : tag);
1467     // replacementParsed.set(addUnd ? "und-" + replacement : replacement);
1468     // } catch (RuntimeException e) {
1469     // continue;
1470     // }
1471     // addIfNotIn(tag, replacement, toAdd, toMaximized,"Deprecated");
1472     //
1473     // for (String locale : toMaximized.keySet()) {
1474     // String maximized = toMaximized.get(locale);
1475     // addIfMatches(temp.set(locale), maximized, replacementParsed, tagParsed, toAdd, toMaximized);
1476     // addIfMatches(temp.set(maximized), maximized, replacementParsed, tagParsed, toAdd, toMaximized);
1477     // }
1478     // }
1479     // }
1480     // if (toAdd.size() == 0) {
1481     // break;
1482     // }
1483     // toMaximized.putAll(toAdd);
1484     // }
1485     // }
1486 
1487     // private static void addIfMatches(LanguageTagParser locale, String maximized, LanguageTagParser tagParsed,
1488     // LanguageTagParser replacementParsed, Map<String, String> toAdd, Map<String, String> toMaximized) {
1489     // if (!tagParsed.getLanguage().equals(locale.getLanguage()) && !tagParsed.getLanguage().equals("und")) {
1490     // return;
1491     // }
1492     // if (!tagParsed.getScript().equals(locale.getScript()) && !tagParsed.getScript().equals("")) {
1493     // return;
1494     // }
1495     // if (!tagParsed.getRegion().equals(locale.getRegion()) && !tagParsed.getRegion().equals("")) {
1496     // return;
1497     // }
1498     // if (!replacementParsed.getLanguage().equals("und")) {
1499     // locale.setLanguage(replacementParsed.getLanguage());
1500     // }
1501     // if (!replacementParsed.getScript().equals("")) {
1502     // locale.setScript(replacementParsed.getScript());
1503     // }
1504     // if (!replacementParsed.getRegion().equals("")) {
1505     // locale.setRegion(replacementParsed.getRegion());
1506     // }
1507     // addIfNotIn(locale.toString(), maximized, toAdd, toMaximized,"Deprecated");
1508     // }
1509 
1510     // private static int getSubtagPosition(String locale, String subtags) {
1511     // int pos = -1;
1512     // while (true) {
1513     // pos = locale.indexOf(subtags, pos + 1);
1514     // if (pos < 0) return -1;
1515     // // make sure boundaries are ok
1516     // if (pos != 0) {
1517     // char charBefore = locale.charAt(pos-1);
1518     // if (charBefore != '_' && charBefore != '_') return -1;
1519     // }
1520     // int limit = pos + subtags.length();
1521     // if (limit != locale.length()) {
1522     // char charAfter = locale.charAt(limit);
1523     // if (charAfter != '_' && charAfter != '_') return -1;
1524     // }
1525     // return pos;
1526     // }
1527     // }
1528 
1529     /*
1530      * Format
     * const MapToMaximalSubtags default_subtags[] = {
1532      * {
1533      * // Afar => Afar (Latin, Ethiopia)
1534      * "aa",
1535      * "aa_Latn_ET"
1536      * },{
1537      * // Afrikaans => Afrikaans (Latin, South Africa)
1538      * "af",
1539      * "af_Latn_ZA"
1540      * },{
1541      */
1542 
printLikelySubtags(Map<String, String> fluffup)1543     private static void printLikelySubtags(Map<String, String> fluffup) throws IOException {
1544 
1545         PrintWriter out = FileUtilities.openUTF8Writer(CLDRPaths.GEN_DIRECTORY,
1546             "/supplemental/likelySubtags" + (OUTPUT_STYLE == OutputStyle.XML ? ".xml" : ".txt"));
1547         String spacing = OUTPUT_STYLE == OutputStyle.PLAINTEXT ? "\t" : " ";
1548         String header = OUTPUT_STYLE != OutputStyle.XML ? "const MapToMaximalSubtags default_subtags[] = {"
1549             : "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + CldrUtility.LINE_SEPARATOR
1550                 + "<!DOCTYPE supplementalData SYSTEM \"../../common/dtd/ldmlSupplemental.dtd\">"
1551                 + CldrUtility.LINE_SEPARATOR
1552                 + "<!--"
1553                 + CldrUtility.LINE_SEPARATOR
1554                 + CldrUtility.getCopyrightString()
1555                 + CldrUtility.LINE_SEPARATOR
1556                 + "-->"
1557                 + CldrUtility.LINE_SEPARATOR
1558                 + "<!--"
1559                 + CldrUtility.LINE_SEPARATOR
1560                 + "Likely subtags data is generated programatically from CLDR's language/territory/population" + CldrUtility.LINE_SEPARATOR
1561                 + "data using the GenerateMaximalLocales tool. Under normal circumstances, this file should" + CldrUtility.LINE_SEPARATOR
1562                 + "not be patched by hand, as any changes made in that fashion may be lost."
1563                 + CldrUtility.LINE_SEPARATOR
1564                 + "-->"
1565                 + CldrUtility.LINE_SEPARATOR
1566                 + "<supplementalData>" + CldrUtility.LINE_SEPARATOR
1567                 + "    <version number=\"$" +
1568                 "Revision$\"/>" + CldrUtility.LINE_SEPARATOR
1569                 + "    <likelySubtags>";
1570         String footer = OUTPUT_STYLE != OutputStyle.XML ? SEPARATOR + "};"
1571             : "    </likelySubtags>" + CldrUtility.LINE_SEPARATOR
1572                 + "</supplementalData>";
1573         out.println(header);
1574         boolean first = true;
1575         Set<String> keys = new TreeSet<String>(new LocaleStringComparator());
1576         keys.addAll(fluffup.keySet());
1577         for (String printingLocale : keys) {
1578             String printingTarget = fluffup.get(printingLocale);
1579             String comment = printingName(printingLocale, spacing) + spacing + "=>" + spacing
1580                 + printingName(printingTarget, spacing);
1581 
1582             if (OUTPUT_STYLE == OutputStyle.XML) {
1583                 out.println("\t\t<likelySubtag from=\"" + printingLocale +
1584                     "\" to=\"" + printingTarget + "\"" +
1585                     "/>" + CldrUtility.LINE_SEPARATOR + "\t\t" + "<!--" + comment + "-->");
1586             } else {
1587                 if (first) {
1588                     first = false;
1589                 } else {
1590                     out.print(",");
1591                 }
1592                 if (comment.length() > 70 && SEPARATOR.equals(CldrUtility.LINE_SEPARATOR)) {
1593                     comment = printingName(printingLocale, spacing) + SEPARATOR + "    // " + spacing + "=>" + spacing
1594                         + printingName(printingTarget, spacing);
1595                 }
1596                 out.print(
1597                     "  {"
1598                         + SEPARATOR + "    // " + comment
1599                         + SEPARATOR + "    \"" + printingLocale + "\","
1600                         + SEPARATOR + "    \"" + printingTarget + "\""
1601                         + CldrUtility.LINE_SEPARATOR + "  }");
1602             }
1603         }
1604         out.println(footer);
1605         out.close();
1606     }
1607 
printingName(String locale, String spacing)1608     public static String printingName(String locale, String spacing) {
1609         if (locale == null) {
1610             return null;
1611         }
1612         LanguageTagParser parser = new LanguageTagParser().set(locale);
1613         String lang = parser.getLanguage();
1614         String script = parser.getScript();
1615         String region = parser.getRegion();
1616         return "{" + spacing +
1617             (lang.equals("und") ? "?" : english.getName(CLDRFile.LANGUAGE_NAME, lang)) + ";" + spacing +
1618             (script == null || script.equals("") ? "?" : english.getName(CLDRFile.SCRIPT_NAME, script)) + ";" + spacing
1619             +
1620             (region == null || region.equals("") ? "?" : english.getName(CLDRFile.TERRITORY_NAME, region)) + spacing
1621             + "}";
1622     }
1623 
1624     private static final String[][] ALT_REVERSAL = {
1625         { "nb", "no" },
1626         { "no", "nb" },
1627         { "he", "iw" },
1628         { "iw", "he" },
1629     };
1630 
toAlt(String locale, boolean change)1631     public static String toAlt(String locale, boolean change) {
1632         if (!change || locale == null) {
1633             return locale;
1634         }
1635         String firstTag = getFirstTag(locale);
1636         for (String[] pair : ALT_REVERSAL) {
1637             if (firstTag.equals(pair[0])) {
1638                 locale = pair[1] + locale.substring(pair[1].length());
1639                 break;
1640             }
1641         }
1642         locale = locale.replace("_", "-");
1643         return locale;
1644     }
1645 
getFirstTag(String locale)1646     private static String getFirstTag(String locale) {
1647         int pos = locale.indexOf('_');
1648         return pos < 0 ? locale : locale.substring(0, pos);
1649     }
1650 
1651     // private static Map<String, String> getBackMapping(Map<String, String> fluffup) {
1652     // Relation<String,String> backMap = new Relation(new TreeMap(), TreeSet.class, BEST_LANGUAGE_COMPARATOR);
1653     // for (String source : fluffup.keySet()) {
1654     // if (source.startsWith("und")) {
1655     // continue;
1656     // }
1657     // String maximized = fluffup.get(source);
1658     // backMap.put(maximized, source); // put in right order
1659     // }
1660     // Map<String,String> returnBackMap = new TreeMap();
1661     // for (String maximized : backMap.keySet()) {
1662     // final Set<String> all = backMap.getAll(maximized);
1663     // final String minimized = all.iterator().next();
1664     // returnBackMap.put(maximized, minimized);
1665     // }
1666     // return returnBackMap;
1667     // }
1668 
1669     /**
1670      * Language tags are presumed to share the first language, except possibly "und". Best is least
1671      */
1672     // private static Comparator BEST_LANGUAGE_COMPARATOR = new Comparator<String>() {
1673     // LanguageTagParser p1 = new LanguageTagParser();
1674     // LanguageTagParser p2 = new LanguageTagParser();
1675     // public int compare(String o1, String o2) {
1676     // if (o1.equals(o2)) return 0;
1677     // p1.set(o1);
1678     // p2.set(o2);
1679     // String lang1 = p1.getLanguage();
1680     // String lang2 = p2.getLanguage();
1681     //
1682     // // compare languages first
1683     // // put und at the end
1684     // int result = lang1.compareTo(lang2);
1685     // if (result != 0) {
1686     // if (lang1.equals("und")) return 1;
1687     // if (lang2.equals("und")) return -1;
1688     // return result;
1689     // }
1690     //
1691     // // now scripts and regions.
1692     // // if they have different numbers of fields, the shorter wins.
1693     // // If there are two fields, region is lowest.
1694     // // The simplest way is to just compare scripts first
1695     // // so zh-TW < zh-Hant, because we first compare "" to Hant
1696     // String script1 = p1.getScript();
1697     // String script2 = p2.getScript();
1698     // int scriptOrder = script1.compareTo(script2);
1699     // if (scriptOrder != 0) return scriptOrder;
1700     //
1701     // String region1 = p1.getRegion();
1702     // String region2 = p2.getRegion();
1703     // int regionOrder = region1.compareTo(region2);
1704     // if (regionOrder != 0) return regionOrder;
1705     //
1706     // return o1.compareTo(o2);
1707     // }
1708     //
1709     // };
1710 
minimize(Map<String, String> fluffup)1711     public static void minimize(Map<String, String> fluffup) {
1712         LanguageTagParser parser = new LanguageTagParser();
1713         LanguageTagParser targetParser = new LanguageTagParser();
1714         Set<String> removals = new TreeSet<String>();
1715         while (true) {
1716             removals.clear();
1717             for (String locale : fluffup.keySet()) {
1718                 String target = fluffup.get(locale);
1719                 if (targetParser.set(target).getRegion().equals(UNKNOWN_REGION)) {
1720                     removals.add(locale);
1721                     if (SHOW_ADD)
1722                         System.out.println("Removing:\t" + getName(locale) + "\t=>\t" + getName(target)
1723                             + "\t\t - Unknown Region in target");
1724                     continue;
1725                 }
1726                 if (targetParser.getScript().equals(UNKNOWN_SCRIPT)) {
1727                     removals.add(locale);
1728                     if (SHOW_ADD)
1729                         System.out.println("Removing:\t" + getName(locale) + "\t=>\t" + getName(target)
1730                             + "\t\t - Unknown Script in target");
1731                     continue;
1732                 }
1733 
1734                 String region = parser.set(locale).getRegion();
1735                 if (region.length() != 0) {
1736                     if (region.equals(UNKNOWN_REGION)) {
1737                         removals.add(locale);
1738                         if (SHOW_ADD)
1739                             System.out.println("Removing:\t" + getName(locale) + "\t=>\t" + getName(target)
1740                                 + "\t\t - Unknown Region in source");
1741                         continue;
1742                     }
1743                     parser.setRegion("");
1744                     String newLocale = parser.toString();
1745                     String newTarget = fluffup.get(newLocale);
1746                     if (newTarget != null) {
1747                         newTarget = targetParser.set(newTarget).setRegion(region).toString();
1748                         if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) {
1749                             removals.add(locale);
1750                             if (SHOW_ADD)
1751                                 System.out.println("Removing:\t" + locale + "\t=>\t" + target + "\t\tRedundant with "
1752                                     + newLocale);
1753                             continue;
1754                         }
1755                     }
1756                 }
1757                 String script = parser.set(locale).getScript();
1758                 if (locale.equals(DEBUG_ADD_KEY)) {
1759                     System.out.println("*debug*");
1760                 }
1761                 if (script.length() != 0) {
1762                     if (script.equals(UNKNOWN_SCRIPT)) {
1763                         removals.add(locale);
1764                         if (SHOW_ADD)
1765                             System.out.println("Removing:\t" + locale + "\t=>\t" + target + "\t\t - Unknown Script");
1766                         continue;
1767                     }
1768                     parser.setScript("");
1769                     String newLocale = parser.toString();
1770                     String newTarget = fluffup.get(newLocale);
1771                     if (newTarget != null) {
1772                         newTarget = targetParser.set(newTarget).setScript(script).toString();
1773                         if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) {
1774                             removals.add(locale);
1775                             if (SHOW_ADD)
1776                                 System.out.println("Removing:\t" + locale + "\t=>\t" + target + "\t\tRedundant with "
1777                                     + newLocale);
1778                             continue;
1779                         }
1780                     }
1781                 }
1782             }
1783             if (removals.size() == 0) {
1784                 break;
1785             }
1786             for (String locale : removals) {
1787                 fluffup.remove(locale);
1788             }
1789         }
1790     }
1791 
1792     // private static void addLanguageScript(Map<String, String> fluffup, LanguageTagParser parser) {
1793     // // add script
1794     // Map<String, String> temp = new TreeMap<String, String>();
1795     // while (true) {
1796     // temp.clear();
1797     // for (String target : new TreeSet<String>(fluffup.values())) {
1798     // parser.set(target);
1799     // final String territory = parser.getRegion();
1800     // if (territory.length() == 0) {
1801     // continue;
1802     // }
1803     // parser.setRegion("");
1804     // String possibleSource = parser.toString();
1805     // if (fluffup.containsKey(possibleSource)) {
1806     // continue;
1807     // }
1808     // String other = temp.get(possibleSource);
1809     // if (other != null) {
1810     // if (!target.equals(other)) {
1811     // System.out.println("**Failure with multiple sources in addLanguageScript: "
1812     // + possibleSource + "\t=>\t" + target + ", " + other);
1813     // }
1814     // continue;
1815     // }
1816     // temp.put(possibleSource, target);
1817     // if (SHOW_ADD) System.out.println("Adding:\t" + possibleSource + "\t=>\t" + target + "\t\tLanguage-Script");
1818     // }
1819     // if (temp.size() == 0) {
1820     // break;
1821     // }
1822     // fluffup.putAll(temp);
1823     // }
1824     //
1825     // }
1826 
1827     // private static void addLanguageCountry(Map<String, String> fluffup, LanguageTagParser parser) {
1828     // // add script
1829     // Map<String, String> temp = new TreeMap<String, String>();
1830     // while (true) {
1831     // temp.clear();
1832     // for (String target : new TreeSet<String>(fluffup.values())) {
1833     // parser.set(target);
1834     // String script = parser.getScript();
1835     // if (script.length() == 0) {
1836     // continue;
1837     // }
1838     // parser.setScript("");
1839     // String possibleSource = parser.toString();
1840     // if (fluffup.containsKey(possibleSource)) {
1841     // continue;
1842     // }
1843     // String other = temp.get(possibleSource);
1844     //
1845     // if (other != null) {
1846     // if (!target.equals(other)) {
1847     // script = getScriptForLocale(possibleSource);
1848     // if (script == null) {
1849     // System.out.println("**Failure with multiple sources in addLanguageCountry: "
1850     // + possibleSource + "\t=>\t" + target + ", " + other);
1851     // continue; // error message in routine
1852     // }
1853     // parser.setScript(script);
1854     // target = parser.toString();
1855     // }
1856     // }
1857     //
1858     // temp.put(possibleSource, target);
1859     // if (SHOW_ADD) System.out.println("Adding:\t" + possibleSource + "\t=>\t" + target + "\t\tLanguageCountry");
1860     // }
1861     // if (temp.size() == 0) {
1862     // break;
1863     // }
1864     // fluffup.putAll(temp);
1865     // }
1866     //
1867     // }
1868 
1869     // private static void addScript(Map<String, String> fluffup, LanguageTagParser parser) {
1870     // // add script
1871     // Map<String, String> temp = new TreeMap<String, String>();
1872     // while (true) {
1873     // temp.clear();
1874     // Set skipTarget = fluffup.keySet();
1875     // for (String locale : fluffup.keySet()) {
1876     // String target = fluffup.get(locale);
1877     // parser.set(target);
1878     // if (parser.getScript().length() != 0) {
1879     // continue;
1880     // }
1881     // String script = getScriptForLocale(target);
1882     //
1883     // if (script == null) {
1884     // continue; // error message in routine
1885     // }
1886     // parser.setScript(script);
1887     // String furtherTarget = parser.toString();
1888     // addIfNotIn(target, furtherTarget, temp, fluffup, "Script");
1889     // }
1890     // if (temp.size() == 0) {
1891     // break;
1892     // }
1893     // fluffup.putAll(temp);
1894     // }
1895     // }
1896 
1897     // private static String getScriptForLocale(String locale) {
1898     // String result = getScriptForLocale2(locale);
1899     // if (result != null) return result;
1900     // int pos = locale.indexOf('_');
1901     // if (pos >= 0) {
1902     // result = getScriptForLocale2(locale.substring(0,pos));
1903     // }
1904     // return result;
1905     // }
1906 
1907     private static String UNKNOWN_SCRIPT = "Zzzz";
1908     private static String UNKNOWN_REGION = "ZZ";
1909 
getScriptForLocale2(String locale)1910     private static String getScriptForLocale2(String locale) {
1911         String result = localeToScriptCache.get(locale);
1912         if (result != null) {
1913             return result;
1914         }
1915         if (locale.equals("ky")) {
1916             int debug = 0;
1917         }
1918         try {
1919             Map<Type, BasicLanguageData> data = supplementalData.getBasicLanguageDataMap(locale);
1920             if (data != null) {
1921                 for (BasicLanguageData datum : data.values()) {
1922                     final Set<String> scripts = datum.getScripts();
1923                     boolean isPrimary = datum.getType() == BasicLanguageData.Type.primary;
1924                     if (scripts.size() != 1) {
1925                         if (scripts.size() > 1 && isPrimary) {
1926                             break;
1927                         }
1928                         continue;
1929                     }
1930                     String script = scripts.iterator().next();
1931                     if (isPrimary) {
1932                         return result = script;
1933                     } else if (result == null) {
1934                         result = script;
1935                     }
1936                 }
1937                 if (result != null) {
1938                     return result;
1939                 }
1940             }
1941             CLDRFile cldrFile;
1942             try {
1943                 cldrFile = factory.make(locale, true);
1944             } catch (RuntimeException e) {
1945                 result = FALLBACK_SCRIPTS.get(locale);
1946                 if (result == null) {
1947                     System.out.println("***Failed to find script for: " + locale + "\t" + english.getName(locale));
1948                     return result = UNKNOWN_SCRIPT;
1949                 } else {
1950                     return result;
1951                 }
1952             }
1953             UnicodeSet exemplars = getExemplarSet(cldrFile, "");
1954             Set<String> CLDRScripts = getScriptsFromUnicodeSet(exemplars);
1955             CLDRScripts.remove(UNKNOWN_SCRIPT);
1956             if (CLDRScripts.size() == 1) {
1957                 return result = CLDRScripts.iterator().next();
1958             } else if (CLDRScripts.size() == 0) {
1959                 System.out.println("**Failed to get script for:\t" + locale);
1960                 return result = UNKNOWN_SCRIPT;
1961             } else {
1962                 System.out.println("**Failed, too many scripts for:\t" + locale + ", " + CLDRScripts);
1963                 return result = UNKNOWN_SCRIPT;
1964             }
1965         } finally {
1966             if (result.equals(UNKNOWN_SCRIPT)) {
1967                 String temp = LANGUAGE_OVERRIDES.get(locale);
1968                 if (temp != null) {
1969                     result = new LanguageTagParser().set(temp).getScript();
1970                     System.out.println("Getting script from LANGUAGE_OVERRIDES for " + locale + " => " + result);
1971                 }
1972             }
1973             localeToScriptCache.put(locale, result);
1974             if (SHOW_ADD)
1975                 System.out.println("Script:\t" + locale + "\t" + english.getName(locale) + "\t=>\t" + result + "\t"
1976                     + english.getName(CLDRFile.SCRIPT_NAME, result));
1977         }
1978     }
1979 
1980     // private static Map<String, String> closeMapping(Map<String, String> fluffup) {
1981     // if (SHOW_ADD) System.out.flush();
1982     // Map<String,String> temp = new TreeMap<String,String>();
1983     // while (true) {
1984     // temp.clear();
1985     // for (String locale : fluffup.keySet()) {
1986     // String target = fluffup.get(locale);
1987     // if (target.equals("si_Sinh") || target.equals("zh-Hani")) {
1988     // System.out.println("????");
1989     // }
1990     // String furtherTarget = fluffup.get(target);
1991     // if (furtherTarget == null) {
1992     // continue;
1993     // }
1994     // addIfNotIn(locale, furtherTarget, temp, null, "Close");
1995     // }
1996     // if (temp.size() == 0) {
1997     // break;
1998     // }
1999     // fluffup.putAll(temp);
2000     // }
2001     // if (SHOW_ADD) System.out.flush();
2002     // return temp;
2003     // }
2004 
getScriptsFromUnicodeSet(UnicodeSet exemplars)2005     public static Set<String> getScriptsFromUnicodeSet(UnicodeSet exemplars) {
2006         // use bits first, since that's faster
2007         BitSet scriptBits = new BitSet();
2008         boolean show = false;
2009         for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next();) {
2010             if (show)
2011                 System.out.println(Integer.toHexString(it.codepoint));
2012             if (it.codepoint != UnicodeSetIterator.IS_STRING) {
2013                 scriptBits.set(UScript.getScript(it.codepoint));
2014             } else {
2015                 int cp;
2016                 for (int i = 0; i < it.string.length(); i += UTF16.getCharCount(cp)) {
2017                     scriptBits.set(UScript.getScript(cp = UTF16.charAt(it.string, i)));
2018                 }
2019             }
2020         }
2021         scriptBits.clear(UScript.COMMON);
2022         scriptBits.clear(UScript.INHERITED);
2023         Set<String> scripts = new TreeSet<String>();
2024         for (int j = 0; j < scriptBits.size(); ++j) {
2025             if (scriptBits.get(j)) {
2026                 scripts.add(UScript.getShortName(j));
2027             }
2028         }
2029         return scripts;
2030     }
2031 
getExemplarSet(CLDRFile cldrfile, String type)2032     public static UnicodeSet getExemplarSet(CLDRFile cldrfile, String type) {
2033         if (type.length() != 0)
2034             type = "[@type=\"" + type + "\"]";
2035         String v = cldrfile.getStringValue("//ldml/characters/exemplarCharacters"
2036             + type);
2037         if (v == null)
2038             return new UnicodeSet();
2039         return new UnicodeSet(v);
2040     }
2041 
2042     // private static String[][] SpecialCases = {
2043     // { "zh_Hani", "zh_Hans_CN"},
2044     // { "si_Sinh", "si_Sinh_LK"},
2045     // { "ii", "ii_CN"}, // Sichuan Yi (Yi)
2046     // { "iu", "iu_CA"}, // Inuktitut (Unified Canadian Aboriginal Syllabics)
2047     // { "und", "en"}, // English default
2048     // };
2049 
showDefaultContentDifferencesAndFix(Set<String> defaultLocaleContent)2050     static void showDefaultContentDifferencesAndFix(Set<String> defaultLocaleContent) {
2051         Set<String> errors = new LinkedHashSet<String>();
2052         Map<String, String> oldDefaultContent = SupplementalDataInfo.makeLocaleToDefaultContents(
2053             ConvertLanguageData.supplementalData.getDefaultContentLocales(), new TreeMap<String, String>(), errors);
2054         if (!errors.isEmpty()) {
2055             System.out.println(CollectionUtilities.join(errors, "\n"));
2056             errors.clear();
2057         }
2058         Map<String, String> newDefaultContent = SupplementalDataInfo.makeLocaleToDefaultContents(defaultLocaleContent,
2059             new TreeMap<String, String>(), errors);
2060         if (!errors.isEmpty()) {
2061             System.out.println("Default Content errors: " + CollectionUtilities.join(errors, "\n"));
2062             errors.clear();
2063         }
2064         Set<String> changes = compareMapsAndFixNew("*WARNING* Default Content: ", oldDefaultContent, newDefaultContent,
2065             "ar", "ar_001");
2066         System.out.println(CollectionUtilities.join(changes, "\n"));
2067         defaultLocaleContent.clear();
2068         defaultLocaleContent.addAll(newDefaultContent.values());
2069         newDefaultContent = SupplementalDataInfo.makeLocaleToDefaultContents(defaultLocaleContent,
2070             new TreeMap<String, String>(), errors);
2071         if (!errors.isEmpty()) {
2072             System.out.println("***New Errors: " + CollectionUtilities.join(errors, "\n"));
2073         }
2074     }
2075 
compareMapsAndFixNew(String title, Map<String, String> oldContent, Map<String, String> newContent, String... allowedOverrideValues)2076     private static Set<String> compareMapsAndFixNew(String title,
2077         Map<String, String> oldContent,
2078         Map<String, String> newContent, String... allowedOverrideValues) {
2079         Map<String, String> allowedOverrideValuesTest = new HashMap<String, String>();
2080         for (int i = 0; i < allowedOverrideValues.length; i += 2) {
2081             allowedOverrideValuesTest.put(allowedOverrideValues[i], allowedOverrideValues[i + 1]);
2082         }
2083         Set<String> changes = new TreeSet<String>();
2084         for (String parent : Builder.with(new TreeSet<String>()).addAll(newContent.keySet())
2085             .addAll(oldContent.keySet()).get()) {
2086             String oldValue = oldContent.get(parent);
2087             String newValue = newContent.get(parent);
2088             String overrideValue = allowedOverrideValuesTest.get(parent);
2089             if (overrideValue != null) {
2090                 newContent.put(parent, overrideValue);
2091                 newValue = overrideValue;
2092             }
2093             if (CldrUtility.equals(oldValue, newValue)) {
2094                 continue;
2095             }
2096             String message;
2097             if (oldValue == null) {
2098                 message = "Adding " + ConvertLanguageData.getLanguageCodeAndName(parent) + " => "
2099                     + ConvertLanguageData.getLanguageCodeAndName(newValue);
2100                 newContent.put(parent, newValue);
2101             } else if (newValue == null) {
2102                 if (SUPPRESS_CHANGES) {
2103                     message = "Suppressing removal of "
2104                         + ConvertLanguageData.getLanguageCodeAndName(parent) + " => "
2105                         + ConvertLanguageData.getLanguageCodeAndName(oldValue);
2106                     newContent.put(parent, oldValue);
2107                 } else {
2108                     message = "Removing "
2109                         + ConvertLanguageData.getLanguageCodeAndName(parent) + " => "
2110                         + ConvertLanguageData.getLanguageCodeAndName(oldValue);
2111                     newContent.remove(oldValue);
2112                 }
2113             } else {
2114                 if (SUPPRESS_CHANGES) {
2115                     message = "Suppressing change of "
2116                         + ConvertLanguageData.getLanguageCodeAndName(parent) + " => "
2117                         + ConvertLanguageData.getLanguageCodeAndName(oldValue) + " to "
2118                         + ConvertLanguageData.getLanguageCodeAndName(newValue);
2119                     newContent.remove(newValue);
2120                     newContent.put(parent, oldValue);
2121                 } else {
2122                     message = "Changing "
2123                         + ConvertLanguageData.getLanguageCodeAndName(parent) + " => "
2124                         + ConvertLanguageData.getLanguageCodeAndName(oldValue) + " to "
2125                         + ConvertLanguageData.getLanguageCodeAndName(newValue);
2126                     newContent.remove(oldValue);
2127                     newContent.put(parent, newValue);
2128                 }
2129             }
2130             changes.add(title + message);
2131         }
2132         return changes;
2133     }
2134 
2135     public static class LocaleStringComparator implements Comparator<String> {
2136         LanguageTagParser ltp0 = new LanguageTagParser();
2137         LanguageTagParser ltp1 = new LanguageTagParser();
2138 
compare(String arg0, String arg1)2139         public int compare(String arg0, String arg1) {
2140             ltp0.set(arg0);
2141             ltp1.set(arg1);
2142             String s0 = ltp0.getLanguage();
2143             String s1 = ltp1.getLanguage();
2144             int result = s0.compareTo(s1);
2145             if (result != 0) {
2146                 return s0.equals("und") ? 1
2147                     : s1.equals("und") ? -1
2148                         : result;
2149             }
2150             s0 = ltp0.getScript();
2151             s1 = ltp1.getScript();
2152             result = s0.compareTo(s1);
2153             if (result != 0) {
2154                 return result;
2155             }
2156             s0 = ltp0.getRegion();
2157             s1 = ltp1.getRegion();
2158             result = s0.compareTo(s1);
2159             if (result != 0) {
2160                 return result;
2161             }
2162             return arg0.compareTo(arg1); // just in case
2163         }
2164 
2165     }
2166 }
2167