• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.tool;
2 
3 import java.io.BufferedReader;
4 import java.io.File;
5 import java.io.IOException;
6 import java.io.PrintWriter;
7 import java.nio.file.Files;
8 import java.text.ParseException;
9 import java.util.ArrayList;
10 import java.util.Arrays;
11 import java.util.Collection;
12 import java.util.Collections;
13 import java.util.Comparator;
14 import java.util.EnumMap;
15 import java.util.HashMap;
16 import java.util.HashSet;
17 import java.util.Iterator;
18 import java.util.LinkedHashSet;
19 import java.util.List;
20 import java.util.Map;
21 import java.util.Set;
22 import java.util.TreeMap;
23 import java.util.TreeSet;
24 import java.util.regex.Matcher;
25 
26 import org.unicode.cldr.draft.FileUtilities;
27 import org.unicode.cldr.draft.ScriptMetadata;
28 import org.unicode.cldr.draft.ScriptMetadata.IdUsage;
29 import org.unicode.cldr.draft.ScriptMetadata.Info;
30 import org.unicode.cldr.util.Builder;
31 import org.unicode.cldr.util.CLDRFile;
32 import org.unicode.cldr.util.CLDRPaths;
33 import org.unicode.cldr.util.CldrUtility;
34 import org.unicode.cldr.util.Factory;
35 import org.unicode.cldr.util.Iso639Data;
36 import org.unicode.cldr.util.Iso639Data.Scope;
37 import org.unicode.cldr.util.Iso639Data.Source;
38 import org.unicode.cldr.util.Iso639Data.Type;
39 import org.unicode.cldr.util.LanguageTagCanonicalizer;
40 import org.unicode.cldr.util.LanguageTagParser;
41 import org.unicode.cldr.util.LocaleIDParser;
42 import org.unicode.cldr.util.LocaleIDParser.Level;
43 import org.unicode.cldr.util.Pair;
44 import org.unicode.cldr.util.PatternCache;
45 import org.unicode.cldr.util.SpreadSheet;
46 import org.unicode.cldr.util.StandardCodes;
47 import org.unicode.cldr.util.StandardCodes.LstrType;
48 import org.unicode.cldr.util.SupplementalDataInfo;
49 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData;
50 import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus;
51 import org.unicode.cldr.util.SupplementalDataInfo.PopulationData;
52 import org.unicode.cldr.util.TransliteratorUtilities;
53 import org.unicode.cldr.util.Validity;
54 import org.unicode.cldr.util.Validity.Status;
55 import org.unicode.cldr.util.XPathParts;
56 import org.unicode.cldr.util.XPathParts.Comments;
57 
58 import com.google.common.base.Joiner;
59 import com.google.common.collect.ImmutableSet;
60 import com.google.common.math.DoubleMath;
61 import com.ibm.icu.impl.Relation;
62 import com.ibm.icu.impl.Row;
63 import com.ibm.icu.impl.Row.R2;
64 import com.ibm.icu.text.Collator;
65 import com.ibm.icu.text.NumberFormat;
66 import com.ibm.icu.text.RuleBasedCollator;
67 import com.ibm.icu.text.UTF16;
68 import com.ibm.icu.util.ULocale;
69 
70 /**
71  * @author markdavis
72  *
73  */
74 public class ConvertLanguageData {
75 
76     private static final boolean DEBUG = false;
77     // change this if you need to override what is generated for the default contents.
78     private static final List<String> defaultOverrides = Arrays.asList("es_ES".split("\\s+"));
79 
80     public static final boolean SHOW_DIFF = false;
81 
82     private static final boolean ALLOW_SMALL_NUMBERS = true;
83 
84     static final Comparator<String> GENERAL_COLLATOR = new GeneralCollator();
85     static final Comparator<String> INVERSE_GENERAL = new InverseComparator<>(GENERAL_COLLATOR);
86 
87     private static StandardCodes sc = StandardCodes.make();
88 
89     static final double populationFactor = 1;
90     static final double gdpFactor = 1;
91     static final int BAD_COUNTRY_NAME = 0, COUNTRY_CODE = 1, COUNTRY_POPULATION = 2, COUNTRY_LITERACY = 3,
92         COUNTRY_GDP = 4, OFFICIAL_STATUS = 5, BAD_LANGUAGE_NAME = 6, LANGUAGE_CODE = 7, LANGUAGE_POPULATION = 8,
93         LANGUAGE_LITERACY = 9, COMMENT = 10, NOTES = 11;
94     static final Map<String, CodeAndPopulation> languageToMaxCountry = new TreeMap<>();
95     static final Map<String, CodeAndPopulation> languageToMaxScript = new TreeMap<>();
96 
97     private static final double NON_OFFICIAL_WEIGHT = 0.40;
98 
99     private static final boolean SHOW_OLD_DEFAULT_CONTENTS = false;
100 
101     private static final ImmutableSet<String> scriptAssumedLocales = ImmutableSet.of(
102         "bm_ML", "ha_GH", "ha_NE", "ha_NG", "kk_KZ", "ks_IN", "ky_KG", "mn_MN", "ms_BN", "ms_MY", "ms_SG", "tk_TM", "tzm_MA", "ug_CN");
103 
104     static Set<String> skipLocales = new HashSet<>(
105         Arrays
106             .asList(
107                 "sh sh_BA sh_CS sh_YU characters supplementalData supplementalData-old supplementalData-old2 supplementalData-old3 supplementalMetadata root"
108                     .split("\\s")));
109 
110     static Map<String, String> defaultContent = new TreeMap<>();
111 
112     static Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
113     static CLDRFile english = cldrFactory.make("en", true);
114 
115     static SupplementalDataInfo supplementalData = SupplementalDataInfo
116         .getInstance(CLDRPaths.DEFAULT_SUPPLEMENTAL_DIRECTORY);
117 
main(String[] args)118     public static void main(String[] args) throws IOException, ParseException {
119         final File oldSupp = new File(CLDRPaths.DEFAULT_SUPPLEMENTAL_DIRECTORY, "supplementalData.xml");
120         final File genSupp = new File(CLDRPaths.GEN_DIRECTORY + "/supplemental", "supplementalData.xml");
121         final File genLsraw = new File(CLDRPaths.GEN_DIRECTORY + "/supplemental", "language_script_raw.txt");
122         try (
123             final BufferedReader oldFile = FileUtilities.openUTF8Reader(oldSupp);
124             final PrintWriter newFile = FileUtilities.openUTF8Writer(genSupp);
125             final PrintWriter newLsraw = FileUtilities.openUTF8Writer(genLsraw);
126         ) {
127             // load elements we care about
128             CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*<languageData>\\s*"), newFile, false);
129 
130             Set<String> available = cldrFactory.getAvailable();
131 
132             Set<String> cldrParents = getCldrParents(available);
133 
134             List<String> failures = new ArrayList<>();
135             Map<String, RowData> localeToRowData = new TreeMap<>();
136 
137             Set<RowData> sortedInput = getExcelData(failures, localeToRowData);
138 
139             // get the locales (including parents)
140             Set<String> localesWithData = new TreeSet<>(localeToRowData.keySet());
141             for (String locale : localeToRowData.keySet()) {
142                 while (true) {
143                     String parent = LocaleIDParser.getParent(locale);
144                     if (parent == null) break;
145                     localesWithData.add(parent);
146                     locale = parent;
147                 }
148             }
149 
150             final LanguageTagParser languageTagParser = new LanguageTagParser();
151 
152             for (String localeRaw : available) {
153                 String locale = languageTagCanonicalizer.transform(localeRaw);
154                 if (!localesWithData.contains(locale)) {
155                     CLDRFile locFile = cldrFactory.make(localeRaw, false);
156                     if (locFile.isAliasedAtTopLevel()) {
157                         continue;
158                     }
159                     if (scriptAssumedLocales.contains(locale)) {
160                         continue;
161                     }
162                     languageTagParser.set(locale);
163                     if (languageTagParser.getVariants().size() != 0) {
164                         continue;
165                     }
166                     String withoutScript = languageTagParser.setScript("").toString();
167                     if (!localesWithData.contains(withoutScript)) {
168                         String region = new LanguageTagParser().set(locale).getRegion();
169                         if (StandardCodes.isCountry(region)) {
170                             BadItem.ERROR.show("missing language/population data for CLDR locale", locale + " = " + getLanguageCodeAndName(locale));
171                         }
172                     } else {
173                         // These exceptions are OK, because these locales by default use the non-default script
174                         Set<String> OKExceptions = ImmutableSet.of("sr_Cyrl_ME", "zh_Hans_HK", "zh_Hans_MO");
175                         if (OKExceptions.contains(locale)) {
176                             continue;
177                         }
178                         BadItem.ERROR.show("missing language/population data for CLDR locale", locale + " = " + getLanguageCodeAndName(locale)
179                             + " but have data for " + getLanguageCodeAndName(withoutScript));
180                     }
181                 }
182             }
183 
184             // TODO sort by country code, then functionalPopulation, then language code
185             // and keep the top country for each language code (even if < 1%)
186 
187             addLanguageScriptData();
188 
189             // showAllBasicLanguageData(allLanguageData, "old");
190             getLanguage2Scripts(sortedInput);
191 
192             writeNewBasicData2(newFile, sortedInput);
193             // writeNewBasicData(sortedInput);
194 
195             writeTerritoryLanguageData(newFile, failures, sortedInput);
196 
197             checkBasicData(localeToRowData);
198 
199             Set<String> defaultLocaleContent = new TreeSet<>();
200 
201             showDefaults(cldrParents, nf, defaultContent, localeToRowData, defaultLocaleContent);
202 
203             // showContent(available);
204 
205             // certain items are overridden
206 
207             List<String> toRemove = new ArrayList<>();
208             for (String override : defaultOverrides) {
209                 String replacement = getReplacement(override, defaultLocaleContent);
210                 if (replacement != null) {
211                     toRemove.add(replacement);
212                 }
213             }
214             defaultLocaleContent.removeAll(toRemove);
215             defaultLocaleContent.addAll(defaultOverrides);
216 
217             showFailures(failures);
218 
219             CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*</territoryInfo>\\s*"), null, false);
220             CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*<references>\\s*"), newFile, false);
221             // generateIso639_2Data(newFile);
222             references.printReferences(newFile);
223             CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*</references>\\s*"), null, false);
224             CldrUtility.copyUpTo(oldFile, null, newFile, false);
225 
226             getLanguageScriptSpreadsheet(newLsraw);
227         } catch (Exception e) {
228             e.printStackTrace();
229         } finally {
230             System.out.println("Wrote: " + genLsraw);
231             System.out.println("Wrote: " + genSupp);
232             System.out.println("Copying " + genSupp + " to " + oldSupp);
233             oldSupp.delete();
234             Files.copy(genSupp.toPath(), oldSupp.toPath());
235             System.out.println("DONE");
236         }
237     }
238 
getLanguageCodeAndName(String code)239     public static String getLanguageCodeAndName(String code) {
240         if (code == null) return null;
241         return english.getName(code) + " [" + code + "]";
242     }
243 
getReplacement(String oldDefault, Set<String> defaultLocaleContent)244     private static String getReplacement(String oldDefault, Set<String> defaultLocaleContent) {
245         String parent = LocaleIDParser.getParent(oldDefault);
246         for (String replacement : defaultLocaleContent) {
247             if (replacement.startsWith(parent)) {
248                 if (parent.equals(LocaleIDParser.getParent(replacement))) {
249                     return replacement;
250                 }
251             }
252         }
253         return null;
254     }
255 
getLanguageScriptSpreadsheet(PrintWriter out)256     private static void getLanguageScriptSpreadsheet(PrintWriter out) {
257         out.println("#Lcode\tLanguageName\tStatus\tScode\tScriptName\tReferences");
258         Pair<String, String> languageScript = new Pair<>("", "");
259         for (String language : language_status_scripts.keySet()) {
260             Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language);
261             for (BasicLanguageData.Type status : status_scripts.keySet()) {
262                 for (String script : status_scripts.getAll(status)) {
263                     String reference = language_script_references.get(languageScript.setFirst(language).setSecond(
264                         script));
265                     out.println(language + "\t" + getLanguageName(language) + "\t" + status + "\t" + script + "\t"
266                         + getDisplayScript(script)
267                         + (reference == null ? "" : "\t" + reference));
268                 }
269             }
270         }
271     }
272 
273     /**
274      * Write data in format:
275      * <languageData>
276      * <language type="aa" scripts="Latn" territories="DJ ER ET"/>
277      *
278      * @param sortedInput
279      */
writeNewBasicData2(PrintWriter out, Set<RowData> sortedInput)280     private static void writeNewBasicData2(PrintWriter out, Set<RowData> sortedInput) {
281         double cutoff = 0.2; // 20%
282 
283         // Relation<String, BasicLanguageData> newLanguageData = new Relation(new TreeMap(), TreeSet.class);
284         LanguageTagParser ltp = new LanguageTagParser();
285         Map<String, Relation<BasicLanguageData.Type, String>> language_status_territories = new TreeMap<>();
286         //Map<String, Pair<String, String>> languageToBestCountry;
287         for (RowData rowData : sortedInput) {
288             if (rowData.countryCode.equals("ZZ")) continue;
289             ltp.set(rowData.languageCode);
290             String languageCode = ltp.getLanguage();
291             Relation<BasicLanguageData.Type, String> status_territories = language_status_territories.get(languageCode);
292             if (status_territories == null) {
293                 language_status_territories.put(languageCode, status_territories = Relation.of(
294                     new TreeMap<BasicLanguageData.Type, Set<String>>(),
295                     TreeSet.class));
296             }
297             if (rowData.officialStatus.isMajor()) {
298                 status_territories.put(BasicLanguageData.Type.primary, rowData.countryCode);
299             } else if (rowData.officialStatus.isOfficial()
300                 || rowData.getLanguagePopulation() >= cutoff * rowData.countryPopulation
301                 || rowData.getLanguagePopulation() >= 1000000) {
302                 status_territories.put(BasicLanguageData.Type.secondary, rowData.countryCode);
303             }
304         }
305 
306         Set<String> allLanguages = new TreeSet<>(language_status_territories.keySet());
307         allLanguages.addAll(language_status_scripts.keySet());
308         // now add all the remaining language-script info
309         // <language type="sv" scripts="Latn" territories="AX FI SE"/>
310         Set<String> warnings = new LinkedHashSet<>();
311         out.println("\t<languageData>");
312         for (String languageSubtag : allLanguages) {
313             Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(languageSubtag);
314             Relation<BasicLanguageData.Type, String> status_territories = language_status_territories
315                 .get(languageSubtag);
316 
317             // check against old:
318             Map<BasicLanguageData.Type, BasicLanguageData> oldData = supplementalData
319                 .getBasicLanguageDataMap(languageSubtag);
320             if (oldData == null) {
321                 oldData = Collections.emptyMap();
322             }
323 
324             EnumMap<BasicLanguageData.Type, BasicLanguageData> newData = new EnumMap<>(
325                 BasicLanguageData.Type.class);
326             for (BasicLanguageData.Type status : BasicLanguageData.Type.values()) {
327                 Set<String> scripts = status_scripts == null ? null : status_scripts.getAll(status);
328                 Set<String> territories = status_territories == null ? null : status_territories.getAll(status);
329                 if (scripts == null && territories == null) continue;
330                 BasicLanguageData bld = new BasicLanguageData();
331                 bld.setTerritories(territories);
332                 bld.setScripts(scripts);
333                 bld.setType(status);
334                 bld.freeze();
335                 newData.put(status, bld);
336             }
337 
338             // compare
339             if (!CldrUtility.equals(oldData.entrySet(), newData.entrySet())) {
340                 for (String problem : compare(oldData, newData)) {
341                     warnings.add(BadItem.DETAIL.toString("changing <languageData>", languageSubtag
342                         + "\t" + english.getName(languageSubtag), problem));
343                 }
344             }
345 
346             for (BasicLanguageData bld : newData.values()) {
347                 Set<String> scripts = bld.getScripts();
348                 Set<String> territories = bld.getTerritories();
349                 BasicLanguageData.Type status = bld.getType();
350                 out.println("\t\t<language type=\"" + languageSubtag + "\""
351                     + (scripts.isEmpty() ? "" : " scripts=\"" + CldrUtility.join(scripts, " ") + "\"")
352                     + (territories.isEmpty() ? "" : " territories=\"" + CldrUtility.join(territories, " ") + "\"")
353                     + (status == BasicLanguageData.Type.primary ? "" : " alt=\"secondary\"")
354                     + "/>");
355             }
356         }
357         out.println("\t</languageData>");
358         for (String s : warnings) {
359             if (s.contains("!")) {
360                 System.out.println(s);
361             }
362         }
363         for (String s : warnings) {
364             if (!s.contains("!")) {
365                 System.out.println(s);
366             }
367         }
368     }
369 
compare(Map<BasicLanguageData.Type, BasicLanguageData> oldData, Map<BasicLanguageData.Type, BasicLanguageData> newData)370     private static List<String> compare(Map<BasicLanguageData.Type, BasicLanguageData> oldData,
371         Map<BasicLanguageData.Type, BasicLanguageData> newData) {
372         Map<String, BasicLanguageData.Type> oldDataToType = getDataToType(oldData.values(), true);
373         Map<String, BasicLanguageData.Type> newDataToType = getDataToType(newData.values(), true);
374         List<String> result = new ArrayList<>();
375         StringBuilder temp = new StringBuilder();
376         for (String s : Builder.with(new LinkedHashSet<String>()).addAll(oldDataToType.keySet())
377             .addAll(newDataToType.keySet()).get()) {
378             BasicLanguageData.Type oldValue = oldDataToType.get(s);
379             BasicLanguageData.Type newValue = newDataToType.get(s);
380             if (!CldrUtility.equals(oldValue, newValue)) {
381                 temp.setLength(0);
382                 temp.append("[").append(s).append(":")
383                     .append(english.getName(s.length() == 4 ? "script" : "region", s)).append("] ");
384                 if (oldValue == null) {
385                     temp.append(" added as ").append(newValue);
386                 } else if (newValue == null) {
387                     temp.append(" REMOVED!");
388                 } else if (oldValue == BasicLanguageData.Type.primary) {
389                     temp.append(" DOWNGRADED TO! ").append(newValue);
390                 } else {
391                     temp.append(" upgraded to ").append(newValue);
392                 }
393                 result.add(temp.toString());
394             }
395         }
396         result.add(newData.toString());
397         return result;
398     }
399 
getDataToType( Collection<BasicLanguageData> collection, boolean script)400     private static Map<String, BasicLanguageData.Type> getDataToType(
401         Collection<BasicLanguageData> collection, boolean script) {
402         Map<String, BasicLanguageData.Type> result = new TreeMap<>();
403         for (BasicLanguageData i : collection) {
404             for (String s : i.getScripts()) {
405                 result.put(s, i.getType());
406             }
407             for (String s : i.getTerritories()) {
408                 result.put(s, i.getType());
409             }
410         }
411         return result;
412     }
413 
checkBasicData(Map<String, RowData> localeToRowData)414     private static void checkBasicData(Map<String, RowData> localeToRowData) {
415         // find languages with multiple scripts
416         Relation<String, String> languageToScripts = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
417         for (String languageSubtag : language2BasicLanguageData.keySet()) {
418             for (BasicLanguageData item : language2BasicLanguageData.getAll(languageSubtag)) {
419                 languageToScripts.putAll(StandardCodes.fixLanguageTag(languageSubtag), item.getScripts());
420             }
421         }
422         // get primary combinations
423         Set<String> primaryCombos = new TreeSet<>();
424         Set<String> basicCombos = new TreeSet<>();
425         for (String languageSubtag : language2BasicLanguageData.keySet()) {
426             for (BasicLanguageData item : language2BasicLanguageData.getAll(languageSubtag)) {
427                 Set<String> scripts = new TreeSet<>();
428                 scripts.addAll(item.getScripts());
429                 languageToScripts.putAll(StandardCodes.fixLanguageTag(languageSubtag), scripts);
430                 if (scripts.size() == 0) {
431                     scripts.add("Zzzz");
432                 }
433                 Set<String> territories = new TreeSet<>();
434                 territories.addAll(item.getTerritories());
435                 if (territories.size() == 0) {
436                     territories.add("ZZ");
437                     continue;
438                 }
439 
440                 for (String script : scripts) {
441                     for (String territory : territories) {
442                         String locale = StandardCodes.fixLanguageTag(languageSubtag)
443                             // + (script.equals("Zzzz") ? "" : languageToScripts.getAll(languageSubtag).size() <= 1 ? ""
444                             // : "_" + script)
445                             + (territories.equals("ZZ") ? "" : "_" + territory);
446                         if (item.getType() != BasicLanguageData.Type.secondary) {
447                             primaryCombos.add(locale);
448                         }
449                         basicCombos.add(locale);
450                     }
451                 }
452             }
453         }
454         Set<String> populationOver20 = new TreeSet<>();
455         Set<String> population = new TreeSet<>();
456         LanguageTagParser ltp = new LanguageTagParser();
457         for (String rawLocale : localeToRowData.keySet()) {
458             ltp.set(rawLocale);
459             String locale = ltp.getLanguage() + (ltp.getRegion().length() == 0 ? "" : "_" + ltp.getRegion());
460             population.add(locale);
461             RowData rowData = localeToRowData.get(rawLocale);
462             if (rowData.getLanguagePopulation() / rowData.countryPopulation >= 0.2
463             //|| rowData.getLanguagePopulation() > 900000
464             ) {
465                 populationOver20.add(locale);
466             } else {
467                 PopulationData popData = supplementalData.getLanguageAndTerritoryPopulationData(
468                     ltp.getLanguageScript(), ltp.getRegion());
469                 if (popData != null && popData.getOfficialStatus().isOfficial()) {
470                     populationOver20.add(locale);
471                 }
472             }
473         }
474         Set<String> inBasicButNotPopulation = new TreeSet<>(primaryCombos);
475 
476         inBasicButNotPopulation.removeAll(population);
477         for (String locale : inBasicButNotPopulation) {
478             ltp.set(locale);
479             String region = ltp.getRegion();
480             String language = ltp.getLanguage();
481             if (!sc.isModernLanguage(language)) continue;
482             PopulationData popData = supplementalData.getPopulationDataForTerritory(region);
483             // Afghanistan AF "29,928,987" 28.10% "21,500,000,000" Hazaragi haz "1,770,000" 28.10%
484             BadItem.WARNING.show("In Basic Data but not Population > 20%",
485                 getDisplayCountry(region)
486                     + "\t" + region
487                     + "\t\"" + formatNumber(popData.getPopulation(), 0, false) + "\""
488                     + "\t\"" + formatPercent(popData.getLiteratePopulation() / popData.getPopulation(), 0, false)
489                     + "\""
490                     + "\t\"" + formatPercent(popData.getGdp(), 0, false) + "\""
491                     + "\t" + ""
492                     + "\t" + getLanguageName(language)
493                     + "\t" + language
494                     + "\t" + -1
495                     + "\t\"" + formatPercent(popData.getLiteratePopulation() / popData.getPopulation(), 0, false)
496                     + "\"");
497         }
498 
499         Set<String> inPopulationButNotBasic = new TreeSet<>(populationOver20);
500         inPopulationButNotBasic.removeAll(basicCombos);
501         for (Iterator<String> it = inPopulationButNotBasic.iterator(); it.hasNext();) {
502             String locale = it.next();
503             if (locale.endsWith("_ZZ")) {
504                 it.remove();
505             }
506         }
507         for (String locale : inPopulationButNotBasic) {
508             BadItem.WARNING.show("In Population>20% but not Basic Data", locale + " " + getLanguageName(locale), localeToRowData.get(locale).toString());
509         }
510     }
511 
512     static class LanguageInfo {
513         static LanguageInfo INSTANCE = new LanguageInfo();
514 
515         Map<String, Set<String>> languageToScripts = new TreeMap<>();
516         Map<String, Set<String>> languageToRegions = new TreeMap<>();
517         Map<String, Comments> languageToComments = new TreeMap<>();
518 
519         Map<String, Set<String>> languageToScriptsAlt = new TreeMap<>();
520         Map<String, Set<String>> languageToRegionsAlt = new TreeMap<>();
521         Map<String, Comments> languageToCommentsAlt = new TreeMap<>();
522 
LanguageInfo()523         private LanguageInfo() {
524             cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
525             //Set<String> available = cldrFactory.getAvailable();
526             CLDRFile supplemental = cldrFactory.make("supplementalData", true);
527             for (Iterator<String> it = supplemental.iterator("//supplementalData/languageData/language"); it.hasNext();) {
528                 String xpath = it.next();
529                 XPathParts parts = XPathParts.getFrozenInstance(xpath);
530                 Map<String, String> x = parts.getAttributes(-1);
531                 boolean alt = x.containsKey("alt");
532                 String lang = x.get("type");
533                 List<String> scripts = getAttributeList(x, "scripts");
534                 if (scripts != null) {
535                     if (alt) {
536                         putAll(languageToScriptsAlt, lang, new LinkedHashSet<>(scripts));
537                     } else {
538                         putAll(languageToScripts, lang, new LinkedHashSet<>(scripts));
539                     }
540                 }
541                 List<String> regions = getAttributeList(x, "territories");
542                 if (regions != null) {
543                     if (alt) {
544                         putAll(languageToRegionsAlt, lang, new LinkedHashSet<>(regions));
545                     } else {
546                         putAll(languageToRegions, lang, new LinkedHashSet<>(regions));
547                     }
548                 }
549             }
550         }
551 
getAttributeList(Map<String, String> x, String attribute)552         private List<String> getAttributeList(Map<String, String> x, String attribute) {
553             List<String> scripts = null;
554             String scriptString = x.get(attribute);
555             if (scriptString != null) {
556                 scripts = Arrays.asList(scriptString.split("\\s+"));
557             }
558             return scripts;
559         }
560     }
561 
putUnique(Map<K, V> map, K key, V value)562     private static <K, V> void putUnique(Map<K, V> map, K key, V value) {
563         V oldValue = map.get(key);
564         if (oldValue != null && !oldValue.equals(value)) {
565             throw new IllegalArgumentException("Duplicate value for <" + key + ">: <" + oldValue + ">, <" + value + ">");
566         }
567         map.put(key, value);
568     }
569 
putAll(Map<K, Set<W>> map, K key, Set<W> values)570     private static <K, W> void putAll(Map<K, Set<W>> map, K key, Set<W> values) {
571         Set<W> oldValue = map.get(key);
572         if (oldValue == null) {
573             map.put(key, values);
574         } else {
575             oldValue.addAll(values);
576         }
577     }
578 
579     // public enum OfficialStatus {unknown, de_facto_official, official, official_regional, official_minority};
580 
581     static class RowData implements Comparable<Object> {
582         private final String countryCode;
583         private final double countryGdp;
584         private final double countryLiteracy;
585         private final double countryPopulation;
586         private final String languageCode;
587         private final OfficialStatus officialStatus;
588         private final double languagePopulation;
589         private final double languageLiteracy;
590         private final String comment;
591         private final String notes;
592         private final String badLanguageName;
593         private final boolean relativeLanguagePopulation;
594         // String badLanguageCode = "";
595         private final static Set<String> doneCountries = new HashSet<>();
596 
597         private final static Set<String> countryCodes = sc.getGoodAvailableCodes("territory");
598 
RowData(String country, String language)599         public RowData(String country, String language) {
600             this.countryCode = country;
601             this.languageCode = language;
602             badLanguageName = country = language = notes = comment = "";
603             officialStatus = OfficialStatus.unknown;
604             countryGdp = roundToPartsPer(AddPopulationData.getGdp(countryCode).doubleValue(), 1000);
605             countryLiteracy = AddPopulationData.getLiteracy(countryCode).doubleValue() / 100.0d;
606             countryPopulation = AddPopulationData.getPopulation(countryCode).doubleValue();
607             languagePopulation = languageLiteracy = Double.NaN;
608             relativeLanguagePopulation = false;
609         }
610 
RowData(List<String> row)611         RowData(List<String> row) throws ParseException {
612             countryCode = fixCountryCode(row.get(COUNTRY_CODE), row);
613 
614             if (!countryCodes.contains(countryCode)) {
615                 System.err.println("WRONG COUNTRY CODE: " + row);
616             }
617 
618             double countryPopulation1 = parseDecimal(row.get(COUNTRY_POPULATION));
619             double countryLiteracy1 = parsePercent(row.get(COUNTRY_LITERACY), countryPopulation1);
620 
621             countryGdp = roundToPartsPer(AddPopulationData.getGdp(countryCode).doubleValue(), 1000);
622             countryLiteracy = AddPopulationData.getLiteracy(countryCode).doubleValue() / 100.0d;
623             countryPopulation = AddPopulationData.getPopulation(countryCode).doubleValue();
624 
625             String officialStatusString = row.get(OFFICIAL_STATUS).trim().replace(' ', '_');
626             if (officialStatusString.equals("national")) {
627                 officialStatusString = "official";
628             } else if (officialStatusString.equals("regional_official")) {
629                 officialStatusString = "official_regional";
630             } else if (officialStatusString.length() == 0 || officialStatusString.equals("uninhabited")) {
631                 officialStatusString = "unknown";
632             }
633             try {
634                 officialStatus = OfficialStatus.valueOf(officialStatusString);
635             } catch (RuntimeException e) {
636                 throw new IllegalArgumentException("Can't interpret offical-status: " + officialStatusString);
637             }
638 
639             String languageCode1 = row.get(LANGUAGE_CODE);
640             if (languageCode1.startsWith("*") || languageCode1.startsWith("\u00A7")) {
641                 languageCode1 = languageCode1.substring(1);
642             }
643             languageCode = fixLanguageCode(languageCode1, row);
644 
645             if (doneCountries.contains(countryCode) == false) {
646                 // showDiff(countryGdp1, countryGdp);
647                 // showDiff(countryLiteracy1, countryLiteracy);
648                 if (SHOW_DIFF) showDiff(countryPopulation1, countryPopulation, 0.1, false);
649                 doneCountries.add(countryCode);
650             }
651 
652             double languagePopulation1 = parsePercent(row.get(LANGUAGE_POPULATION), countryPopulation1)
653                 * countryPopulation1;
654             if ((officialStatus.isMajor())
655                 && languagePopulation1 * 100 < countryPopulation && languagePopulation1 < 1000000) {
656                 BadItem.WARNING.show("official language has population < 1% of country & < 1,000,000", languageCode + ", " + Math.round(languagePopulation1),
657                     row);
658             }
659             if (languagePopulation1 < 0.999) {
660                 BadItem.WARNING.show("suspect language population, < 1", languageCode + ", " + Math.round(languagePopulation1), row);
661             }
662             if (languagePopulation1 > 10000) {
663                 relativeLanguagePopulation = true;
664                 languagePopulation1 = languagePopulation1 * countryPopulation / countryPopulation1; // correct the
665                 // values
666             } else {
667                 relativeLanguagePopulation = false;
668             }
669             if (isApproximatelyGreater(languagePopulation1, countryPopulation, 0.0001)) {
670                 BadItem.ERROR.show("language population > country population", Math.round(languagePopulation1) + " > " + countryPopulation, row);
671             }
672             languagePopulation = languagePopulation1 < countryPopulation ? languagePopulation1 : countryPopulation;
673 
674             if (SHOW_DIFF)
675                 showDiff(languagePopulation1 / countryPopulation1, languagePopulation / countryPopulation, 0.01, true);
676 
677             String stringLanguageLiteracy = row.size() <= LANGUAGE_LITERACY ? "" : row.get(LANGUAGE_LITERACY);
678             double languageLiteracy1 = stringLanguageLiteracy.length() == 0 ? countryLiteracy
679                 : parsePercent(stringLanguageLiteracy, languagePopulation);
680             if (isApproximatelyEqual(languageLiteracy1, countryLiteracy1, 0.001)) {
681                 languageLiteracy1 = countryLiteracy; // correct the values
682             }
683             languageLiteracy = languageLiteracy1;
684 
685             if (row.size() > COMMENT) {
686                 comment = row.get(COMMENT);
687             } else {
688                 comment = "";
689             }
690             if (row.size() > NOTES) {
691                 notes = row.get(NOTES);
692             } else {
693                 notes = "";
694             }
695             badLanguageName = row.get(BAD_LANGUAGE_NAME);
696         }
697 
showDiff(double a, double new_a, double maxRelativeDiff, boolean showLang)698         private void showDiff(double a, double new_a, double maxRelativeDiff, boolean showLang) {
699             final double diff = new_a / a - 1;
700             if (Math.abs(diff) > maxRelativeDiff) {
701                 System.out.println(formatPercent(diff, 0, false)
702                     + "\t" + countryCode + "\t" + getDisplayCountry(countryCode)
703                     + (showLang ? "\t" + languageCode + "\t" + getLanguageName(languageCode) : "")
704                     + "\t" + formatNumber(a, 0, false) + "\t=>\t" + formatNumber(new_a, 0, false));
705             }
706         }
707 
roundToPartsPer(double a, double whole)708         private double roundToPartsPer(double a, double whole) {
709             // break this out just to make it easier to follow.
710             double log10 = Math.log10(a / whole);
711             long digitsFound = (long) (log10);
712             long factor = (long) (Math.pow(10, digitsFound));
713             double rounded = Math.round(a / factor);
714             double result = rounded * factor;
715             // if (Math.abs(result - a) >= 1) {
716             // System.out.println("Rounding " + a + " => " + result);
717             // }
718             return result;
719         }
720 
isApproximatelyEqual(double a, double b, double epsilon)721         private static boolean isApproximatelyEqual(double a, double b, double epsilon) {
722             return a == b || Math.abs(a - b) < epsilon;
723         }
724 
isApproximatelyGreater(double a, double b, double epsilon)725         private static boolean isApproximatelyGreater(double a, double b, double epsilon) {
726             return a > b + epsilon;
727         }
728 
parseDecimal(String numericRepresentation)729         double parseDecimal(String numericRepresentation) throws ParseException {
730             try {
731                 // if (numericRepresentation == null || numericRepresentation.length() == 0) return Double.NaN;
732                 Number result = nf.parse(numericRepresentation);
733                 // if (result == null) return Double.NaN;
734                 return result.doubleValue();
735             } catch (ParseException e) {
736                 throw e;
737                 // (RuntimeException) new IllegalArgumentException("can't parse <" + numericRepresentation +
738                 // ">").initCause(e);
739             }
740         }
741 
parsePercent(String numericRepresentation, double baseValue)742         double parsePercent(String numericRepresentation, double baseValue) throws ParseException {
743             try {
744                 double result;
745                 if (numericRepresentation.contains("%")) {
746                     Number result0 = pf.parse(numericRepresentation);
747                     result = result0.doubleValue();
748                 } else {
749                     Number result0 = nf.parse(numericRepresentation);
750                     result = result0.doubleValue() / baseValue;
751                 }
752                 // if (numericRepresentation == null || numericRepresentation.length() == 0) return Double.NaN;
753                 // if (result == null) return Double.NaN;
754                 return result;
755             } catch (ParseException e) {
756                 throw e;
757                 // (RuntimeException) new IllegalArgumentException("can't parse <" + numericRepresentation +
758                 // ">").initCause(e);
759             }
760         }
761 
getLanguageLiteratePopulation()762         public double getLanguageLiteratePopulation() {
763             return languageLiteracy * languagePopulation;
764         }
765 
766         /**
767          * Get the weighted population
768          *
769          * @param weightIfNotOfficial
770          * @return
771          */
getLanguageLiteratePopulation(double weightIfNotOfficial)772         public double getLanguageLiteratePopulation(double weightIfNotOfficial) {
773             double result = languageLiteracy * languagePopulation;
774             if (!officialStatus.isMajor()) {
775                 result *= weightIfNotOfficial;
776             }
777             return result;
778         }
779 
780         @Override
compareTo(Object o)781         public int compareTo(Object o) {
782             RowData that = (RowData) o;
783             int result;
784             if (0 != (result = GENERAL_COLLATOR.compare(countryCode, that.countryCode))) return result;
785             if (languagePopulation > that.languagePopulation) return -1; // descending
786             if (languagePopulation < that.languagePopulation) return 1;
787             if (0 != (result = GENERAL_COLLATOR.compare(languageCode, that.languageCode))) return result;
788             return 0;
789         }
790 
toStringHeader()791         public static String toStringHeader() {
792             return "countryCode" + "\t" + "countryPopulation" + "\t" + "countryGdp"
793                 + "\t" + "countryLiteracy"
794                 + "\t" + "languagePopulation" + "\t" + "languageCode"
795                 + "\t" + "writingPopulation";
796         }
797 
798         @Override
toString()799         public String toString() {
800             return countryCode + "\t" + countryPopulation + "\t" + countryGdp
801                 + "\t" + countryLiteracy
802                 + "\t" + languagePopulation + "\t" + languageCode
803                 + "\t" + languageLiteracy;
804         }
805 
toString(boolean b)806         public String toString(boolean b) {
807             return "region:\t" + getCountryCodeAndName(countryCode)
808                 + "\tpop:\t" + countryPopulation
809                 + "\tgdp:\t" + countryGdp
810                 + "\tlit:\t" + countryLiteracy
811                 + "\tlang:\t" + getLanguageCodeAndName(languageCode)
812                 + "\tpop:\t" + languagePopulation
813                 + "\tlit:\t" + languageLiteracy;
814         }
815 
816         static boolean MARK_OUTPUT = false;
817 
getRickLanguageCode()818         public String getRickLanguageCode() {
819             if (languageCode.contains("_")) return languageCode;
820             Source source = Iso639Data.getSource(languageCode);
821             if (source == null) {
822                 return "§" + languageCode;
823             }
824             if (MARK_OUTPUT) {
825                 if (source == Source.ISO_639_3) {
826                     return "*" + languageCode;
827                 }
828             }
829             return languageCode;
830         }
831 
832         static Map<String, String> oldToFixed = new HashMap<>();
833 
getRickLanguageName()834         public String getRickLanguageName() {
835             String cldrResult = getExcelQuote(english.getName(languageCode, true));
836 //            String result = getRickLanguageName2();
837 //            if (!result.equalsIgnoreCase(cldrResult)) {
838 //                if (null == oldToFixed.put(result, cldrResult)) {
839 //                    System.out.println("## " + result + "!=" + cldrResult);
840 //                }
841 //            }
842             return cldrResult;
843         }
844 
getRickLanguageName2()845         public String getRickLanguageName2() {
846             String result = new ULocale(languageCode).getDisplayName();
847             if (!result.equals(languageCode)) return getExcelQuote(result);
848             Set<String> names = Iso639Data.getNames(languageCode);
849             if (names != null && names.size() != 0) {
850                 if (MARK_OUTPUT) {
851                     return getExcelQuote("*" + names.iterator().next());
852                 } else {
853                     return getExcelQuote(names.iterator().next());
854                 }
855             }
856             return getExcelQuote("§" + badLanguageName);
857         }
858 
getCountryName()859         public String getCountryName() {
860             return getExcelQuote(getDisplayCountry(countryCode));
861         }
862 
getCountryGdpString()863         public String getCountryGdpString() {
864             return getExcelQuote(formatNumber(countryGdp, 0, false));
865         }
866 
getCountryLiteracyString()867         public String getCountryLiteracyString() {
868             return formatPercent(countryLiteracy, 2, false);
869         }
870 
getCountryPopulationString()871         public String getCountryPopulationString() {
872             return getExcelQuote(formatNumber(countryPopulation, 0, false));
873         }
874 
getLanguageLiteracyString()875         public String getLanguageLiteracyString() {
876             return formatPercent(languageLiteracy, 2, false);
877         }
878 
getLanguagePopulationString()879         public String getLanguagePopulationString() {
880 
881             try {
882                 final double percent = languagePopulation / countryPopulation;
883                 return getExcelQuote(relativeLanguagePopulation
884                     && percent > 0.03
885                     && languagePopulation > 10000
886                         ? formatPercent(percent, 2, false)
887                         : formatNumber(languagePopulation, 3, false));
888             } catch (IllegalArgumentException e) {
889                 return "NaN";
890             }
891         }
892 
getLanguagePopulation()893         private double getLanguagePopulation() {
894             return languagePopulation;
895         }
896 
897     }
898 
getExcelQuote(String comment)899     public static String getExcelQuote(String comment) {
900         return comment == null || comment.length() == 0 ? ""
901             : comment.contains(",") ? '"' + comment + '"'
902                 : comment.contains("\"") ? '"' + comment.replace("\"", "\"\"") + '"'
903                     : comment;
904     }
905 
getCountryCodeAndName(String code)906     public static String getCountryCodeAndName(String code) {
907         if (code == null) return null;
908         return english.getName(CLDRFile.TERRITORY_NAME, code) + " [" + code + "]";
909     }
910 
911     static class RickComparator implements Comparator<RowData> {
912         @Override
compare(RowData me, RowData that)913         public int compare(RowData me, RowData that) {
914             int result;
915             if (0 != (result = GENERAL_COLLATOR.compare(me.getCountryName(), that.getCountryName()))) return result;
916             if (0 != (result = GENERAL_COLLATOR.compare(me.getRickLanguageName(), that.getRickLanguageName())))
917                 return result;
918             return me.compareTo(that);
919         }
920     }
921 
writeTerritoryLanguageData(PrintWriter out, List<String> failures, Set<RowData> sortedInput)922     private static void writeTerritoryLanguageData(PrintWriter out, List<String> failures, Set<RowData> sortedInput) {
923 
924         String lastCountryCode = "";
925         boolean first = true;
926         LanguageTagParser ltp = new LanguageTagParser();
927 
928         out.println(" <!-- See http://unicode.org/cldr/data/diff/supplemental/territory_language_information.html for more information on territoryInfo. -->");
929         out.println("\t<territoryInfo>");
930 
931         for (RowData row : sortedInput) {
932             String countryCode = row.countryCode;
933 
934             double countryPopulationRaw = row.countryPopulation;
935             double countryPopulation = countryPopulationRaw; // (long) Utility.roundToDecimals(countryPopulationRaw, 2);
936             double languageLiteracy = row.languageLiteracy;
937             double countryLiteracy = row.countryLiteracy;
938 
939             double countryGDPRaw = row.countryGdp;
940             long countryGDP = Math.round(countryGDPRaw / gdpFactor);
941 
942             String languageCode = row.languageCode;
943 
944             double languagePopulationRaw = row.getLanguagePopulation();
945             double languagePopulation = languagePopulationRaw; // (long) Utility.roundToDecimals(languagePopulationRaw,
946             // 2);
947 
948             double languagePopulationPercent = languagePopulation / countryPopulation;
949             // Utility.roundToDecimals(Math.min(100, Math.max(0,
950             // languagePopulation*100 / (double)countryPopulation)),3);
951 
952             if (!countryCode.equals(lastCountryCode)) {
953                 if (first) {
954                     first = false;
955                 } else {
956                     out.println("\t\t</territory>");
957                 }
958                 out.print("\t\t<territory type=\"" + countryCode + "\""
959                     + " gdp=\"" + formatNumber(countryGDP, 4, true) + "\""
960                     + " literacyPercent=\"" + formatPercent(countryLiteracy, 3, true) + "\""
961                     + " population=\"" + formatNumber(countryPopulation, 6, true) + "\">");
962                 lastCountryCode = countryCode;
963                 out.println("\t<!--" + getDisplayCountry(countryCode) + "-->");
964             }
965 
966             if (languageCode.length() != 0
967                 && languagePopulationPercent > 0.0000
968                 && (ALLOW_SMALL_NUMBERS || languagePopulationPercent >= 1 || languagePopulationRaw > 100000
969                     || languageCode.equals("haw") || row.officialStatus.isOfficial())) {
970                 // add best case
971                 addBestRegion(languageCode, countryCode, languagePopulationRaw);
972                 String baseScriptLanguage = ltp.set(languageCode).getLanguageScript();
973                 if (!baseScriptLanguage.equals(languageCode)) {
974                     addBestRegion(baseScriptLanguage, countryCode, languagePopulationRaw);
975                 }
976                 String baseLanguage = ltp.set(baseScriptLanguage).getLanguage();
977                 if (!baseLanguage.equals(baseScriptLanguage)) {
978                     addBestRegion(baseLanguage, countryCode, languagePopulationRaw);
979                     addBestScript(baseLanguage, ltp.set(languageCode).getScript(), languagePopulationRaw);
980                 }
981 
982                 if (languageLiteracy != countryLiteracy) {
983                     int debug = 0;
984                 }
985                 out.print("\t\t\t<languagePopulation type=\""
986                     + languageCode
987                     + "\""
988                     + (DoubleMath.fuzzyCompare(languageLiteracy, countryLiteracy, 0.0001) == 0 ? ""
989                         : (DoubleMath.fuzzyCompare(languageLiteracy, 0.05, 0.0001) == 0 ? " writingPercent=\"" : " literacyPercent=\"")
990                             + formatPercent(languageLiteracy, 2, true) + "\"")
991                     + " populationPercent=\"" + formatPercent(languagePopulationPercent, 2, true) + "\""
992                     + (row.officialStatus.isOfficial() ? " officialStatus=\"" + row.officialStatus + "\"" : "")
993                     + references.addReference(row.notes)
994                     + "/>");
995                 out.println("\t<!--" + getLanguageName(languageCode) + "-->");
996             } else if (!row.countryCode.equals("ZZ")) {
997                 failures.add(BadItem.ERROR.toString("too few speakers: suspect line", languageCode, row.toString(true)));
998             }
999             // if (first) {
1000             if (false) System.out.print(
1001                 "countryCode: " + countryCode + "\t"
1002                     + "countryPopulation: " + countryPopulation + "\t"
1003                     + "countryGDP: " + countryGDP + "\t"
1004                     + "languageCode: " + languageCode + "\t"
1005                     + "languagePopulation: " + languagePopulation + CldrUtility.LINE_SEPARATOR);
1006             // }
1007         }
1008 
1009         out.println("\t\t</territory>");
1010         out.println("\t</territoryInfo>");
1011     }
1012 
getDisplayCountry(String countryCode)1013     private static String getDisplayCountry(String countryCode) {
1014         String result = getULocaleCountryName(countryCode);
1015         if (!result.equals(countryCode)) {
1016             return result;
1017         }
1018         result = sc.getData("territory", countryCode);
1019         if (result != null) {
1020             return result;
1021         }
1022         return countryCode;
1023         // new ULocale("und-" + countryCode).getDisplayCountry()
1024     }
1025 
getDisplayScript(String scriptCode)1026     private static String getDisplayScript(String scriptCode) {
1027         String result = getULocaleScriptName(scriptCode);
1028         if (!result.equals(scriptCode)) {
1029             return result;
1030         }
1031         result = sc.getData("territory", scriptCode);
1032         if (result != null) {
1033             return result;
1034         }
1035         return scriptCode;
1036         // new ULocale("und-" + countryCode).getDisplayCountry()
1037     }
1038 
getLanguageName(String languageCode)1039     private static String getLanguageName(String languageCode) {
1040         String result = getULocaleLocaleName(languageCode);
1041         if (!result.equals(languageCode)) return result;
1042         Set<String> names = Iso639Data.getNames(languageCode);
1043         if (names != null && names.size() != 0) {
1044             return names.iterator().next();
1045         }
1046         return languageCode;
1047     }
1048 
1049     static class References {
1050         Map<String, Pair<String, String>> Rxxx_to_reference = new TreeMap<>();
1051         Map<Pair<String, String>, String> reference_to_Rxxx = new TreeMap<>();
1052         Map<String, Pair<String, String>> Rxxx_to_oldReferences = supplementalData.getReferences();
1053         Map<Pair<String, String>, String> oldReferences_to_Rxxx = new TreeMap<>();
1054         {
1055             for (String Rxxx : Rxxx_to_oldReferences.keySet()) {
Rxxx_to_oldReferences.get(Rxxx)1056                 oldReferences_to_Rxxx.put(Rxxx_to_oldReferences.get(Rxxx), Rxxx);
1057             }
1058         }
1059         Matcher URI = PatternCache.get("([a-z]+\\://[\\S]+)\\s?(.*)").matcher("");
1060 
1061         static int referenceStart = 1000;
1062 
1063         /**
1064          * Returns " references=\"" + Rxxx + "\"" or "" if there is no reference.
1065          *
1066          * @param rawReferenceText
1067          * @return
1068          */
addReference(String rawReferenceText)1069         private String addReference(String rawReferenceText) {
1070             if (rawReferenceText == null || rawReferenceText.length() == 0) return "";
1071             Pair<String, String> p;
1072             if (URI.reset(rawReferenceText).matches()) {
1073                 p = new Pair<>(URI.group(1), URI.group(2) == null || URI.group(2).length() == 0 ? "[missing]"
1074                     : URI.group(2)).freeze();
1075             } else {
1076                 p = new Pair<String, String>(null, rawReferenceText).freeze();
1077             }
1078 
1079             String Rxxx = reference_to_Rxxx.get(p);
1080             if (Rxxx == null) { // add new
1081                 Rxxx = oldReferences_to_Rxxx.get(p);
1082                 if (Rxxx != null) { // if old, just keep number
1083                     p = Rxxx_to_oldReferences.get(Rxxx);
1084                 } else { // find an empty number
1085                     while (true) {
1086                         Rxxx = "R" + (referenceStart++);
1087                         if (Rxxx_to_reference.get(Rxxx) == null && Rxxx_to_oldReferences.get(Rxxx) == null) {
1088                             break;
1089                         }
1090                     }
1091                 }
1092                 // add to new references
1093                 reference_to_Rxxx.put(p, Rxxx);
1094                 Rxxx_to_reference.put(Rxxx, p);
1095             }
1096             // references="R034"
1097             return " references=\"" + Rxxx + "\"";
1098         }
1099 
getReferenceHTML(String Rxxx)1100         String getReferenceHTML(String Rxxx) {
1101             Pair<String, String> p = Rxxx_to_reference.get(Rxxx); // exception if fails.
1102             String uri = p.getFirst();
1103             String value = p.getSecond();
1104             uri = uri == null ? "" : " uri=\"" + TransliteratorUtilities.toHTML.transliterate(uri) + "\"";
1105             value = value == null ? "[missing]" : TransliteratorUtilities.toHTML.transliterate(value);
1106             return "\t\t<reference type=\"" + Rxxx + "\"" + uri + ">" + value + "</reference>";
1107         }
1108 
printReferences(PrintWriter out)1109         void printReferences(PrintWriter out) {
1110             // <reference type="R034" uri="isbn:0-321-18578-1">The Unicode Standard 4.0</reference>
1111             out.println("\t<references>");
1112             for (String Rxxx : Rxxx_to_reference.keySet()) {
1113                 out.println(getReferenceHTML(Rxxx));
1114             }
1115             out.println("\t</references>");
1116         }
1117     }
1118 
1119     static References references = new References();
1120 
getExcelData(List<String> failures, Map<String, RowData> localeToRowData)1121     private static Set<RowData> getExcelData(List<String> failures, Map<String, RowData> localeToRowData)
1122         throws IOException {
1123 
1124         LanguageTagParser ltp = new LanguageTagParser();
1125 
1126         String dir = CLDRPaths.GEN_DIRECTORY + "supplemental/";
1127         final String ricksFile = "country_language_population_raw.txt";
1128         System.out.println("\n# Problems in " + ricksFile + "\n");
1129         List<List<String>> input = SpreadSheet.convert(CldrUtility.getUTF8Data(ricksFile));
1130 
1131         Set<String> languages = languagesNeeded; // sc.getGoodAvailableCodes("language");
1132 
1133         Set<String> territories = new TreeSet<>(sc.getGoodAvailableCodes("territory"));
1134         territories.removeAll(supplementalData.getContainers());
1135         territories.remove("EU");
1136         territories.remove("QO");
1137 
1138         Set<String> countriesNotFound = new TreeSet<>(territories);
1139         Set<OfficialStatus> statusFound = new TreeSet<>();
1140         Set<String> countriesWithoutOfficial = new TreeSet<>(territories);
1141         countriesWithoutOfficial.remove("ZZ");
1142 
1143         Map<String, Row.R2<String, Double>> countryToLargestOfficialLanguage = new HashMap<>();
1144 
1145         Set<String> languagesNotFound = new TreeSet<>(languages);
1146         Set<RowData> sortedInput = new TreeSet<>();
1147         int count = 0;
1148         for (List<String> row : input) {
1149             ++count;
1150             if (count == 1 || row.size() <= COUNTRY_GDP) {
1151                 failures.add(join(row, "\t") + "\tShort row");
1152                 continue;
1153             }
1154             try {
1155                 RowData x = new RowData(row);
1156                 if (x.officialStatus.isOfficial()) {
1157                     Row.R2<String, Double> largestOffical = countryToLargestOfficialLanguage.get(x.countryCode);
1158                     if (largestOffical == null) {
1159                         countryToLargestOfficialLanguage.put(x.countryCode,
1160                             Row.of(x.languageCode, x.languagePopulation));
1161                     } else if (largestOffical.get1() < x.languagePopulation) {
1162                         largestOffical.set0(x.languageCode);
1163                         largestOffical.set1(x.languagePopulation);
1164                     }
1165                 }
1166                 if (x.officialStatus.isMajor() || x.countryPopulation < 1000) {
1167                     countriesWithoutOfficial.remove(x.countryCode);
1168                 }
1169                 if (!checkCode(LstrType.region, x.countryCode, row)) continue;
1170                 statusFound.add(x.officialStatus);
1171                 countriesNotFound.remove(x.countryCode);
1172                 languagesNotFound.remove(x.languageCode);
1173                 if (x.languageCode.contains("_")) {
1174                     ltp.set(x.languageCode);
1175                     languagesNotFound.remove(ltp.getLanguage());
1176                     if (!checkCode(LstrType.language, ltp.getLanguage(), row)) continue;
1177                     if (!checkCode(LstrType.script, ltp.getScript(), row)) continue;
1178                 }
1179                 String locale = x.languageCode + "_" + x.countryCode;
1180                 if (localeToRowData.get(locale) != null) {
1181                     BadItem.ERROR.show("duplicate data", x.languageCode + " with " + x.countryCode, row);
1182                 }
1183                 localeToRowData.put(locale, x);
1184                 sortedInput.add(x);
1185             } catch (ParseException e) {
1186                 failures.add(join(row, "\t") + "\t" + e.getMessage() + "\t"
1187                     + join(Arrays.asList(e.getStackTrace()), ";\t"));
1188             } catch (RuntimeException e) {
1189                 throw (RuntimeException) new IllegalArgumentException("Failure on line " + count + ")\t" + row)
1190                     .initCause(e);
1191             }
1192         }
1193         // System.out.println("Note: the following Status values were found in the data: " +
1194         // CldrUtility.join(statusFound, " | "));
1195 
1196         // make sure we have something
1197         for (String country : countriesNotFound) {
1198             RowData x = new RowData(country, "und");
1199             sortedInput.add(x);
1200         }
1201         for (String language : languagesNotFound) {
1202             RowData x = new RowData("ZZ", language);
1203             sortedInput.add(x);
1204         }
1205 
1206         for (RowData row : sortedInput) {
1207             // see which countries have languages that are larger than any offical language
1208 
1209             if (!row.officialStatus.isOfficial()) {
1210                 //String country = row.countryCode;
1211                 Row.R2<String, Double> largestOffical = countryToLargestOfficialLanguage.get(row.countryCode);
1212                 if (largestOffical != null && largestOffical.get1() < row.languagePopulation) {
1213                     BadItem.WARNING.show("language population > all official languages", getLanguageCodeAndName(largestOffical.get0()), row.toString(true));
1214                 }
1215             }
1216 
1217             // see which countries are missing an official language
1218             if (!countriesWithoutOfficial.contains(row.countryCode)) continue;
1219             BadItem.ERROR.show("missing official language", row.getCountryName() + "\t" + row.countryCode, row.toString(true));
1220             countriesWithoutOfficial.remove(row.countryCode);
1221         }
1222 
1223         // write out file for rick
1224         PrintWriter log = FileUtilities.openUTF8Writer(dir, ricksFile);
1225         log.println(
1226             "*\tCName" +
1227                 "\tCCode" +
1228                 "\tCPopulation" +
1229                 "\tCLiteracy" +
1230                 "\tCGdp" +
1231                 "\tOfficialStatus" +
1232                 "\tLanguage" +
1233                 "\tLCode" +
1234                 "\tLPopulation" +
1235                 "\tWritingPop" +
1236                 "\tReferences" +
1237                 "\tNotes");
1238         RickComparator rickSorting = new RickComparator();
1239         Set<RowData> rickSorted = new TreeSet<>(rickSorting);
1240         rickSorted.addAll(sortedInput);
1241 
1242         for (RowData row : rickSorted) {
1243             final String langLit = row.getLanguageLiteracyString();
1244             final String countryLit = row.getCountryLiteracyString();
1245             log.println(
1246                 row.getCountryName()
1247                     + "\t" + row.countryCode
1248                     + "\t" + row.getCountryPopulationString()
1249                     + "\t" + countryLit
1250                     + "\t" + row.getCountryGdpString()
1251                     + "\t" + (row.officialStatus == OfficialStatus.unknown ? "" : row.officialStatus)
1252                     + "\t" + row.getRickLanguageName()
1253                     + "\t" + row.getRickLanguageCode()
1254                     + "\t" + row.getLanguagePopulationString()
1255                     + "\t" + (langLit.equals(countryLit) ? "" : langLit)
1256                     + "\t" + getExcelQuote(row.comment)
1257                     + "\t" + getExcelQuote(row.notes));
1258         }
1259         log.close();
1260         return sortedInput;
1261     }
1262 
getCldrParents(Set<String> available)1263     private static Set<String> getCldrParents(Set<String> available) {
1264         LanguageTagParser ltp2 = new LanguageTagParser();
1265         Set<String> cldrParents = new TreeSet<>();
1266         for (String locale : available) {
1267             if (skipLocales.contains(locale)) continue;
1268             try {
1269                 ltp2.set(locale);
1270             } catch (RuntimeException e) {
1271                 System.out.println("Skipping CLDR file: " + locale);
1272                 continue;
1273             }
1274             String locale2 = ltp2.getLanguageScript();
1275             if (locale2.equals("sh")) continue;
1276             // int lastPos = locale.lastIndexOf('_');
1277             // if (lastPos < 0) continue;
1278             // String locale2 = locale.substring(0,lastPos);
1279             cldrParents.add(locale2);
1280             languageToMaxCountry.put(locale2, null);
1281         }
1282         //System.out.println("CLDR Parents: " + cldrParents);
1283         return cldrParents;
1284     }
1285 
showFailures(List<String> failures)1286     private static void showFailures(List<String> failures) {
1287         if (failures.size() <= 1) {
1288             return;
1289         }
1290         System.out.println();
1291         System.out.println("Failures in Output");
1292         System.out.println();
1293 
1294         System.out.println(RowData.toStringHeader());
1295         for (String failure : failures) {
1296             System.out.println(failure);
1297         }
1298     }
1299 
getProcessedParent(String localeCode)1300     public static String getProcessedParent(String localeCode) {
1301         if (localeCode == null || localeCode.equals("root")) return null;
1302         int pos = localeCode.lastIndexOf('_');
1303         if (pos < 0) return "root";
1304         LanguageTagParser ltp = new LanguageTagParser();
1305         String script = ltp.set(localeCode).getScript();
1306         if (script.length() == 0) {
1307             return getFullyResolved(localeCode);
1308         }
1309         return localeCode.substring(0, pos);
1310     }
1311 
getFullyResolved(String languageCode)1312     private static String getFullyResolved(String languageCode) {
1313         String result = defaultContent.get(languageCode);
1314         if (result != null) return result;
1315         // we missed. Try taking parent and trying again
1316         int pos = languageCode.length() + 1;
1317         while (true) {
1318             pos = languageCode.lastIndexOf('_', pos - 1);
1319             if (pos < 0) {
1320                 return "***" + languageCode;
1321             }
1322             result = defaultContent.get(languageCode.substring(0, pos));
1323             if (result != null) {
1324                 LanguageTagParser ltp = new LanguageTagParser().set(languageCode);
1325                 LanguageTagParser ltp2 = new LanguageTagParser().set(result);
1326                 String region = ltp.getRegion();
1327                 if (region.length() == 0) {
1328                     ltp.setRegion(ltp2.getRegion());
1329                 }
1330                 String script = ltp.getScript();
1331                 if (script.length() == 0) {
1332                     ltp.setScript(ltp2.getScript());
1333                 }
1334                 return ltp.toString();
1335             }
1336         }
1337     }
1338 
1339     static Comparator<Iterable> firstElementComparator = new Comparator<Iterable>() {
1340         @Override
1341         public int compare(Iterable o1, Iterable o2) {
1342             int result = ((Comparable) o1.iterator().next()).compareTo((o2.iterator().next()));
1343             assert result != 0;
1344             return result;
1345         }
1346     };
1347 
showDefaults(Set<String> cldrParents, NumberFormat nf, Map<String, String> defaultContent, Map<String, RowData> localeToRowData, Set<String> defaultLocaleContent)1348     private static void showDefaults(Set<String> cldrParents, NumberFormat nf, Map<String, String> defaultContent,
1349         Map<String, RowData> localeToRowData,
1350         Set<String> defaultLocaleContent) {
1351 
1352         if (SHOW_OLD_DEFAULT_CONTENTS) {
1353             System.out.println();
1354             System.out.println("Computing Defaults Contents");
1355             System.out.println();
1356         }
1357 
1358         Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
1359         Set<String> locales = new TreeSet<>(cldrFactory.getAvailable());
1360         LocaleIDParser lidp = new LocaleIDParser();
1361 
1362         // add all the combinations of language, script, and territory.
1363         for (String locale : localeToRowData.keySet()) {
1364             String baseLanguage = lidp.set(locale).getLanguage();
1365             if (locales.contains(baseLanguage) && !locales.contains(locale)) {
1366                 locales.add(locale);
1367                 if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("\tadding: " + locale);
1368             }
1369         }
1370 
1371         // adding parents
1372         Set<String> toAdd = new TreeSet<>();
1373         while (true) {
1374             for (String locale : locales) {
1375                 String newguy = LocaleIDParser.getParent(locale);
1376                 if (newguy != null && !locales.contains(newguy) && !toAdd.contains(newguy)) {
1377                     toAdd.add(newguy);
1378                     if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("\tadding parent: " + newguy);
1379                 }
1380             }
1381             if (toAdd.size() == 0) {
1382                 break;
1383             }
1384             locales.addAll(toAdd);
1385             toAdd.clear();
1386         }
1387 
1388         // get sets of siblings
1389         Set<Set<String>> siblingSets = new TreeSet<>(firstElementComparator);
1390         Set<String> needsADoin = new TreeSet<>(locales);
1391 
1392         Set<String> deprecatedLanguages = new TreeSet<>();
1393         deprecatedLanguages.add("sh");
1394         Set<String> deprecatedRegions = new TreeSet<>();
1395         deprecatedRegions.add("YU");
1396         deprecatedRegions.add("CS");
1397         deprecatedRegions.add("ZZ");
1398 
1399         // first find all the language subtags that have scripts, and those we need to skip. Those are aliased-only
1400         Set<String> skippingItems = new TreeSet<>();
1401         Set<String> hasAScript = new TreeSet<>();
1402         //Set<LocaleIDParser.Level> languageOnly = EnumSet.of(LocaleIDParser.Level.Language);
1403         for (String locale : locales) {
1404             lidp.set(locale);
1405             if (lidp.getScript().length() != 0) {
1406                 hasAScript.add(lidp.getLanguage());
1407             }
1408             Set<LocaleIDParser.Level> levels = lidp.getLevels();
1409             // must have no variants, must have either script or region, no deprecated elements
1410             if (levels.contains(LocaleIDParser.Level.Variants) // no variants
1411                 || !(levels.contains(LocaleIDParser.Level.Script)
1412                     || levels.contains(LocaleIDParser.Level.Region))
1413                 || deprecatedLanguages.contains(lidp.getLanguage())
1414                 || deprecatedRegions.contains(lidp.getRegion())) {
1415                 // skip language-only locales, and ones with variants
1416                 needsADoin.remove(locale);
1417                 skippingItems.add(locale);
1418                 if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("\tremoving: " + locale);
1419                 continue;
1420             }
1421         }
1422         // walk through the locales, getting the ones we care about.
1423         Map<String, Double> scriptLocaleToLanguageLiteratePopulation = new TreeMap<>();
1424 
1425         for (String locale : new TreeSet<>(needsADoin)) {
1426             if (!needsADoin.contains(locale)) continue;
1427             lidp.set(locale);
1428             Set<Level> level = lidp.getLevels();
1429             // skip locales that need scripts and don't have them
1430             if (!level.contains(LocaleIDParser.Level.Script) // no script
1431                 && hasAScript.contains(lidp.getLanguage())) {
1432                 needsADoin.remove(locale);
1433                 skippingItems.add(locale);
1434                 continue;
1435             }
1436             // get siblings
1437             Set<String> siblingSet = lidp.getSiblings(needsADoin);
1438             // if it has a script and region
1439             if (level.contains(LocaleIDParser.Level.Script) && level.contains(LocaleIDParser.Level.Region)) {
1440                 double languageLiteratePopulation = 0;
1441                 for (String localeID2 : siblingSet) {
1442                     RowData rowData = localeToRowData.get(localeID2);
1443                     if (rowData != null) {
1444                         languageLiteratePopulation += rowData.getLanguageLiteratePopulation(NON_OFFICIAL_WEIGHT);
1445                     }
1446                 }
1447                 String parentID = LocaleIDParser.getParent(locale);
1448                 scriptLocaleToLanguageLiteratePopulation.put(parentID, languageLiteratePopulation);
1449             }
1450 
1451             try {
1452                 siblingSets.add(siblingSet);
1453             } catch (RuntimeException e) {
1454                 e.printStackTrace();
1455             }
1456             needsADoin.removeAll(siblingSet);
1457         }
1458         if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("ConvertLanguageData Skipping: " + skippingItems);
1459         if (needsADoin.size() != 0) {
1460             if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("Missing: " + needsADoin);
1461         }
1462 
1463         // walk through the data
1464         Set<String> skippingSingletons = new TreeSet<>();
1465 
1466         Set<String> missingData = new TreeSet<>();
1467         for (Set<String> siblingSet : siblingSets) {
1468             if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("** From siblings: " + siblingSet);
1469 
1470             if (false & siblingSet.size() == 1) {
1471                 skippingSingletons.add(siblingSet.iterator().next());
1472                 continue;
1473             }
1474             // get best
1475             double best = Double.NEGATIVE_INFINITY;
1476             String bestLocale = "???";
1477             Set<Pair<Double, String>> data = new TreeSet<>();
1478             LanguageTagParser ltp = new LanguageTagParser();
1479             for (String locale : siblingSet) {
1480                 RowData rowData = localeToRowData.get(locale);
1481                 double languageLiteratePopulation = -1;
1482                 if (rowData != null) {
1483                     languageLiteratePopulation = rowData.getLanguageLiteratePopulation(NON_OFFICIAL_WEIGHT);
1484                 } else {
1485                     Double d = scriptLocaleToLanguageLiteratePopulation.get(locale);
1486                     if (d != null) {
1487                         languageLiteratePopulation = d;
1488                     } else {
1489                         final String region = ltp.set(locale).getRegion();
1490                         if (region.isEmpty() || StandardCodes.isCountry(region)) {
1491                             missingData.add(locale);
1492                         }
1493                     }
1494                 }
1495                 data.add(new Pair<>(languageLiteratePopulation, locale));
1496                 if (best < languageLiteratePopulation) {
1497                     best = languageLiteratePopulation;
1498                     bestLocale = locale;
1499                 }
1500             }
1501             // show it
1502             for (Pair<Double, String> datum : data) {
1503                 if (SHOW_OLD_DEFAULT_CONTENTS)
1504                     System.out.format(
1505                         "\tContenders: %s %f (based on literate population)" + CldrUtility.LINE_SEPARATOR,
1506                         datum.getSecond(), datum.getFirst());
1507             }
1508             // System.out.format("\tPicking default content: %s %f (based on literate population)" +
1509             // Utility.LINE_SEPARATOR, bestLocale, best);
1510             // Hack to fix English
1511             // TODO Generalize in the future for other locales with non-primary scripts
1512             if (bestLocale.startsWith("en_")) {
1513                 defaultLocaleContent.add("en_US");
1514             } else {
1515                 defaultLocaleContent.add(bestLocale);
1516             }
1517         }
1518 
1519         for (String singleton : skippingSingletons) {
1520             BadItem.WARNING.show("skipping Singletons", singleton);
1521         }
1522         for (String missing : missingData) {
1523             BadItem.WARNING.show("Missing Data", missing);
1524         }
1525 
1526         // LanguageTagParser ltp = new LanguageTagParser();
1527         // Set<String> warnings = new LinkedHashSet();
1528         // for (String languageCode : languageToMaxCountry.keySet()) {
1529         // CodeAndPopulation best = languageToMaxCountry.get(languageCode);
1530         // String languageSubtag = ltp.set(languageCode).getLanguage();
1531         // String countryCode = "ZZ";
1532         // double rawLanguagePopulation = -1;
1533         // if (best != null) {
1534         // countryCode = best.code;
1535         // rawLanguagePopulation = best.population;
1536         // Set<String> regions = LanguageInfo.INSTANCE.languageToRegions.get(languageSubtag);
1537         // if (regions == null || !regions.contains(countryCode)) {
1538         // Set<String> regions2 = LanguageInfo.INSTANCE.languageToRegionsAlt.get(languageSubtag);
1539         // if (regions2 == null || !regions2.contains(countryCode)) {
1540         // warnings.add("WARNING: " + languageCode + " => " + countryCode + ", not in " + regions + "/" + regions2);
1541         // }
1542         // }
1543         // }
1544         // String resolvedLanguageCode = languageCode + "_" + countryCode;
1545         // ltp.set(languageCode);
1546         // Set<String> scripts = LanguageInfo.INSTANCE.languageToScripts.get(languageCode);
1547         // String script = ltp.getScript();
1548         // if (script.length() == 0) {
1549         // CodeAndPopulation bestScript = languageToMaxScript.get(languageCode);
1550         // if (bestScript != null) {
1551         // script = bestScript.code;
1552         // if (scripts == null || !scripts.contains(script)) {
1553         // warnings.add("WARNING: " + languageCode + " => " + script + ", not in " + scripts);
1554         // }
1555         // } else {
1556         // script = "Zzzz";
1557         // if (scripts == null) {
1558         // scripts = LanguageInfo.INSTANCE.languageToScriptsAlt.get(languageCode);
1559         // }
1560         // if (scripts != null) {
1561         // script = scripts.iterator().next();
1562         // if (scripts.size() != 1) {
1563         // warnings.add("WARNING: " + languageCode + " => " + scripts);
1564         // }
1565         // }
1566         // }
1567         // if (scripts == null) {
1568         // warnings.add("Missing scripts for: " + languageCode);
1569         // } else if (scripts.size() == 1){
1570         // script = "";
1571         // }
1572         // resolvedLanguageCode = languageCode
1573         // + (script.length() == 0 ? "" : "_" + script)
1574         // + "_" + countryCode;
1575         // }
1576         //
1577         //
1578         // System.out.println(
1579         // resolvedLanguageCode
1580         // + "\t" + languageCode
1581         // + "\t" + ULocale.getDisplayName(languageCode, ULocale.ENGLISH)
1582         // + "\t" + countryCode
1583         // + "\t" + ULocale.getDisplayCountry("und_" + countryCode, ULocale.ENGLISH)
1584         // + "\t" + formatNumber(rawLanguagePopulation)
1585         // + (cldrParents.contains(languageCode) ? "\tCLDR" : "")
1586         // );
1587         // if (languageCode.length() == 0) continue;
1588         // defaultContent.put(languageCode, resolvedLanguageCode);
1589         // }
1590         // for (String warning : warnings) {
1591         // System.out.println(warning);
1592         // }
1593     }
1594 
1595     // private static void printDefaultContent(Set<String> defaultLocaleContent) {
1596     // String sep = Utility.LINE_SEPARATOR + "\t\t\t";
1597     // String broken = Utility.breakLines(join(defaultLocaleContent," "), sep, PatternCache.get("(\\S)\\S*").matcher(""),
1598     // 80);
1599     //
1600     // Log.println("\t\t<defaultContent locales=\"" + broken + "\"");
1601     // Log.println("\t\t/>");
1602     // }
1603 
    /**
     * Unimplemented stub: ignores {@code languageCode} and always returns {@code null}.
     */
    private static Object getSuppressScript(String languageCode) {
        // TODO Auto-generated method stub
        return null;
    }
1608 
join(Collection c, String separator)1609     public static String join(Collection c, String separator) {
1610         StringBuffer result = new StringBuffer();
1611         boolean first = true;
1612         for (Object x : c) {
1613             if (first)
1614                 first = false;
1615             else
1616                 result.append(separator);
1617             result.append(x);
1618         }
1619         return result.toString();
1620     }
1621 
    /**
     * Records {@code countryCode} as the best region for {@code languageCode} in
     * {@code languageToMaxCountry}; delegates to {@code addBest}.
     */
    private static void addBestRegion(String languageCode, String countryCode, double languagePopulationRaw) {
        addBest(languageCode, languagePopulationRaw, countryCode, languageToMaxCountry);
    }
1625 
    /**
     * Records {@code scriptCode} as the best script for {@code languageCode} in
     * {@code languageToMaxScript}; delegates to {@code addBest}.
     */
    private static void addBestScript(String languageCode, String scriptCode, double languagePopulationRaw) {
        addBest(languageCode, languagePopulationRaw, scriptCode, languageToMaxScript);
    }
1629 
addBest(String languageCode, double languagePopulationRaw, String code, Map<String, CodeAndPopulation> languageToMaxCode)1630     private static void addBest(String languageCode, double languagePopulationRaw, String code,
1631         Map<String, CodeAndPopulation> languageToMaxCode) {
1632         if (languageCode.length() == 0) {
1633             throw new IllegalArgumentException();
1634         }
1635         CodeAndPopulation best = languageToMaxCode.get(languageCode);
1636         if (best == null) {
1637             languageToMaxCode.put(languageCode, best = new CodeAndPopulation());
1638         } else if (best.population >= languagePopulationRaw) {
1639             return;
1640         }
1641         best.population = languagePopulationRaw;
1642         best.code = code;
1643     }
1644 
1645     static class CodeAndPopulation {
1646         String code = null;
1647         double population = Double.NaN;
1648 
1649         @Override
toString()1650         public String toString() {
1651             return "{" + code + "," + population + "}";
1652         }
1653     }
1654 
1655     static public class GeneralCollator implements Comparator<String> {
1656         static UTF16.StringComparator cpCompare = new UTF16.StringComparator(true, false, 0);
1657         static RuleBasedCollator UCA = (RuleBasedCollator) Collator
1658             .getInstance(ULocale.ROOT);
1659         static {
1660             UCA.setNumericCollation(true);
1661         }
1662 
1663         @Override
compare(String s1, String s2)1664         public int compare(String s1, String s2) {
1665             if (s1 == null) {
1666                 return s2 == null ? 0 : -1;
1667             } else if (s2 == null) {
1668                 return 1;
1669             }
1670             int result = UCA.compare(s1, s2);
1671             if (result != 0) return result;
1672             return cpCompare.compare(s1, s2);
1673         }
1674     }
1675 
1676     public static class InverseComparator<T> implements Comparator<T> {
1677         private Comparator<T> other;
1678 
InverseComparator()1679         public InverseComparator() {
1680             this.other = null;
1681         }
1682 
InverseComparator(Comparator<T> other)1683         public InverseComparator(Comparator<T> other) {
1684             this.other = other;
1685         }
1686 
1687         @Override
compare(T a, T b)1688         public int compare(T a, T b) {
1689             return other == null
1690                 ? ((Comparable) b).compareTo(a)
1691                 : other.compare(b, a);
1692         }
1693     }
1694 
    // Sorted set of language codes, built by splitting the literal below on whitespace.
    static Set<String> languagesNeeded = new TreeSet<>(
        Arrays
            .asList("ab ba bh bi bo fj fy gd ha ht ik iu ks ku ky lg mi na no rm sa sd sg si sm sn su tg tk to tw vo yi za lb dv chr syr kha sco gv"
                .split("\\s")));
1699 
1700     /**
1701      * Not called?
1702      */
1703     @Deprecated
generateIso639_2Data(PrintWriter out)1704     private static void generateIso639_2Data(PrintWriter out) {
1705         for (String languageSubtag : sc.getAvailableCodes("language")) {
1706             String alpha3 = Iso639Data.toAlpha3(languageSubtag);
1707             Type type = Iso639Data.getType(languageSubtag);
1708             Scope scope = Iso639Data.getScope(languageSubtag);
1709             if (type != null || alpha3 != null || scope != null) {
1710                 out.println("\t\t<languageCode type=\"" + languageSubtag + "\"" +
1711                     (alpha3 == null ? "" : " iso639Alpha3=\"" + alpha3 + "\"") +
1712                     (type == null ? "" : " iso639Type=\"" + type + "\"") +
1713                     (scope == null ? "" : " iso639Scope=\"" + scope + "\"") +
1714                     "/>");
1715             }
1716 
1717         }
1718     }
1719 
    // Relation from a language code to its BasicLanguageData entries (TreeMap keys, TreeSet values).
    static Relation<String, BasicLanguageData> language2BasicLanguageData = Relation.of(new TreeMap<String, Set<BasicLanguageData>>(), TreeSet.class);

    // Per language: relation from script status to script codes. Re-created on every getLanguage2Scripts call.
    static Map<String, Relation<BasicLanguageData.Type, String>> language_status_scripts;
    // Reference text keyed by (language, script) pair; populated by getLanguage2Scripts.
    static Map<Pair<String, String>, String> language_script_references = new TreeMap<>();

    // Alias data from SupplementalDataInfo; its "language" entry is consulted to detect deprecated language codes.
    static final Map<String, Map<String, R2<List<String>, String>>> LOCALE_ALIAS_INFO = SupplementalDataInfo
        .getInstance().getLocaleAliasInfo();
1727 
getLanguage2Scripts(Set<RowData> sortedInput)1728     static void getLanguage2Scripts(Set<RowData> sortedInput) throws IOException {
1729         language_status_scripts = new TreeMap<>();
1730 
1731         // // get current scripts
1732         // Relation<String,String> languageToDefaultScript = new Relation(new TreeMap(), TreeSet.class);
1733         // Relation<String,String> secondaryLanguageToDefaultScript = new Relation(new TreeMap(), TreeSet.class);
1734         // for (String languageSubtag : language2BasicLanguageData.keySet()) {
1735         // for (BasicLanguageData item : language2BasicLanguageData.getAll(languageSubtag)) {
1736         // for (String script : item.getScripts()) {
1737         // addLanguage2Script(languageSubtag, item.getType(), script);
1738         // }
1739         // }
1740         // }
1741         // System.out.println("Language 2 scripts: " + language_status_scripts);
1742 
1743         // #Lcode LanguageName Status Scode ScriptName References
1744         List<List<String>> input = SpreadSheet.convert(CldrUtility.getUTF8Data("language_script_raw.txt"));
1745         System.out.println(CldrUtility.LINE_SEPARATOR + "# Problems in language_script_raw.txt"
1746             + CldrUtility.LINE_SEPARATOR);
1747         //int count = -1;
1748         for (List<String> row : input) {
1749             try {
1750                 if (row.size() == 0) continue;
1751                 //++count;
1752                 String language = row.get(0).trim();
1753                 if (language.length() == 0 || language.startsWith("#")) continue;
1754                 BasicLanguageData.Type status = BasicLanguageData.Type.valueOf(row.get(2));
1755                 String scripts = row.get(3);
1756                 if (!checkCode(LstrType.language, language, row)) continue;
1757                 for (String script : scripts.split("\\s+")) {
1758                     if (!checkCode(LstrType.script, script, row)) continue;
1759                     // if the script is not modern, demote
1760                     Info scriptInfo = ScriptMetadata.getInfo(script);
1761                     if (scriptInfo == null) {
1762                         BadItem.ERROR.toString("illegal script; must be represented in Unicode, remove line or fix", script, row);
1763                         continue;
1764                     }
1765                     IdUsage idUsage = scriptInfo.idUsage;
1766                     if (status == BasicLanguageData.Type.primary && idUsage != IdUsage.RECOMMENDED) {
1767                         if (idUsage == IdUsage.ASPIRATIONAL || idUsage == IdUsage.LIMITED_USE) {
1768                             BadItem.WARNING.toString("Script has unexpected usage; make secondary if a Recommended script is used widely for the langauge",
1769                                 idUsage + ", " + script + "=" + getULocaleScriptName(script), row);
1770                         } else {
1771                             BadItem.ERROR.toString("Script is not modern; make secondary", idUsage + ", " + script + "=" + getULocaleScriptName(script), row);
1772                             status = BasicLanguageData.Type.secondary;
1773                         }
1774                     }
1775 
1776                     // if the language is not modern, demote
1777                     if (LOCALE_ALIAS_INFO.get("language").containsKey(language)) {
1778                         BadItem.ERROR.toString("Remove/Change deprecated language", language + " "
1779                             + getLanguageName(language) + "; " + LOCALE_ALIAS_INFO.get("language").get(language), row);
1780                         continue;
1781                     }
1782                     if (status == BasicLanguageData.Type.primary && !sc.isModernLanguage(language)) {
1783                         BadItem.ERROR.toString("Should be secondary, language is not modern", language + " " + getLanguageName(language), row);
1784                         status = BasicLanguageData.Type.secondary;
1785                     }
1786 
1787                     addLanguage2Script(language, status, script);
1788                     if (row.size() > 5) {
1789                         String reference = row.get(5);
1790                         if (reference != null && reference.length() == 0) {
1791                             language_script_references.put(new Pair<>(language, script), reference);
1792                         }
1793                     }
1794                 }
1795             } catch (RuntimeException e) {
1796                 System.err.println(row);
1797                 throw e;
1798             }
1799         }
1800 
1801         // System.out.println("Language 2 scripts: " + language_status_scripts);
1802 
1803         for (String language : sc.getGoodAvailableCodes("language")) {
1804             if (supplementalData.getDeprecatedInfo("language", language) != null) {
1805                 continue;
1806             }
1807             Map<String, String> registryData = sc.getLangData("language", language);
1808             if (registryData != null) {
1809                 String suppressScript = registryData.get("Suppress-Script");
1810                 if (suppressScript == null) continue;
1811                 if (ScriptMetadata.getInfo(suppressScript) == null) {
1812                     // skip, not represented in Unicode
1813                     continue;
1814                 }
1815                 // if there is something already there, we have a problem.
1816                 Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language);
1817                 if (status_scripts == null) {
1818                     System.out
1819                         .println("Missing Suppress-Script: " + language + "\tSuppress-Script:\t" + suppressScript);
1820                 } else if (!status_scripts.values().contains(suppressScript)) {
1821                     System.out.println("Missing Suppress-Script: " + language + "\tSuppress-Script:\t" + suppressScript
1822                         + "\tall:\t" + status_scripts.values());
1823                 } else {
1824                     // at this point, the suppressScript is in the union of the primary and secondary.
1825                     Set<String> primaryScripts = status_scripts.getAll(BasicLanguageData.Type.primary);
1826                     if (primaryScripts != null && !primaryScripts.contains(suppressScript)) {
1827                         System.out.println("Suppress-Script is not in primary: " + language + "\tSuppress-Script:\t"
1828                             + suppressScript + "\tprimary:\t"
1829                             + primaryScripts);
1830                     }
1831                 }
1832                 addLanguage2Script(language, BasicLanguageData.Type.primary, suppressScript);
1833             }
1834         }
1835 
1836         // remove primaries from secondaries
1837         // check for primaries for scripts
1838         for (String language : language_status_scripts.keySet()) {
1839             Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language);
1840             Set<String> secondaryScripts = status_scripts.getAll(BasicLanguageData.Type.secondary);
1841             if (secondaryScripts == null) continue;
1842             Set<String> primaryScripts = status_scripts.getAll(BasicLanguageData.Type.primary);
1843             if (primaryScripts == null) {
1844                 // status_scripts.putAll(BasicLanguageData.Type.primary, secondaryScripts);
1845                 // status_scripts.removeAll(BasicLanguageData.Type.secondary);
1846                 if (sc.isModernLanguage(language)) {
1847                     BadItem.ERROR.show("modern language without primary script, might need to edit moribund_languages.txt", language + " "
1848                         + getLanguageName(language));
1849                 }
1850             } else {
1851                 status_scripts.removeAll(BasicLanguageData.Type.secondary, primaryScripts);
1852             }
1853         }
1854 
1855         // check that every living language in the row data has a script
1856         Set<String> livingLanguagesWithTerritories = new TreeSet<>();
1857         for (RowData rowData : sortedInput) {
1858             String language = rowData.languageCode;
1859             if (sc.isModernLanguage(language) && Iso639Data.getSource(language) != Iso639Data.Source.ISO_639_3) {
1860                 livingLanguagesWithTerritories.add(language);
1861             }
1862         }
1863         for (String language : livingLanguagesWithTerritories) {
1864             Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language);
1865             if (status_scripts != null) {
1866                 Set<String> primaryScripts = status_scripts.getAll(BasicLanguageData.Type.primary);
1867                 if (primaryScripts != null && primaryScripts.size() > 0) {
1868                     continue;
1869                 }
1870             }
1871             if (language.equals("tw")) continue; // TODO load aliases and check...
1872             BadItem.WARNING.show("ISO 639-1/2 language in language-territory list without primary script", language + "\t" + getLanguageName(language));
1873         }
1874 
1875         // System.out.println("Language 2 scripts: " + language_status_scripts);
1876     }
1877 
checkScript(String script)1878     private static boolean checkScript(String script) {
1879         // TODO Auto-generated method stub
1880         return false;
1881     }
1882 
1883     static Validity VALIDITY = Validity.getInstance();
1884 
checkCode(LstrType type, String code, List<String> sourceLine)1885     private static boolean checkCode(LstrType type, String code, List<String> sourceLine) {
1886         Status validity = VALIDITY.getCodeToStatus(type).get(code);
1887         if (validity == Status.regular) {
1888             return true;
1889         } else if (validity == Status.unknown && type == LstrType.region) {
1890             return true;
1891         }
1892         BadItem.ERROR.show("Illegitimate Code", type + ": " + code + " = " + validity, sourceLine);
1893         return false;
1894     }
1895 
addLanguage2Script(String language, BasicLanguageData.Type type, String script)1896     private static void addLanguage2Script(String language, BasicLanguageData.Type type, String script) {
1897         Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language);
1898         if (status_scripts == null)
1899             language_status_scripts.put(language, status_scripts = Relation.of(new TreeMap<BasicLanguageData.Type, Set<String>>(), TreeSet.class));
1900         status_scripts.put(type, script);
1901     }
1902 
addLanguageScriptData()1903     static void addLanguageScriptData() throws IOException {
1904         // check to make sure that every language subtag is in 639-3
1905         Set<String> langRegistryCodes = sc.getGoodAvailableCodes("language");
1906         // Set<String> iso639_2_missing = new TreeSet(langRegistryCodes);
1907         // iso639_2_missing.removeAll(Iso639Data.getAvailable());
1908         // iso639_2_missing.remove("root");
1909         // if (iso639_2_missing.size() != 0) {
1910         // for (String missing : iso639_2_missing){
1911         // System.out.println("*ERROR in StandardCodes* Missing Lang/Script data:\t" + missing + ", " +
1912         // sc.getData("language", missing));
1913         // }
1914         // }
1915 
1916         // Map<String, String> nameToTerritoryCode = new TreeMap();
1917         // for (String territoryCode : sc.getGoodAvailableCodes("territory")) {
1918         // nameToTerritoryCode.put(sc.getData("territory", territoryCode).toLowerCase(), territoryCode);
1919         // }
1920         // nameToTerritoryCode.put("iran", nameToTerritoryCode.get("iran, islamic republic of")); //
1921 
1922         //BasicLanguageData languageData = new BasicLanguageData();
1923 
1924         BufferedReader in = CldrUtility.getUTF8Data("extraLanguagesAndScripts.txt");
1925         while (true) {
1926             String line = in.readLine();
1927             if (line == null) break;
1928             String[] parts = line.split("\\t");
1929             String alpha3 = parts[0];
1930             alpha3 = stripBrackets(alpha3);
1931             String languageSubtag = Iso639Data.fromAlpha3(alpha3);
1932             if (languageSubtag == null) {
1933                 if (langRegistryCodes.contains(alpha3)) {
1934                     languageSubtag = alpha3;
1935                 } else {
1936                     BadItem.WARNING.show("Language subtag not found on line", alpha3, line);
1937                     continue;
1938                 }
1939             }
1940             //String name = parts[1];
1941             Set<String> names = Iso639Data.getNames(languageSubtag);
1942             if (names == null) {
1943                 Map<String, String> name2 = sc.getLangData("language", languageSubtag);
1944                 if (name2 != null) {
1945                     String name3 = name2.get("Description");
1946                     if (name3 != null) {
1947                         names = new TreeSet<>();
1948                         names.add(name3);
1949                     }
1950                 }
1951             }
1952             // if (names == null || !names.contains(name)) {
1953             // System.out.println("Name <" + name + "> for <" + languageSubtag + "> not found in " + names);
1954             // }
1955 
1956             // names all straight, now get scripts and territories
1957             // [Cyrl]; [Latn]
1958             Set<String> fullScriptList = sc.getGoodAvailableCodes("script");
1959 
1960             String[] scriptList = parts[2].split("[;,]\\s*");
1961             Set<String> scripts = new TreeSet<>();
1962             Set<String> scriptsAlt = new TreeSet<>();
1963             for (String script : scriptList) {
1964                 if (script.length() == 0) continue;
1965                 boolean alt = false;
1966                 if (script.endsWith("*")) {
1967                     alt = true;
1968                     script = script.substring(0, script.length() - 1);
1969                 }
1970                 script = stripBrackets(script);
1971                 if (!fullScriptList.contains(script)) {
1972                     System.out.println("Script <" + script + "> for <" + languageSubtag + "> not found in "
1973                         + fullScriptList);
1974                 } else if (alt) {
1975                     scriptsAlt.add(script);
1976                 } else {
1977                     scripts.add(script);
1978                 }
1979             }
1980             // now territories
1981             Set<String> territories = new TreeSet<>();
1982             if (parts.length > 4) {
1983                 String[] territoryList = parts[4].split("\\s*[;,-]\\s*");
1984                 for (String territoryName : territoryList) {
1985                     if (territoryName.equals("ISO/DIS 639") || territoryName.equals("3")) continue;
1986                     String territoryCode = CountryCodeConverter.getCodeFromName(territoryName, true);
1987                     if (territoryCode == null) {
1988                         BadItem.ERROR.show("no name found for territory", "<" + territoryName + ">", languageSubtag);
1989                     } else {
1990                         territories.add(territoryCode);
1991                     }
1992                 }
1993             }
1994             // <language type="de" scripts="Latn" territories="IT" alt="secondary"/>
1995             // we're going to go ahead and set these all to secondary.
1996             if (scripts.size() != 0) {
1997                 language2BasicLanguageData.put(languageSubtag,
1998                     new BasicLanguageData().setType(BasicLanguageData.Type.secondary).setScripts(scripts)
1999                         .setTerritories(territories));
2000             }
2001             if (scriptsAlt.size() != 0) {
2002                 language2BasicLanguageData.put(languageSubtag,
2003                     new BasicLanguageData().setType(BasicLanguageData.Type.secondary).setScripts(scriptsAlt)
2004                         .setTerritories(territories));
2005             }
2006         }
2007         in.close();
2008 
2009         // add other data
2010         for (String languageSubtag : supplementalData.getBasicLanguageDataLanguages()) {
2011             Set<BasicLanguageData> otherData = supplementalData.getBasicLanguageData(languageSubtag);
2012             language2BasicLanguageData.putAll(languageSubtag, otherData);
2013         }
2014     }
2015 
2016     // private static void showAllBasicLanguageData(Relation<String, BasicLanguageData> language2basicData, String
2017     // comment) {
2018     // // now print
2019     // Relation<String, String> primaryCombos = new Relation(new TreeMap(), TreeSet.class);
2020     // Relation<String, String> secondaryCombos = new Relation(new TreeMap(), TreeSet.class);
2021     //
2022     // Log.println("\t<languageData>" + (comment == null ? "" : " <!-- " + comment + " -->"));
2023     //
2024     // for (String languageSubtag : language2basicData.keySet()) {
2025     // String duplicate = "";
2026     // // script,territory
2027     // primaryCombos.clear();
2028     // secondaryCombos.clear();
2029     //
2030     // for (BasicLanguageData item : language2basicData.getAll(languageSubtag)) {
2031     // Set<String> scripts = item.getScripts();
2032     // if (scripts.size() == 0) scripts = new TreeSet(Arrays.asList(new String[] { "Zzzz" }));
2033     // for (String script : scripts) {
2034     // Set<String> territories = item.getTerritories();
2035     // if (territories.size() == 0) territories = new TreeSet(Arrays.asList(new String[] { "ZZ" }));
2036     // for (String territory : territories) {
2037     // if (item.getType().equals(BasicLanguageData.Type.primary)) {
2038     // primaryCombos.put(script, territory);
2039     // } else {
2040     // secondaryCombos.put(script, territory);
2041     // }
2042     // }
2043     // }
2044     // }
2045     // secondaryCombos.removeAll(primaryCombos);
2046     // showBasicLanguageData(languageSubtag, primaryCombos, null, BasicLanguageData.Type.primary);
2047     // showBasicLanguageData(languageSubtag, secondaryCombos, primaryCombos.keySet(),
2048     // BasicLanguageData.Type.secondary);
2049     // // System.out.println(item.toString(languageSubtag) + duplicate);
2050     // // duplicate = " <!-- " + "**" + " -->";
2051     // }
2052     // Log.println("\t</languageData>");
2053     // }
2054 
showBasicLanguageData(PrintWriter out, String languageSubtag, Relation<String, String> primaryCombos, Set<String> suppressEmptyScripts, BasicLanguageData.Type type)2055     private static void showBasicLanguageData(PrintWriter out, String languageSubtag, Relation<String, String> primaryCombos,
2056         Set<String> suppressEmptyScripts, BasicLanguageData.Type type) {
2057         Set<String> scriptsWithSameTerritories = new TreeSet<>();
2058         Set<String> lastTerritories = Collections.emptySet();
2059         for (String script : primaryCombos.keySet()) {
2060             Set<String> territories = primaryCombos.getAll(script);
2061             if (lastTerritories == Collections.EMPTY_SET) {
2062                 // skip first
2063             } else if (lastTerritories.equals(territories)) {
2064                 scriptsWithSameTerritories.add(script);
2065             } else {
2066                 showBasicLanguageData2(out, languageSubtag, scriptsWithSameTerritories, suppressEmptyScripts,
2067                     lastTerritories, type);
2068                 scriptsWithSameTerritories.clear();
2069             }
2070             lastTerritories = territories;
2071             scriptsWithSameTerritories.add(script);
2072         }
2073         showBasicLanguageData2(out, languageSubtag, scriptsWithSameTerritories, suppressEmptyScripts, lastTerritories, type);
2074     }
2075 
showBasicLanguageData2(PrintWriter out, String languageSubtag, Set<String> scripts, Set<String> suppressEmptyScripts, Set<String> territories, BasicLanguageData.Type type)2076     private static void showBasicLanguageData2(PrintWriter out, String languageSubtag, Set<String> scripts,
2077         Set<String> suppressEmptyScripts, Set<String> territories, BasicLanguageData.Type type) {
2078         scripts.remove("Zzzz");
2079         territories.remove("ZZ");
2080         if (territories.size() == 0 && suppressEmptyScripts != null) {
2081             scripts.removeAll(suppressEmptyScripts);
2082         }
2083         if (scripts.size() == 0 && territories.size() == 0) return;
2084         out.println("\t\t<language type=\"" + languageSubtag + "\"" +
2085             (scripts.size() == 0 ? "" : " scripts=\"" + CldrUtility.join(scripts, " ") + "\"") +
2086             (territories.size() == 0 ? "" : " territories=\"" + CldrUtility.join(territories, " ") + "\"") +
2087             (type == BasicLanguageData.Type.primary ? "" : " alt=\"" + type + "\"") +
2088             "/>");
2089     }
2090 
2091     /*
2092      * System.out.println(
2093      * "\t\t<language type=\"" + languageSubtag + "\"" +
2094      * " scripts=\"" + Utility.join(scripts," ") + "\"" +
2095      * (territories.size() == 0 ? "" : " territories=\"" + Utility.join(territories," ") + "\"") +
2096      * "/>"
2097      * );
2098      */
2099 
stripBrackets(String alpha3)2100     private static String stripBrackets(String alpha3) {
2101         if (alpha3.startsWith("[") && alpha3.endsWith("]")) {
2102             alpha3 = alpha3.substring(1, alpha3.length() - 1);
2103         }
2104         return alpha3;
2105     }
2106 
2107     static NumberFormat nf = NumberFormat.getInstance(ULocale.ENGLISH);
2108     static NumberFormat nf_no_comma = NumberFormat.getInstance(ULocale.ENGLISH);
2109     static {
2110         nf_no_comma.setGroupingUsed(false);
2111     }
2112     static NumberFormat pf = NumberFormat.getPercentInstance(ULocale.ENGLISH);
2113 
formatNumber(double original, int roundDigits, boolean xml)2114     public static String formatNumber(double original, int roundDigits, boolean xml) {
2115         double d = original;
2116         if (roundDigits != 0) {
2117             d = CldrUtility.roundToDecimals(original, roundDigits);
2118         }
2119         if (Double.isNaN(d)) {
2120             d = CldrUtility.roundToDecimals(original, roundDigits);
2121             throw new IllegalArgumentException("Double is NaN");
2122         }
2123         if (xml) {
2124             return nf_no_comma.format(d);
2125         }
2126         return nf.format(d);
2127     }
2128 
formatPercent(double d, int roundDigits, boolean xml)2129     public static String formatPercent(double d, int roundDigits, boolean xml) {
2130         if (roundDigits != 0) {
2131             d = CldrUtility.roundToDecimals(d, roundDigits);
2132         }
2133         if (xml) {
2134             nf_no_comma.setMaximumFractionDigits(roundDigits + 2);
2135             return nf_no_comma.format(d * 100.0);
2136         }
2137         pf.setMaximumFractionDigits(roundDigits + 2);
2138         return pf.format(d);
2139     }
2140 
2141     static final LanguageTagCanonicalizer languageTagCanonicalizer = new LanguageTagCanonicalizer();
2142 
fixLanguageCode(String languageCodeRaw, List<String> row)2143     private static String fixLanguageCode(String languageCodeRaw, List<String> row) {
2144         String languageCode = languageTagCanonicalizer.transform(languageCodeRaw);
2145         if (DEBUG && !languageCode.equals(languageCodeRaw)) {
2146             System.out.println("## " + languageCodeRaw + " => " + languageCode);
2147         }
2148         int bar = languageCode.indexOf('_');
2149         String script = "";
2150         if (bar >= 0) {
2151             script = languageCode.substring(bar);
2152             languageCode = languageCode.substring(0, bar);
2153         }
2154         R2<List<String>, String> replacement = supplementalData.getLocaleAliasInfo().get("language").get(languageCode);
2155         if (replacement != null) {
2156             String replacementCode = replacement.get0().get(0);
2157             BadItem.ERROR.show("deprecated language code", languageCode + " => " + replacementCode, row);
2158             languageCode = replacementCode;
2159         }
2160         if (!sc.getAvailableCodes("language").contains(languageCode)) {
2161             BadItem.ERROR.show("bad language code", languageCode, row);
2162         }
2163         return languageCode + script;
2164     }
2165 
2166     enum BadItem {
2167         ERROR, WARNING, DETAIL;
2168 
show(String problem, String details, String... items)2169         void show(String problem, String details, String... items) {
2170             System.out.println(toString(problem, details, items));
2171         }
2172 
show(String problem, String details, List<String> row)2173         void show(String problem, String details, List<String> row) {
2174             System.out.println(toString(problem, details, row));
2175         }
2176 
toString(String problem, String details, String... items)2177         private String toString(String problem, String details, String... items) {
2178             return toString(problem, details, Arrays.asList(items));
2179         }
2180 
toString(String problem, String details, List<String> row)2181         private String toString(String problem, String details, List<String> row) {
2182             return "* " + this
2183                 + " *\t" + problem + ":"
2184                 + "\t" + details
2185                 + (row != null && row.size() > 0 ? "\t" + Joiner.on("\t").join(row) : "");
2186         }
2187     }
2188 
fixCountryCode(String countryCode, List<String> row)2189     private static String fixCountryCode(String countryCode, List<String> row) {
2190         R2<List<String>, String> replacement = supplementalData.getLocaleAliasInfo().get("territory").get(countryCode);
2191         if (replacement != null) {
2192             String replacementCode = replacement.get0().get(0);
2193             BadItem.ERROR.show("deprecated territory code", countryCode + " => " + replacementCode, row);
2194             countryCode = replacementCode;
2195         }
2196         if (!sc.getAvailableCodes("territory").contains(countryCode)) {
2197             BadItem.ERROR.show("bad territory code", countryCode, row);
2198         }
2199         return countryCode;
2200     }
2201 
getULocaleLocaleName(String languageCode)2202     private static String getULocaleLocaleName(String languageCode) {
2203         return english.getName(languageCode, true);
2204         //return new ULocale(languageCode).getDisplayName();
2205     }
2206 
getULocaleScriptName(String scriptCode)2207     private static String getULocaleScriptName(String scriptCode) {
2208         return english.getName(CLDRFile.SCRIPT_NAME, scriptCode);
2209         // return ULocale.getDisplayScript("und_" + scriptCode, ULocale.ENGLISH);
2210     }
2211 
getULocaleCountryName(String countryCode)2212     private static String getULocaleCountryName(String countryCode) {
2213         return english.getName(CLDRFile.TERRITORY_NAME, countryCode);
2214         //return ULocale.getDisplayCountry("und_" + countryCode, ULocale.ENGLISH);
2215     }
2216 }
2217