1 package org.unicode.cldr.tool; 2 3 import java.io.BufferedReader; 4 import java.io.File; 5 import java.io.IOException; 6 import java.io.PrintWriter; 7 import java.nio.file.Files; 8 import java.text.ParseException; 9 import java.util.ArrayList; 10 import java.util.Arrays; 11 import java.util.Collection; 12 import java.util.Collections; 13 import java.util.Comparator; 14 import java.util.EnumMap; 15 import java.util.HashMap; 16 import java.util.HashSet; 17 import java.util.Iterator; 18 import java.util.LinkedHashSet; 19 import java.util.List; 20 import java.util.Map; 21 import java.util.Set; 22 import java.util.TreeMap; 23 import java.util.TreeSet; 24 import java.util.regex.Matcher; 25 26 import org.unicode.cldr.draft.FileUtilities; 27 import org.unicode.cldr.draft.ScriptMetadata; 28 import org.unicode.cldr.draft.ScriptMetadata.IdUsage; 29 import org.unicode.cldr.draft.ScriptMetadata.Info; 30 import org.unicode.cldr.util.Builder; 31 import org.unicode.cldr.util.CLDRFile; 32 import org.unicode.cldr.util.CLDRPaths; 33 import org.unicode.cldr.util.CldrUtility; 34 import org.unicode.cldr.util.Factory; 35 import org.unicode.cldr.util.Iso639Data; 36 import org.unicode.cldr.util.Iso639Data.Scope; 37 import org.unicode.cldr.util.Iso639Data.Source; 38 import org.unicode.cldr.util.Iso639Data.Type; 39 import org.unicode.cldr.util.LanguageTagCanonicalizer; 40 import org.unicode.cldr.util.LanguageTagParser; 41 import org.unicode.cldr.util.LocaleIDParser; 42 import org.unicode.cldr.util.LocaleIDParser.Level; 43 import org.unicode.cldr.util.Pair; 44 import org.unicode.cldr.util.PatternCache; 45 import org.unicode.cldr.util.SpreadSheet; 46 import org.unicode.cldr.util.StandardCodes; 47 import org.unicode.cldr.util.StandardCodes.LstrType; 48 import org.unicode.cldr.util.SupplementalDataInfo; 49 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData; 50 import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus; 51 import 
org.unicode.cldr.util.SupplementalDataInfo.PopulationData; 52 import org.unicode.cldr.util.TransliteratorUtilities; 53 import org.unicode.cldr.util.Validity; 54 import org.unicode.cldr.util.Validity.Status; 55 import org.unicode.cldr.util.XPathParts; 56 import org.unicode.cldr.util.XPathParts.Comments; 57 58 import com.google.common.base.Joiner; 59 import com.google.common.collect.ImmutableSet; 60 import com.google.common.math.DoubleMath; 61 import com.ibm.icu.impl.Relation; 62 import com.ibm.icu.impl.Row; 63 import com.ibm.icu.impl.Row.R2; 64 import com.ibm.icu.text.Collator; 65 import com.ibm.icu.text.NumberFormat; 66 import com.ibm.icu.text.RuleBasedCollator; 67 import com.ibm.icu.text.UTF16; 68 import com.ibm.icu.util.ULocale; 69 70 /** 71 * @author markdavis 72 * 73 */ 74 public class ConvertLanguageData { 75 76 private static final boolean DEBUG = false; 77 // change this if you need to override what is generated for the default contents. 78 private static final List<String> defaultOverrides = Arrays.asList("es_ES".split("\\s+")); 79 80 public static final boolean SHOW_DIFF = false; 81 82 private static final boolean ALLOW_SMALL_NUMBERS = true; 83 84 static final Comparator<String> GENERAL_COLLATOR = new GeneralCollator(); 85 static final Comparator<String> INVERSE_GENERAL = new InverseComparator<>(GENERAL_COLLATOR); 86 87 private static StandardCodes sc = StandardCodes.make(); 88 89 static final double populationFactor = 1; 90 static final double gdpFactor = 1; 91 static final int BAD_COUNTRY_NAME = 0, COUNTRY_CODE = 1, COUNTRY_POPULATION = 2, COUNTRY_LITERACY = 3, 92 COUNTRY_GDP = 4, OFFICIAL_STATUS = 5, BAD_LANGUAGE_NAME = 6, LANGUAGE_CODE = 7, LANGUAGE_POPULATION = 8, 93 LANGUAGE_LITERACY = 9, COMMENT = 10, NOTES = 11; 94 static final Map<String, CodeAndPopulation> languageToMaxCountry = new TreeMap<>(); 95 static final Map<String, CodeAndPopulation> languageToMaxScript = new TreeMap<>(); 96 97 private static final double NON_OFFICIAL_WEIGHT = 0.40; 98 99 
private static final boolean SHOW_OLD_DEFAULT_CONTENTS = false; 100 101 private static final ImmutableSet<String> scriptAssumedLocales = ImmutableSet.of( 102 "bm_ML", "ha_GH", "ha_NE", "ha_NG", "kk_KZ", "ks_IN", "ky_KG", "mn_MN", "ms_BN", "ms_MY", "ms_SG", "tk_TM", "tzm_MA", "ug_CN"); 103 104 static Set<String> skipLocales = new HashSet<>( 105 Arrays 106 .asList( 107 "sh sh_BA sh_CS sh_YU characters supplementalData supplementalData-old supplementalData-old2 supplementalData-old3 supplementalMetadata root" 108 .split("\\s"))); 109 110 static Map<String, String> defaultContent = new TreeMap<>(); 111 112 static Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*"); 113 static CLDRFile english = cldrFactory.make("en", true); 114 115 static SupplementalDataInfo supplementalData = SupplementalDataInfo 116 .getInstance(CLDRPaths.DEFAULT_SUPPLEMENTAL_DIRECTORY); 117 main(String[] args)118 public static void main(String[] args) throws IOException, ParseException { 119 final File oldSupp = new File(CLDRPaths.DEFAULT_SUPPLEMENTAL_DIRECTORY, "supplementalData.xml"); 120 final File genSupp = new File(CLDRPaths.GEN_DIRECTORY + "/supplemental", "supplementalData.xml"); 121 final File genLsraw = new File(CLDRPaths.GEN_DIRECTORY + "/supplemental", "language_script_raw.txt"); 122 try ( 123 final BufferedReader oldFile = FileUtilities.openUTF8Reader(oldSupp); 124 final PrintWriter newFile = FileUtilities.openUTF8Writer(genSupp); 125 final PrintWriter newLsraw = FileUtilities.openUTF8Writer(genLsraw); 126 ) { 127 // load elements we care about 128 CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*<languageData>\\s*"), newFile, false); 129 130 Set<String> available = cldrFactory.getAvailable(); 131 132 Set<String> cldrParents = getCldrParents(available); 133 134 List<String> failures = new ArrayList<>(); 135 Map<String, RowData> localeToRowData = new TreeMap<>(); 136 137 Set<RowData> sortedInput = getExcelData(failures, localeToRowData); 138 139 // get the locales 
(including parents) 140 Set<String> localesWithData = new TreeSet<>(localeToRowData.keySet()); 141 for (String locale : localeToRowData.keySet()) { 142 while (true) { 143 String parent = LocaleIDParser.getParent(locale); 144 if (parent == null) break; 145 localesWithData.add(parent); 146 locale = parent; 147 } 148 } 149 150 final LanguageTagParser languageTagParser = new LanguageTagParser(); 151 152 for (String localeRaw : available) { 153 String locale = languageTagCanonicalizer.transform(localeRaw); 154 if (!localesWithData.contains(locale)) { 155 CLDRFile locFile = cldrFactory.make(localeRaw, false); 156 if (locFile.isAliasedAtTopLevel()) { 157 continue; 158 } 159 if (scriptAssumedLocales.contains(locale)) { 160 continue; 161 } 162 languageTagParser.set(locale); 163 if (languageTagParser.getVariants().size() != 0) { 164 continue; 165 } 166 String withoutScript = languageTagParser.setScript("").toString(); 167 if (!localesWithData.contains(withoutScript)) { 168 String region = new LanguageTagParser().set(locale).getRegion(); 169 if (StandardCodes.isCountry(region)) { 170 BadItem.ERROR.show("missing language/population data for CLDR locale", locale + " = " + getLanguageCodeAndName(locale)); 171 } 172 } else { 173 // These exceptions are OK, because these locales by default use the non-default script 174 Set<String> OKExceptions = ImmutableSet.of("sr_Cyrl_ME", "zh_Hans_HK", "zh_Hans_MO"); 175 if (OKExceptions.contains(locale)) { 176 continue; 177 } 178 BadItem.ERROR.show("missing language/population data for CLDR locale", locale + " = " + getLanguageCodeAndName(locale) 179 + " but have data for " + getLanguageCodeAndName(withoutScript)); 180 } 181 } 182 } 183 184 // TODO sort by country code, then functionalPopulation, then language code 185 // and keep the top country for each language code (even if < 1%) 186 187 addLanguageScriptData(); 188 189 // showAllBasicLanguageData(allLanguageData, "old"); 190 getLanguage2Scripts(sortedInput); 191 192 
writeNewBasicData2(newFile, sortedInput); 193 // writeNewBasicData(sortedInput); 194 195 writeTerritoryLanguageData(newFile, failures, sortedInput); 196 197 checkBasicData(localeToRowData); 198 199 Set<String> defaultLocaleContent = new TreeSet<>(); 200 201 showDefaults(cldrParents, nf, defaultContent, localeToRowData, defaultLocaleContent); 202 203 // showContent(available); 204 205 // certain items are overridden 206 207 List<String> toRemove = new ArrayList<>(); 208 for (String override : defaultOverrides) { 209 String replacement = getReplacement(override, defaultLocaleContent); 210 if (replacement != null) { 211 toRemove.add(replacement); 212 } 213 } 214 defaultLocaleContent.removeAll(toRemove); 215 defaultLocaleContent.addAll(defaultOverrides); 216 217 showFailures(failures); 218 219 CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*</territoryInfo>\\s*"), null, false); 220 CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*<references>\\s*"), newFile, false); 221 // generateIso639_2Data(newFile); 222 references.printReferences(newFile); 223 CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*</references>\\s*"), null, false); 224 CldrUtility.copyUpTo(oldFile, null, newFile, false); 225 226 getLanguageScriptSpreadsheet(newLsraw); 227 } catch (Exception e) { 228 e.printStackTrace(); 229 } finally { 230 System.out.println("Wrote: " + genLsraw); 231 System.out.println("Wrote: " + genSupp); 232 System.out.println("Copying " + genSupp + " to " + oldSupp); 233 oldSupp.delete(); 234 Files.copy(genSupp.toPath(), oldSupp.toPath()); 235 System.out.println("DONE"); 236 } 237 } 238 getLanguageCodeAndName(String code)239 public static String getLanguageCodeAndName(String code) { 240 if (code == null) return null; 241 return english.getName(code) + " [" + code + "]"; 242 } 243 getReplacement(String oldDefault, Set<String> defaultLocaleContent)244 private static String getReplacement(String oldDefault, Set<String> defaultLocaleContent) { 245 String parent = 
LocaleIDParser.getParent(oldDefault); 246 for (String replacement : defaultLocaleContent) { 247 if (replacement.startsWith(parent)) { 248 if (parent.equals(LocaleIDParser.getParent(replacement))) { 249 return replacement; 250 } 251 } 252 } 253 return null; 254 } 255 getLanguageScriptSpreadsheet(PrintWriter out)256 private static void getLanguageScriptSpreadsheet(PrintWriter out) { 257 out.println("#Lcode\tLanguageName\tStatus\tScode\tScriptName\tReferences"); 258 Pair<String, String> languageScript = new Pair<>("", ""); 259 for (String language : language_status_scripts.keySet()) { 260 Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language); 261 for (BasicLanguageData.Type status : status_scripts.keySet()) { 262 for (String script : status_scripts.getAll(status)) { 263 String reference = language_script_references.get(languageScript.setFirst(language).setSecond( 264 script)); 265 out.println(language + "\t" + getLanguageName(language) + "\t" + status + "\t" + script + "\t" 266 + getDisplayScript(script) 267 + (reference == null ? 
"" : "\t" + reference)); 268 } 269 } 270 } 271 } 272 273 /** 274 * Write data in format: 275 * <languageData> 276 * <language type="aa" scripts="Latn" territories="DJ ER ET"/> 277 * 278 * @param sortedInput 279 */ writeNewBasicData2(PrintWriter out, Set<RowData> sortedInput)280 private static void writeNewBasicData2(PrintWriter out, Set<RowData> sortedInput) { 281 double cutoff = 0.2; // 20% 282 283 // Relation<String, BasicLanguageData> newLanguageData = new Relation(new TreeMap(), TreeSet.class); 284 LanguageTagParser ltp = new LanguageTagParser(); 285 Map<String, Relation<BasicLanguageData.Type, String>> language_status_territories = new TreeMap<>(); 286 //Map<String, Pair<String, String>> languageToBestCountry; 287 for (RowData rowData : sortedInput) { 288 if (rowData.countryCode.equals("ZZ")) continue; 289 ltp.set(rowData.languageCode); 290 String languageCode = ltp.getLanguage(); 291 Relation<BasicLanguageData.Type, String> status_territories = language_status_territories.get(languageCode); 292 if (status_territories == null) { 293 language_status_territories.put(languageCode, status_territories = Relation.of( 294 new TreeMap<BasicLanguageData.Type, Set<String>>(), 295 TreeSet.class)); 296 } 297 if (rowData.officialStatus.isMajor()) { 298 status_territories.put(BasicLanguageData.Type.primary, rowData.countryCode); 299 } else if (rowData.officialStatus.isOfficial() 300 || rowData.getLanguagePopulation() >= cutoff * rowData.countryPopulation 301 || rowData.getLanguagePopulation() >= 1000000) { 302 status_territories.put(BasicLanguageData.Type.secondary, rowData.countryCode); 303 } 304 } 305 306 Set<String> allLanguages = new TreeSet<>(language_status_territories.keySet()); 307 allLanguages.addAll(language_status_scripts.keySet()); 308 // now add all the remaining language-script info 309 // <language type="sv" scripts="Latn" territories="AX FI SE"/> 310 Set<String> warnings = new LinkedHashSet<>(); 311 out.println("\t<languageData>"); 312 for (String 
languageSubtag : allLanguages) { 313 Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(languageSubtag); 314 Relation<BasicLanguageData.Type, String> status_territories = language_status_territories 315 .get(languageSubtag); 316 317 // check against old: 318 Map<BasicLanguageData.Type, BasicLanguageData> oldData = supplementalData 319 .getBasicLanguageDataMap(languageSubtag); 320 if (oldData == null) { 321 oldData = Collections.emptyMap(); 322 } 323 324 EnumMap<BasicLanguageData.Type, BasicLanguageData> newData = new EnumMap<>( 325 BasicLanguageData.Type.class); 326 for (BasicLanguageData.Type status : BasicLanguageData.Type.values()) { 327 Set<String> scripts = status_scripts == null ? null : status_scripts.getAll(status); 328 Set<String> territories = status_territories == null ? null : status_territories.getAll(status); 329 if (scripts == null && territories == null) continue; 330 BasicLanguageData bld = new BasicLanguageData(); 331 bld.setTerritories(territories); 332 bld.setScripts(scripts); 333 bld.setType(status); 334 bld.freeze(); 335 newData.put(status, bld); 336 } 337 338 // compare 339 if (!CldrUtility.equals(oldData.entrySet(), newData.entrySet())) { 340 for (String problem : compare(oldData, newData)) { 341 warnings.add(BadItem.DETAIL.toString("changing <languageData>", languageSubtag 342 + "\t" + english.getName(languageSubtag), problem)); 343 } 344 } 345 346 for (BasicLanguageData bld : newData.values()) { 347 Set<String> scripts = bld.getScripts(); 348 Set<String> territories = bld.getTerritories(); 349 BasicLanguageData.Type status = bld.getType(); 350 out.println("\t\t<language type=\"" + languageSubtag + "\"" 351 + (scripts.isEmpty() ? "" : " scripts=\"" + CldrUtility.join(scripts, " ") + "\"") 352 + (territories.isEmpty() ? "" : " territories=\"" + CldrUtility.join(territories, " ") + "\"") 353 + (status == BasicLanguageData.Type.primary ? 
"" : " alt=\"secondary\"") 354 + "/>"); 355 } 356 } 357 out.println("\t</languageData>"); 358 for (String s : warnings) { 359 if (s.contains("!")) { 360 System.out.println(s); 361 } 362 } 363 for (String s : warnings) { 364 if (!s.contains("!")) { 365 System.out.println(s); 366 } 367 } 368 } 369 compare(Map<BasicLanguageData.Type, BasicLanguageData> oldData, Map<BasicLanguageData.Type, BasicLanguageData> newData)370 private static List<String> compare(Map<BasicLanguageData.Type, BasicLanguageData> oldData, 371 Map<BasicLanguageData.Type, BasicLanguageData> newData) { 372 Map<String, BasicLanguageData.Type> oldDataToType = getDataToType(oldData.values(), true); 373 Map<String, BasicLanguageData.Type> newDataToType = getDataToType(newData.values(), true); 374 List<String> result = new ArrayList<>(); 375 StringBuilder temp = new StringBuilder(); 376 for (String s : Builder.with(new LinkedHashSet<String>()).addAll(oldDataToType.keySet()) 377 .addAll(newDataToType.keySet()).get()) { 378 BasicLanguageData.Type oldValue = oldDataToType.get(s); 379 BasicLanguageData.Type newValue = newDataToType.get(s); 380 if (!CldrUtility.equals(oldValue, newValue)) { 381 temp.setLength(0); 382 temp.append("[").append(s).append(":") 383 .append(english.getName(s.length() == 4 ? "script" : "region", s)).append("] "); 384 if (oldValue == null) { 385 temp.append(" added as ").append(newValue); 386 } else if (newValue == null) { 387 temp.append(" REMOVED!"); 388 } else if (oldValue == BasicLanguageData.Type.primary) { 389 temp.append(" DOWNGRADED TO! 
").append(newValue); 390 } else { 391 temp.append(" upgraded to ").append(newValue); 392 } 393 result.add(temp.toString()); 394 } 395 } 396 result.add(newData.toString()); 397 return result; 398 } 399 getDataToType( Collection<BasicLanguageData> collection, boolean script)400 private static Map<String, BasicLanguageData.Type> getDataToType( 401 Collection<BasicLanguageData> collection, boolean script) { 402 Map<String, BasicLanguageData.Type> result = new TreeMap<>(); 403 for (BasicLanguageData i : collection) { 404 for (String s : i.getScripts()) { 405 result.put(s, i.getType()); 406 } 407 for (String s : i.getTerritories()) { 408 result.put(s, i.getType()); 409 } 410 } 411 return result; 412 } 413 checkBasicData(Map<String, RowData> localeToRowData)414 private static void checkBasicData(Map<String, RowData> localeToRowData) { 415 // find languages with multiple scripts 416 Relation<String, String> languageToScripts = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class); 417 for (String languageSubtag : language2BasicLanguageData.keySet()) { 418 for (BasicLanguageData item : language2BasicLanguageData.getAll(languageSubtag)) { 419 languageToScripts.putAll(StandardCodes.fixLanguageTag(languageSubtag), item.getScripts()); 420 } 421 } 422 // get primary combinations 423 Set<String> primaryCombos = new TreeSet<>(); 424 Set<String> basicCombos = new TreeSet<>(); 425 for (String languageSubtag : language2BasicLanguageData.keySet()) { 426 for (BasicLanguageData item : language2BasicLanguageData.getAll(languageSubtag)) { 427 Set<String> scripts = new TreeSet<>(); 428 scripts.addAll(item.getScripts()); 429 languageToScripts.putAll(StandardCodes.fixLanguageTag(languageSubtag), scripts); 430 if (scripts.size() == 0) { 431 scripts.add("Zzzz"); 432 } 433 Set<String> territories = new TreeSet<>(); 434 territories.addAll(item.getTerritories()); 435 if (territories.size() == 0) { 436 territories.add("ZZ"); 437 continue; 438 } 439 440 for (String script : scripts) { 441 
for (String territory : territories) { 442 String locale = StandardCodes.fixLanguageTag(languageSubtag) 443 // + (script.equals("Zzzz") ? "" : languageToScripts.getAll(languageSubtag).size() <= 1 ? "" 444 // : "_" + script) 445 + (territories.equals("ZZ") ? "" : "_" + territory); 446 if (item.getType() != BasicLanguageData.Type.secondary) { 447 primaryCombos.add(locale); 448 } 449 basicCombos.add(locale); 450 } 451 } 452 } 453 } 454 Set<String> populationOver20 = new TreeSet<>(); 455 Set<String> population = new TreeSet<>(); 456 LanguageTagParser ltp = new LanguageTagParser(); 457 for (String rawLocale : localeToRowData.keySet()) { 458 ltp.set(rawLocale); 459 String locale = ltp.getLanguage() + (ltp.getRegion().length() == 0 ? "" : "_" + ltp.getRegion()); 460 population.add(locale); 461 RowData rowData = localeToRowData.get(rawLocale); 462 if (rowData.getLanguagePopulation() / rowData.countryPopulation >= 0.2 463 //|| rowData.getLanguagePopulation() > 900000 464 ) { 465 populationOver20.add(locale); 466 } else { 467 PopulationData popData = supplementalData.getLanguageAndTerritoryPopulationData( 468 ltp.getLanguageScript(), ltp.getRegion()); 469 if (popData != null && popData.getOfficialStatus().isOfficial()) { 470 populationOver20.add(locale); 471 } 472 } 473 } 474 Set<String> inBasicButNotPopulation = new TreeSet<>(primaryCombos); 475 476 inBasicButNotPopulation.removeAll(population); 477 for (String locale : inBasicButNotPopulation) { 478 ltp.set(locale); 479 String region = ltp.getRegion(); 480 String language = ltp.getLanguage(); 481 if (!sc.isModernLanguage(language)) continue; 482 PopulationData popData = supplementalData.getPopulationDataForTerritory(region); 483 // Afghanistan AF "29,928,987" 28.10% "21,500,000,000" Hazaragi haz "1,770,000" 28.10% 484 BadItem.WARNING.show("In Basic Data but not Population > 20%", 485 getDisplayCountry(region) 486 + "\t" + region 487 + "\t\"" + formatNumber(popData.getPopulation(), 0, false) + "\"" 488 + "\t\"" + 
formatPercent(popData.getLiteratePopulation() / popData.getPopulation(), 0, false) 489 + "\"" 490 + "\t\"" + formatPercent(popData.getGdp(), 0, false) + "\"" 491 + "\t" + "" 492 + "\t" + getLanguageName(language) 493 + "\t" + language 494 + "\t" + -1 495 + "\t\"" + formatPercent(popData.getLiteratePopulation() / popData.getPopulation(), 0, false) 496 + "\""); 497 } 498 499 Set<String> inPopulationButNotBasic = new TreeSet<>(populationOver20); 500 inPopulationButNotBasic.removeAll(basicCombos); 501 for (Iterator<String> it = inPopulationButNotBasic.iterator(); it.hasNext();) { 502 String locale = it.next(); 503 if (locale.endsWith("_ZZ")) { 504 it.remove(); 505 } 506 } 507 for (String locale : inPopulationButNotBasic) { 508 BadItem.WARNING.show("In Population>20% but not Basic Data", locale + " " + getLanguageName(locale), localeToRowData.get(locale).toString()); 509 } 510 } 511 512 static class LanguageInfo { 513 static LanguageInfo INSTANCE = new LanguageInfo(); 514 515 Map<String, Set<String>> languageToScripts = new TreeMap<>(); 516 Map<String, Set<String>> languageToRegions = new TreeMap<>(); 517 Map<String, Comments> languageToComments = new TreeMap<>(); 518 519 Map<String, Set<String>> languageToScriptsAlt = new TreeMap<>(); 520 Map<String, Set<String>> languageToRegionsAlt = new TreeMap<>(); 521 Map<String, Comments> languageToCommentsAlt = new TreeMap<>(); 522 LanguageInfo()523 private LanguageInfo() { 524 cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*"); 525 //Set<String> available = cldrFactory.getAvailable(); 526 CLDRFile supplemental = cldrFactory.make("supplementalData", true); 527 for (Iterator<String> it = supplemental.iterator("//supplementalData/languageData/language"); it.hasNext();) { 528 String xpath = it.next(); 529 XPathParts parts = XPathParts.getFrozenInstance(xpath); 530 Map<String, String> x = parts.getAttributes(-1); 531 boolean alt = x.containsKey("alt"); 532 String lang = x.get("type"); 533 List<String> scripts = 
getAttributeList(x, "scripts"); 534 if (scripts != null) { 535 if (alt) { 536 putAll(languageToScriptsAlt, lang, new LinkedHashSet<>(scripts)); 537 } else { 538 putAll(languageToScripts, lang, new LinkedHashSet<>(scripts)); 539 } 540 } 541 List<String> regions = getAttributeList(x, "territories"); 542 if (regions != null) { 543 if (alt) { 544 putAll(languageToRegionsAlt, lang, new LinkedHashSet<>(regions)); 545 } else { 546 putAll(languageToRegions, lang, new LinkedHashSet<>(regions)); 547 } 548 } 549 } 550 } 551 getAttributeList(Map<String, String> x, String attribute)552 private List<String> getAttributeList(Map<String, String> x, String attribute) { 553 List<String> scripts = null; 554 String scriptString = x.get(attribute); 555 if (scriptString != null) { 556 scripts = Arrays.asList(scriptString.split("\\s+")); 557 } 558 return scripts; 559 } 560 } 561 putUnique(Map<K, V> map, K key, V value)562 private static <K, V> void putUnique(Map<K, V> map, K key, V value) { 563 V oldValue = map.get(key); 564 if (oldValue != null && !oldValue.equals(value)) { 565 throw new IllegalArgumentException("Duplicate value for <" + key + ">: <" + oldValue + ">, <" + value + ">"); 566 } 567 map.put(key, value); 568 } 569 putAll(Map<K, Set<W>> map, K key, Set<W> values)570 private static <K, W> void putAll(Map<K, Set<W>> map, K key, Set<W> values) { 571 Set<W> oldValue = map.get(key); 572 if (oldValue == null) { 573 map.put(key, values); 574 } else { 575 oldValue.addAll(values); 576 } 577 } 578 579 // public enum OfficialStatus {unknown, de_facto_official, official, official_regional, official_minority}; 580 581 static class RowData implements Comparable<Object> { 582 private final String countryCode; 583 private final double countryGdp; 584 private final double countryLiteracy; 585 private final double countryPopulation; 586 private final String languageCode; 587 private final OfficialStatus officialStatus; 588 private final double languagePopulation; 589 private final double 
languageLiteracy; 590 private final String comment; 591 private final String notes; 592 private final String badLanguageName; 593 private final boolean relativeLanguagePopulation; 594 // String badLanguageCode = ""; 595 private final static Set<String> doneCountries = new HashSet<>(); 596 597 private final static Set<String> countryCodes = sc.getGoodAvailableCodes("territory"); 598 RowData(String country, String language)599 public RowData(String country, String language) { 600 this.countryCode = country; 601 this.languageCode = language; 602 badLanguageName = country = language = notes = comment = ""; 603 officialStatus = OfficialStatus.unknown; 604 countryGdp = roundToPartsPer(AddPopulationData.getGdp(countryCode).doubleValue(), 1000); 605 countryLiteracy = AddPopulationData.getLiteracy(countryCode).doubleValue() / 100.0d; 606 countryPopulation = AddPopulationData.getPopulation(countryCode).doubleValue(); 607 languagePopulation = languageLiteracy = Double.NaN; 608 relativeLanguagePopulation = false; 609 } 610 RowData(List<String> row)611 RowData(List<String> row) throws ParseException { 612 countryCode = fixCountryCode(row.get(COUNTRY_CODE), row); 613 614 if (!countryCodes.contains(countryCode)) { 615 System.err.println("WRONG COUNTRY CODE: " + row); 616 } 617 618 double countryPopulation1 = parseDecimal(row.get(COUNTRY_POPULATION)); 619 double countryLiteracy1 = parsePercent(row.get(COUNTRY_LITERACY), countryPopulation1); 620 621 countryGdp = roundToPartsPer(AddPopulationData.getGdp(countryCode).doubleValue(), 1000); 622 countryLiteracy = AddPopulationData.getLiteracy(countryCode).doubleValue() / 100.0d; 623 countryPopulation = AddPopulationData.getPopulation(countryCode).doubleValue(); 624 625 String officialStatusString = row.get(OFFICIAL_STATUS).trim().replace(' ', '_'); 626 if (officialStatusString.equals("national")) { 627 officialStatusString = "official"; 628 } else if (officialStatusString.equals("regional_official")) { 629 officialStatusString = 
"official_regional"; 630 } else if (officialStatusString.length() == 0 || officialStatusString.equals("uninhabited")) { 631 officialStatusString = "unknown"; 632 } 633 try { 634 officialStatus = OfficialStatus.valueOf(officialStatusString); 635 } catch (RuntimeException e) { 636 throw new IllegalArgumentException("Can't interpret offical-status: " + officialStatusString); 637 } 638 639 String languageCode1 = row.get(LANGUAGE_CODE); 640 if (languageCode1.startsWith("*") || languageCode1.startsWith("\u00A7")) { 641 languageCode1 = languageCode1.substring(1); 642 } 643 languageCode = fixLanguageCode(languageCode1, row); 644 645 if (doneCountries.contains(countryCode) == false) { 646 // showDiff(countryGdp1, countryGdp); 647 // showDiff(countryLiteracy1, countryLiteracy); 648 if (SHOW_DIFF) showDiff(countryPopulation1, countryPopulation, 0.1, false); 649 doneCountries.add(countryCode); 650 } 651 652 double languagePopulation1 = parsePercent(row.get(LANGUAGE_POPULATION), countryPopulation1) 653 * countryPopulation1; 654 if ((officialStatus.isMajor()) 655 && languagePopulation1 * 100 < countryPopulation && languagePopulation1 < 1000000) { 656 BadItem.WARNING.show("official language has population < 1% of country & < 1,000,000", languageCode + ", " + Math.round(languagePopulation1), 657 row); 658 } 659 if (languagePopulation1 < 0.999) { 660 BadItem.WARNING.show("suspect language population, < 1", languageCode + ", " + Math.round(languagePopulation1), row); 661 } 662 if (languagePopulation1 > 10000) { 663 relativeLanguagePopulation = true; 664 languagePopulation1 = languagePopulation1 * countryPopulation / countryPopulation1; // correct the 665 // values 666 } else { 667 relativeLanguagePopulation = false; 668 } 669 if (isApproximatelyGreater(languagePopulation1, countryPopulation, 0.0001)) { 670 BadItem.ERROR.show("language population > country population", Math.round(languagePopulation1) + " > " + countryPopulation, row); 671 } 672 languagePopulation = 
languagePopulation1 < countryPopulation ? languagePopulation1 : countryPopulation; 673 674 if (SHOW_DIFF) 675 showDiff(languagePopulation1 / countryPopulation1, languagePopulation / countryPopulation, 0.01, true); 676 677 String stringLanguageLiteracy = row.size() <= LANGUAGE_LITERACY ? "" : row.get(LANGUAGE_LITERACY); 678 double languageLiteracy1 = stringLanguageLiteracy.length() == 0 ? countryLiteracy 679 : parsePercent(stringLanguageLiteracy, languagePopulation); 680 if (isApproximatelyEqual(languageLiteracy1, countryLiteracy1, 0.001)) { 681 languageLiteracy1 = countryLiteracy; // correct the values 682 } 683 languageLiteracy = languageLiteracy1; 684 685 if (row.size() > COMMENT) { 686 comment = row.get(COMMENT); 687 } else { 688 comment = ""; 689 } 690 if (row.size() > NOTES) { 691 notes = row.get(NOTES); 692 } else { 693 notes = ""; 694 } 695 badLanguageName = row.get(BAD_LANGUAGE_NAME); 696 } 697 showDiff(double a, double new_a, double maxRelativeDiff, boolean showLang)698 private void showDiff(double a, double new_a, double maxRelativeDiff, boolean showLang) { 699 final double diff = new_a / a - 1; 700 if (Math.abs(diff) > maxRelativeDiff) { 701 System.out.println(formatPercent(diff, 0, false) 702 + "\t" + countryCode + "\t" + getDisplayCountry(countryCode) 703 + (showLang ? "\t" + languageCode + "\t" + getLanguageName(languageCode) : "") 704 + "\t" + formatNumber(a, 0, false) + "\t=>\t" + formatNumber(new_a, 0, false)); 705 } 706 } 707 roundToPartsPer(double a, double whole)708 private double roundToPartsPer(double a, double whole) { 709 // break this out just to make it easier to follow. 
710 double log10 = Math.log10(a / whole); 711 long digitsFound = (long) (log10); 712 long factor = (long) (Math.pow(10, digitsFound)); 713 double rounded = Math.round(a / factor); 714 double result = rounded * factor; 715 // if (Math.abs(result - a) >= 1) { 716 // System.out.println("Rounding " + a + " => " + result); 717 // } 718 return result; 719 } 720 isApproximatelyEqual(double a, double b, double epsilon)721 private static boolean isApproximatelyEqual(double a, double b, double epsilon) { 722 return a == b || Math.abs(a - b) < epsilon; 723 } 724 isApproximatelyGreater(double a, double b, double epsilon)725 private static boolean isApproximatelyGreater(double a, double b, double epsilon) { 726 return a > b + epsilon; 727 } 728 parseDecimal(String numericRepresentation)729 double parseDecimal(String numericRepresentation) throws ParseException { 730 try { 731 // if (numericRepresentation == null || numericRepresentation.length() == 0) return Double.NaN; 732 Number result = nf.parse(numericRepresentation); 733 // if (result == null) return Double.NaN; 734 return result.doubleValue(); 735 } catch (ParseException e) { 736 throw e; 737 // (RuntimeException) new IllegalArgumentException("can't parse <" + numericRepresentation + 738 // ">").initCause(e); 739 } 740 } 741 parsePercent(String numericRepresentation, double baseValue)742 double parsePercent(String numericRepresentation, double baseValue) throws ParseException { 743 try { 744 double result; 745 if (numericRepresentation.contains("%")) { 746 Number result0 = pf.parse(numericRepresentation); 747 result = result0.doubleValue(); 748 } else { 749 Number result0 = nf.parse(numericRepresentation); 750 result = result0.doubleValue() / baseValue; 751 } 752 // if (numericRepresentation == null || numericRepresentation.length() == 0) return Double.NaN; 753 // if (result == null) return Double.NaN; 754 return result; 755 } catch (ParseException e) { 756 throw e; 757 // (RuntimeException) new 
IllegalArgumentException("can't parse <" + numericRepresentation + 758 // ">").initCause(e); 759 } 760 } 761 getLanguageLiteratePopulation()762 public double getLanguageLiteratePopulation() { 763 return languageLiteracy * languagePopulation; 764 } 765 766 /** 767 * Get the weighted population 768 * 769 * @param weightIfNotOfficial 770 * @return 771 */ getLanguageLiteratePopulation(double weightIfNotOfficial)772 public double getLanguageLiteratePopulation(double weightIfNotOfficial) { 773 double result = languageLiteracy * languagePopulation; 774 if (!officialStatus.isMajor()) { 775 result *= weightIfNotOfficial; 776 } 777 return result; 778 } 779 780 @Override compareTo(Object o)781 public int compareTo(Object o) { 782 RowData that = (RowData) o; 783 int result; 784 if (0 != (result = GENERAL_COLLATOR.compare(countryCode, that.countryCode))) return result; 785 if (languagePopulation > that.languagePopulation) return -1; // descending 786 if (languagePopulation < that.languagePopulation) return 1; 787 if (0 != (result = GENERAL_COLLATOR.compare(languageCode, that.languageCode))) return result; 788 return 0; 789 } 790 toStringHeader()791 public static String toStringHeader() { 792 return "countryCode" + "\t" + "countryPopulation" + "\t" + "countryGdp" 793 + "\t" + "countryLiteracy" 794 + "\t" + "languagePopulation" + "\t" + "languageCode" 795 + "\t" + "writingPopulation"; 796 } 797 798 @Override toString()799 public String toString() { 800 return countryCode + "\t" + countryPopulation + "\t" + countryGdp 801 + "\t" + countryLiteracy 802 + "\t" + languagePopulation + "\t" + languageCode 803 + "\t" + languageLiteracy; 804 } 805 toString(boolean b)806 public String toString(boolean b) { 807 return "region:\t" + getCountryCodeAndName(countryCode) 808 + "\tpop:\t" + countryPopulation 809 + "\tgdp:\t" + countryGdp 810 + "\tlit:\t" + countryLiteracy 811 + "\tlang:\t" + getLanguageCodeAndName(languageCode) 812 + "\tpop:\t" + languagePopulation 813 + "\tlit:\t" + 
languageLiteracy; 814 } 815 816 static boolean MARK_OUTPUT = false; 817 getRickLanguageCode()818 public String getRickLanguageCode() { 819 if (languageCode.contains("_")) return languageCode; 820 Source source = Iso639Data.getSource(languageCode); 821 if (source == null) { 822 return "§" + languageCode; 823 } 824 if (MARK_OUTPUT) { 825 if (source == Source.ISO_639_3) { 826 return "*" + languageCode; 827 } 828 } 829 return languageCode; 830 } 831 832 static Map<String, String> oldToFixed = new HashMap<>(); 833 getRickLanguageName()834 public String getRickLanguageName() { 835 String cldrResult = getExcelQuote(english.getName(languageCode, true)); 836 // String result = getRickLanguageName2(); 837 // if (!result.equalsIgnoreCase(cldrResult)) { 838 // if (null == oldToFixed.put(result, cldrResult)) { 839 // System.out.println("## " + result + "!=" + cldrResult); 840 // } 841 // } 842 return cldrResult; 843 } 844 getRickLanguageName2()845 public String getRickLanguageName2() { 846 String result = new ULocale(languageCode).getDisplayName(); 847 if (!result.equals(languageCode)) return getExcelQuote(result); 848 Set<String> names = Iso639Data.getNames(languageCode); 849 if (names != null && names.size() != 0) { 850 if (MARK_OUTPUT) { 851 return getExcelQuote("*" + names.iterator().next()); 852 } else { 853 return getExcelQuote(names.iterator().next()); 854 } 855 } 856 return getExcelQuote("§" + badLanguageName); 857 } 858 getCountryName()859 public String getCountryName() { 860 return getExcelQuote(getDisplayCountry(countryCode)); 861 } 862 getCountryGdpString()863 public String getCountryGdpString() { 864 return getExcelQuote(formatNumber(countryGdp, 0, false)); 865 } 866 getCountryLiteracyString()867 public String getCountryLiteracyString() { 868 return formatPercent(countryLiteracy, 2, false); 869 } 870 getCountryPopulationString()871 public String getCountryPopulationString() { 872 return getExcelQuote(formatNumber(countryPopulation, 0, false)); 873 } 874 
/**
 * @return the language literacy formatted with {@code formatPercent}
 */
public String getLanguageLiteracyString() {
    return formatPercent(languageLiteracy, 2, false);
}

/**
 * Returns the language population for the exported spreadsheet. When
 * {@code relativeLanguagePopulation} is set, the share of the country exceeds 3% and the
 * population exceeds 10000, the value is written as a percentage; otherwise as a number.
 *
 * @return the quoted value, or {@code "NaN"} if formatting throws {@code IllegalArgumentException}
 */
public String getLanguagePopulationString() {
    try {
        final double percent = languagePopulation / countryPopulation;
        final boolean asPercent = relativeLanguagePopulation
            && percent > 0.03
            && languagePopulation > 10000;
        return getExcelQuote(asPercent
            ? formatPercent(percent, 2, false)
            : formatNumber(languagePopulation, 3, false));
    } catch (IllegalArgumentException e) {
        return "NaN";
    }
}

private double getLanguagePopulation() {
    return languagePopulation;
}

} // closes the nested RowData class, whose header precedes this span

/**
 * Quotes a cell value for the exported spreadsheet.
 * <p>
 * Null and empty values become the empty string. A value containing a comma or a double quote
 * is wrapped in double quotes with every embedded double quote doubled. (Previously a value
 * holding both a comma and a quote was wrapped without doubling the quote, which produced a
 * malformed cell.) Any other value is returned unchanged.
 *
 * @param comment the raw cell text, may be null
 * @return the text, quoted where necessary; never null
 */
public static String getExcelQuote(String comment) {
    if (comment == null || comment.isEmpty()) {
        return "";
    }
    final boolean hasQuote = comment.contains("\"");
    if (!hasQuote && !comment.contains(",")) {
        return comment;
    }
    final String escaped = hasQuote ? comment.replace("\"", "\"\"") : comment;
    return '"' + escaped + '"';
}

/**
 * @param code a territory code, may be null
 * @return the English territory name followed by the code in square brackets, or null for a null code
 */
public static String getCountryCodeAndName(String code) {
    if (code == null) return null;
    return english.getName(CLDRFile.TERRITORY_NAME, code) + " [" + code + "]";
}

/**
 * Orders rows for the exported spreadsheet: by country name, then by language name, then by
 * the natural order of {@code RowData}.
 */
static class RickComparator implements Comparator<RowData> {
    @Override
    public int compare(RowData me, RowData that) {
        int result = GENERAL_COLLATOR.compare(me.getCountryName(), that.getCountryName());
        if (result != 0) {
            return result;
        }
        result = GENERAL_COLLATOR.compare(me.getRickLanguageName(), that.getRickLanguageName());
        if (result != 0) {
            return result;
        }
        return me.compareTo(that);
    }
}

/**
 * Writes the {@code <territoryInfo>} element: one {@code <territory>} per country code and one
 * {@code <languagePopulation>} per accepted row. Accepted rows also update the best
 * region/script tables through {@code addBestRegion} and {@code addBestScript}.
 *
 * @param out destination for the XML
 * @param failures receives a message for every rejected row whose country is not {@code ZZ}
 * @param sortedInput the rows; rows of the same country are expected to be adjacent
 */
private static void writeTerritoryLanguageData(PrintWriter out, List<String> failures, Set<RowData> sortedInput) {

    String lastCountryCode = "";
    boolean first = true;
    LanguageTagParser ltp = new LanguageTagParser();

    out.println(" <!-- See http://unicode.org/cldr/data/diff/supplemental/territory_language_information.html for more information on territoryInfo. -->");
    out.println("\t<territoryInfo>");

    for (RowData row : sortedInput) {
        String countryCode = row.countryCode;
        double countryPopulation = row.countryPopulation;
        double languageLiteracy = row.languageLiteracy;
        double countryLiteracy = row.countryLiteracy;
        long countryGDP = Math.round(row.countryGdp / gdpFactor);
        String languageCode = row.languageCode;
        double languagePopulationRaw = row.getLanguagePopulation();
        double languagePopulationPercent = languagePopulationRaw / countryPopulation;

        if (!countryCode.equals(lastCountryCode)) {
            // close the previous territory before opening the next one
            if (first) {
                first = false;
            } else {
                out.println("\t\t</territory>");
            }
            out.print("\t\t<territory type=\"" + countryCode + "\""
                + " gdp=\"" + formatNumber(countryGDP, 4, true) + "\""
                + " literacyPercent=\"" + formatPercent(countryLiteracy, 3, true) + "\""
                + " population=\"" + formatNumber(countryPopulation, 6, true) + "\">");
            lastCountryCode = countryCode;
            out.println("\t<!--" + getDisplayCountry(countryCode) + "-->");
        }

        if (languageCode.length() != 0
            && languagePopulationPercent > 0.0000
            && (ALLOW_SMALL_NUMBERS || languagePopulationPercent >= 1 || languagePopulationRaw > 100000
                || languageCode.equals("haw") || row.officialStatus.isOfficial())) {
            // add best case
            addBestRegion(languageCode, countryCode, languagePopulationRaw);
            String baseScriptLanguage = ltp.set(languageCode).getLanguageScript();
            if (!baseScriptLanguage.equals(languageCode)) {
                addBestRegion(baseScriptLanguage, countryCode, languagePopulationRaw);
            }
            String baseLanguage = ltp.set(baseScriptLanguage).getLanguage();
            if (!baseLanguage.equals(baseScriptLanguage)) {
                addBestRegion(baseLanguage, countryCode, languagePopulationRaw);
                addBestScript(baseLanguage, ltp.set(languageCode).getScript(), languagePopulationRaw);
            }

            // The literacy attribute is omitted when it matches the country's value; a value
            // of (approximately) 0.05 is written as writingPercent instead of literacyPercent.
            String literacyAttribute = "";
            if (DoubleMath.fuzzyCompare(languageLiteracy, countryLiteracy, 0.0001) != 0) {
                String attributeStart = DoubleMath.fuzzyCompare(languageLiteracy, 0.05, 0.0001) == 0
                    ? " writingPercent=\""
                    : " literacyPercent=\"";
                literacyAttribute = attributeStart + formatPercent(languageLiteracy, 2, true) + "\"";
            }
            out.print("\t\t\t<languagePopulation type=\""
                + languageCode
                + "\""
                + literacyAttribute
                + " populationPercent=\"" + formatPercent(languagePopulationPercent, 2, true) + "\""
                + (row.officialStatus.isOfficial() ? " officialStatus=\"" + row.officialStatus + "\"" : "")
                + references.addReference(row.notes)
                + "/>");
            out.println("\t<!--" + getLanguageName(languageCode) + "-->");
        } else if (!row.countryCode.equals("ZZ")) {
            failures.add(BadItem.ERROR.toString("too few speakers: suspect line", languageCode, row.toString(true)));
        }
    }

    // Only close a territory if one was opened; with no rows the old code emitted a stray end tag.
    if (!first) {
        out.println("\t\t</territory>");
    }
    out.println("\t</territoryInfo>");
}

/**
 * Returns a display name for a territory code: the {@code ULocale}-based name if it differs
 * from the code, otherwise the {@code StandardCodes} territory data, otherwise the code itself.
 */
private static String getDisplayCountry(String countryCode) {
    String result = getULocaleCountryName(countryCode);
    if (!result.equals(countryCode)) {
        return result;
    }
    result = sc.getData("territory", countryCode);
    if (result != null) {
        return result;
    }
    return countryCode;
}

/**
 * Returns a display name for a script code: the {@code ULocale}-based name if it differs from
 * the code, otherwise the {@code StandardCodes} script data, otherwise the code itself.
 * <p>
 * The fallback previously queried the {@code "territory"} table (copied from
 * {@code getDisplayCountry}), which cannot hold script codes.
 */
private static String getDisplayScript(String scriptCode) {
    String result = getULocaleScriptName(scriptCode);
    if (!result.equals(scriptCode)) {
        return result;
    }
    result = sc.getData("script", scriptCode);
    if (result != null) {
        return result;
    }
    return scriptCode;
}

/**
 * Returns a display name for a language code: the {@code ULocale}-based name if it differs
 * from the code, otherwise the first ISO 639 name, otherwise the code itself.
 */
private static String getLanguageName(String languageCode) {
    String result = getULocaleLocaleName(languageCode);
    if (!result.equals(languageCode)) return result;
    Set<String> names = Iso639Data.getNames(languageCode);
    if (names != null && !names.isEmpty()) {
        return names.iterator().next();
    }
    return languageCode;
}

/**
 * Assigns {@code Rxxx} identifiers to reference texts, reusing the identifiers already present
 * in the supplemental data where the same reference occurs there.
 */
static class References {
    Map<String, Pair<String, String>> Rxxx_to_reference = new TreeMap<>();
    Map<Pair<String, String>, String> reference_to_Rxxx = new TreeMap<>();
    Map<String, Pair<String, String>> Rxxx_to_oldReferences = supplementalData.getReferences();
    Map<Pair<String, String>, String> oldReferences_to_Rxxx = new TreeMap<>();
    {
        // invert the existing id -> reference map
        for (Map.Entry<String, Pair<String, String>> old : Rxxx_to_oldReferences.entrySet()) {
            oldReferences_to_Rxxx.put(old.getValue(), old.getKey());
        }
    }
    Matcher URI = PatternCache.get("([a-z]+\\://[\\S]+)\\s?(.*)").matcher("");

    static int referenceStart = 1000;

    /**
     * Returns " references=\"" + Rxxx + "\"" or "" if there is no reference.
     * A text that starts with a URI is split into the URI and the remaining description.
     *
     * @param rawReferenceText the reference text from the input row, may be null or empty
     * @return the XML attribute (with a leading space), or the empty string
     */
    private String addReference(String rawReferenceText) {
        if (rawReferenceText == null || rawReferenceText.length() == 0) return "";
        Pair<String, String> p;
        if (URI.reset(rawReferenceText).matches()) {
            p = new Pair<>(URI.group(1), URI.group(2) == null || URI.group(2).length() == 0 ? "[missing]"
                : URI.group(2)).freeze();
        } else {
            p = new Pair<String, String>(null, rawReferenceText).freeze();
        }

        String Rxxx = reference_to_Rxxx.get(p);
        if (Rxxx == null) { // add new
            Rxxx = oldReferences_to_Rxxx.get(p);
            if (Rxxx != null) { // if old, just keep number
                p = Rxxx_to_oldReferences.get(Rxxx);
            } else { // find an empty number
                while (true) {
                    Rxxx = "R" + (referenceStart++);
                    if (Rxxx_to_reference.get(Rxxx) == null && Rxxx_to_oldReferences.get(Rxxx) == null) {
                        break;
                    }
                }
            }
            // add to new references
            reference_to_Rxxx.put(p, Rxxx);
            Rxxx_to_reference.put(Rxxx, p);
        }
        // references="R034"
        return " references=\"" + Rxxx + "\"";
    }

    /**
     * Renders one {@code <reference>} element.
     *
     * @param Rxxx an identifier previously handed out by {@code addReference}
     * @return the element text
     * @throws NullPointerException if the identifier is unknown
     */
    String getReferenceHTML(String Rxxx) {
        Pair<String, String> p = Rxxx_to_reference.get(Rxxx); // exception if fails.
        String uri = p.getFirst();
        String value = p.getSecond();
        uri = uri == null ? "" : " uri=\"" + TransliteratorUtilities.toHTML.transliterate(uri) + "\"";
        value = value == null ? "[missing]" : TransliteratorUtilities.toHTML.transliterate(value);
        return "\t\t<reference type=\"" + Rxxx + "\"" + uri + ">" + value + "</reference>";
    }

    /**
     * Writes the {@code <references>} element with every reference handed out so far.
     */
    void printReferences(PrintWriter out) {
        // <reference type="R034" uri="isbn:0-321-18578-1">The Unicode Standard 4.0</reference>
        out.println("\t<references>");
        for (String Rxxx : Rxxx_to_reference.keySet()) {
            out.println(getReferenceHTML(Rxxx));
        }
        out.println("\t</references>");
    }
}

static References references = new References();

/**
 * Reads {@code country_language_population_raw.txt}, validates it, and writes a normalized
 * copy of it to the generated {@code supplemental/} directory.
 * <p>
 * The first row and rows with too few columns are recorded in {@code failures}. Territories and
 * needed languages that never occur in the input get placeholder rows.
 *
 * @param failures receives one message per rejected row
 * @param localeToRowData receives each accepted row under the key {@code language_COUNTRY}
 * @return the accepted and placeholder rows in their natural order
 * @throws IOException if the input cannot be read or the output cannot be opened
 */
private static Set<RowData> getExcelData(List<String> failures, Map<String, RowData> localeToRowData)
    throws IOException {

    LanguageTagParser ltp = new LanguageTagParser();

    String dir = CLDRPaths.GEN_DIRECTORY + "supplemental/";
    final String ricksFile = "country_language_population_raw.txt";
    System.out.println("\n# Problems in " + ricksFile + "\n");
    List<List<String>> input = SpreadSheet.convert(CldrUtility.getUTF8Data(ricksFile));

    Set<String> languages = languagesNeeded; // sc.getGoodAvailableCodes("language");

    Set<String> territories = new TreeSet<>(sc.getGoodAvailableCodes("territory"));
    territories.removeAll(supplementalData.getContainers());
    territories.remove("EU");
    territories.remove("QO");

    Set<String> countriesNotFound = new TreeSet<>(territories);
    Set<String> countriesWithoutOfficial = new TreeSet<>(territories);
    countriesWithoutOfficial.remove("ZZ");

    Map<String, Row.R2<String, Double>> countryToLargestOfficialLanguage = new HashMap<>();

    Set<String> languagesNotFound = new TreeSet<>(languages);
    Set<RowData> sortedInput = new TreeSet<>();
    int count = 0;
    for (List<String> row : input) {
        ++count;
        // the first row is recorded here too, which is why showFailures ignores a single entry
        if (count == 1 || row.size() <= COUNTRY_GDP) {
            failures.add(join(row, "\t") + "\tShort row");
            continue;
        }
        try {
            RowData x = new RowData(row);
            if (x.officialStatus.isOfficial()) {
                Row.R2<String, Double> largestOfficial = countryToLargestOfficialLanguage.get(x.countryCode);
                if (largestOfficial == null) {
                    countryToLargestOfficialLanguage.put(x.countryCode,
                        Row.of(x.languageCode, x.languagePopulation));
                } else if (largestOfficial.get1() < x.languagePopulation) {
                    largestOfficial.set0(x.languageCode);
                    largestOfficial.set1(x.languagePopulation);
                }
            }
            if (x.officialStatus.isMajor() || x.countryPopulation < 1000) {
                countriesWithoutOfficial.remove(x.countryCode);
            }
            if (!checkCode(LstrType.region, x.countryCode, row)) continue;
            countriesNotFound.remove(x.countryCode);
            languagesNotFound.remove(x.languageCode);
            if (x.languageCode.contains("_")) {
                ltp.set(x.languageCode);
                languagesNotFound.remove(ltp.getLanguage());
                if (!checkCode(LstrType.language, ltp.getLanguage(), row)) continue;
                if (!checkCode(LstrType.script, ltp.getScript(), row)) continue;
            }
            String locale = x.languageCode + "_" + x.countryCode;
            if (localeToRowData.get(locale) != null) {
                BadItem.ERROR.show("duplicate data", x.languageCode + " with " + x.countryCode, row);
            }
            localeToRowData.put(locale, x);
            sortedInput.add(x);
        } catch (ParseException e) {
            failures.add(join(row, "\t") + "\t" + e.getMessage() + "\t"
                + join(Arrays.asList(e.getStackTrace()), ";\t"));
        } catch (RuntimeException e) {
            throw new IllegalArgumentException("Failure on line " + count + ")\t" + row, e);
        }
    }

    // make sure we have something
    for (String country : countriesNotFound) {
        sortedInput.add(new RowData(country, "und"));
    }
    for (String language : languagesNotFound) {
        sortedInput.add(new RowData("ZZ", language));
    }

    for (RowData row : sortedInput) {
        // see which countries have languages that are larger than any official language
        if (!row.officialStatus.isOfficial()) {
            Row.R2<String, Double> largestOfficial = countryToLargestOfficialLanguage.get(row.countryCode);
            if (largestOfficial != null && largestOfficial.get1() < row.languagePopulation) {
                BadItem.WARNING.show("language population > all official languages", getLanguageCodeAndName(largestOfficial.get0()), row.toString(true));
            }
        }

        // see which countries are missing an official language (reported once per country)
        if (!countriesWithoutOfficial.contains(row.countryCode)) continue;
        BadItem.ERROR.show("missing official language", row.getCountryName() + "\t" + row.countryCode, row.toString(true));
        countriesWithoutOfficial.remove(row.countryCode);
    }

    // write out file for rick
    Set<RowData> rickSorted = new TreeSet<>(new RickComparator());
    rickSorted.addAll(sortedInput);

    // try-with-resources: the writer is closed even if a row accessor throws
    try (PrintWriter log = FileUtilities.openUTF8Writer(dir, ricksFile)) {
        log.println(
            "*\tCName" +
                "\tCCode" +
                "\tCPopulation" +
                "\tCLiteracy" +
                "\tCGdp" +
                "\tOfficialStatus" +
                "\tLanguage" +
                "\tLCode" +
                "\tLPopulation" +
                "\tWritingPop" +
                "\tReferences" +
                "\tNotes");
        for (RowData row : rickSorted) {
            final String langLit = row.getLanguageLiteracyString();
            final String countryLit = row.getCountryLiteracyString();
            log.println(
                row.getCountryName()
                    + "\t" + row.countryCode
                    + "\t" + row.getCountryPopulationString()
                    + "\t" + countryLit
                    + "\t" + row.getCountryGdpString()
                    + "\t" + (row.officialStatus == OfficialStatus.unknown ? "" : row.officialStatus)
                    + "\t" + row.getRickLanguageName()
                    + "\t" + row.getRickLanguageCode()
                    + "\t" + row.getLanguagePopulationString()
                    + "\t" + (langLit.equals(countryLit) ? "" : langLit)
                    + "\t" + getExcelQuote(row.comment)
                    + "\t" + getExcelQuote(row.notes));
        }
    }
    return sortedInput;
}

/**
 * Collects the language(+script) part of every available locale and registers each one in
 * {@code languageToMaxCountry} with a null value. Locales in {@code skipLocales}, locales that
 * fail to parse, and {@code sh} are left out.
 *
 * @param available the available locale IDs
 * @return the sorted set of language(+script) IDs
 */
private static Set<String> getCldrParents(Set<String> available) {
    LanguageTagParser ltp2 = new LanguageTagParser();
    Set<String> cldrParents = new TreeSet<>();
    for (String locale : available) {
        if (skipLocales.contains(locale)) continue;
        try {
            ltp2.set(locale);
        } catch (RuntimeException e) {
            System.out.println("Skipping CLDR file: " + locale);
            continue;
        }
        String locale2 = ltp2.getLanguageScript();
        if (locale2.equals("sh")) continue;
        cldrParents.add(locale2);
        languageToMaxCountry.put(locale2, null);
    }
    return cldrParents;
}

/**
 * Prints the collected failures to {@code System.out}. Nothing is printed when there is at
 * most one entry.
 */
private static void showFailures(List<String> failures) {
    if (failures.size() <= 1) {
        return;
    }
    System.out.println();
    System.out.println("Failures in Output");
    System.out.println();

    System.out.println(RowData.toStringHeader());
    for (String failure : failures) {
        System.out.println(failure);
    }
}

/**
 * Returns the parent used for processing a locale: null for null or {@code root},
 * {@code root} for an ID without {@code _}, the fully resolved ID when the locale has no
 * script, and otherwise the ID with its last subtag removed.
 */
public static String getProcessedParent(String localeCode) {
    if (localeCode == null || localeCode.equals("root")) return null;
    int pos = localeCode.lastIndexOf('_');
    if (pos < 0) return "root";
    LanguageTagParser ltp = new LanguageTagParser();
    String script = ltp.set(localeCode).getScript();
    if (script.length() == 0) {
        return getFullyResolved(localeCode);
    }
    return localeCode.substring(0, pos);
}

/**
 * Looks the code up in {@code defaultContent}; on a miss, retries with successively shorter
 * prefixes (cut at {@code _}) and fills the missing region and script of the original code
 * from the first hit.
 *
 * @return the resolved ID, or {@code "***" + languageCode} when no prefix is known
 */
private static String getFullyResolved(String languageCode) {
    String result = defaultContent.get(languageCode);
    if (result != null) return result;
    // we missed. Try taking parent and trying again
    int pos = languageCode.length() + 1;
    while (true) {
        pos = languageCode.lastIndexOf('_', pos - 1);
        if (pos < 0) {
            return "***" + languageCode;
        }
        result = defaultContent.get(languageCode.substring(0, pos));
        if (result != null) {
            LanguageTagParser ltp = new LanguageTagParser().set(languageCode);
            LanguageTagParser ltp2 = new LanguageTagParser().set(result);
            String region = ltp.getRegion();
            if (region.length() == 0) {
                ltp.setRegion(ltp2.getRegion());
            }
            String script = ltp.getScript();
            if (script.length() == 0) {
                ltp.setScript(ltp2.getScript());
            }
            return ltp.toString();
        }
    }
}

/**
 * Compares two iterables by their first elements, which must be mutually comparable.
 * Throws if either iterable is empty.
 */
static Comparator<Iterable> firstElementComparator = new Comparator<Iterable>() {
    @Override
    public int compare(Iterable o1, Iterable o2) {
        int result = ((Comparable) o1.iterator().next()).compareTo((o2.iterator().next()));
        assert result != 0;
        return result;
    }
};

/**
 * Computes the default-content locales: for every set of sibling locales, the sibling with the
 * largest weighted literate population is added to {@code defaultLocaleContent}. Any winner
 * starting with {@code en_} is replaced by {@code en_US}.
 *
 * @param cldrParents not read by this method
 * @param nf not read by this method
 * @param defaultContent not read by this method
 * @param localeToRowData population rows keyed by locale ID
 * @param defaultLocaleContent receives the chosen locales
 */
private static void showDefaults(Set<String> cldrParents, NumberFormat nf, Map<String, String> defaultContent,
    Map<String, RowData> localeToRowData,
    Set<String> defaultLocaleContent) {

    if (SHOW_OLD_DEFAULT_CONTENTS) {
        System.out.println();
        System.out.println("Computing Defaults Contents");
        System.out.println();
    }

    Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
    Set<String> locales = new TreeSet<>(cldrFactory.getAvailable());
    LocaleIDParser lidp = new LocaleIDParser();

    // add all the combinations of language, script, and territory.
    for (String locale : localeToRowData.keySet()) {
        String baseLanguage = lidp.set(locale).getLanguage();
        if (locales.contains(baseLanguage) && !locales.contains(locale)) {
            locales.add(locale);
            if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("\tadding: " + locale);
        }
    }

    // adding parents, repeating until a pass finds nothing new
    Set<String> toAdd = new TreeSet<>();
    while (true) {
        for (String locale : locales) {
            String newguy = LocaleIDParser.getParent(locale);
            if (newguy != null && !locales.contains(newguy) && !toAdd.contains(newguy)) {
                toAdd.add(newguy);
                if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("\tadding parent: " + newguy);
            }
        }
        if (toAdd.isEmpty()) {
            break;
        }
        locales.addAll(toAdd);
        toAdd.clear();
    }

    // get sets of siblings
    Set<Set<String>> siblingSets = new TreeSet<>(firstElementComparator);
    Set<String> needsADoin = new TreeSet<>(locales);

    Set<String> deprecatedLanguages = new TreeSet<>();
    deprecatedLanguages.add("sh");
    Set<String> deprecatedRegions = new TreeSet<>();
    deprecatedRegions.add("YU");
    deprecatedRegions.add("CS");
    deprecatedRegions.add("ZZ");

    // first find all the language subtags that have scripts, and those we need to skip. Those are aliased-only
    Set<String> skippingItems = new TreeSet<>();
    Set<String> hasAScript = new TreeSet<>();
    for (String locale : locales) {
        lidp.set(locale);
        if (lidp.getScript().length() != 0) {
            hasAScript.add(lidp.getLanguage());
        }
        Set<LocaleIDParser.Level> levels = lidp.getLevels();
        // must have no variants, must have either script or region, no deprecated elements
        if (levels.contains(LocaleIDParser.Level.Variants) // no variants
            || !(levels.contains(LocaleIDParser.Level.Script)
                || levels.contains(LocaleIDParser.Level.Region))
            || deprecatedLanguages.contains(lidp.getLanguage())
            || deprecatedRegions.contains(lidp.getRegion())) {
            // skip language-only locales, and ones with variants
            needsADoin.remove(locale);
            skippingItems.add(locale);
            if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("\tremoving: " + locale);
        }
    }
    // walk through the locales, getting the ones we care about.
    Map<String, Double> scriptLocaleToLanguageLiteratePopulation = new TreeMap<>();

    // iterate over a snapshot because needsADoin shrinks inside the loop
    for (String locale : new TreeSet<>(needsADoin)) {
        if (!needsADoin.contains(locale)) continue;
        lidp.set(locale);
        Set<Level> level = lidp.getLevels();
        // skip locales that need scripts and don't have them
        if (!level.contains(LocaleIDParser.Level.Script) // no script
            && hasAScript.contains(lidp.getLanguage())) {
            needsADoin.remove(locale);
            skippingItems.add(locale);
            continue;
        }
        // get siblings
        Set<String> siblingSet = lidp.getSiblings(needsADoin);
        // if it has a script and region, total the siblings under their common parent
        if (level.contains(LocaleIDParser.Level.Script) && level.contains(LocaleIDParser.Level.Region)) {
            double languageLiteratePopulation = 0;
            for (String localeID2 : siblingSet) {
                RowData rowData = localeToRowData.get(localeID2);
                if (rowData != null) {
                    languageLiteratePopulation += rowData.getLanguageLiteratePopulation(NON_OFFICIAL_WEIGHT);
                }
            }
            String parentID = LocaleIDParser.getParent(locale);
            scriptLocaleToLanguageLiteratePopulation.put(parentID, languageLiteratePopulation);
        }

        try {
            siblingSets.add(siblingSet);
        } catch (RuntimeException e) {
            e.printStackTrace();
        }
        needsADoin.removeAll(siblingSet);
    }
    if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("ConvertLanguageData Skipping: " + skippingItems);
    if (!needsADoin.isEmpty()) {
        if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("Missing: " + needsADoin);
    }

    // walk through the data
    Set<String> missingData = new TreeSet<>();
    for (Set<String> siblingSet : siblingSets) {
        if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("** From siblings: " + siblingSet);

        // get best
        double best = Double.NEGATIVE_INFINITY;
        String bestLocale = "???";
        Set<Pair<Double, String>> data = new TreeSet<>();
        LanguageTagParser ltp = new LanguageTagParser();
        for (String locale : siblingSet) {
            RowData rowData = localeToRowData.get(locale);
            double languageLiteratePopulation = -1;
            if (rowData != null) {
                languageLiteratePopulation = rowData.getLanguageLiteratePopulation(NON_OFFICIAL_WEIGHT);
            } else {
                Double d = scriptLocaleToLanguageLiteratePopulation.get(locale);
                if (d != null) {
                    languageLiteratePopulation = d;
                } else {
                    final String region = ltp.set(locale).getRegion();
                    if (region.isEmpty() || StandardCodes.isCountry(region)) {
                        missingData.add(locale);
                    }
                }
            }
            data.add(new Pair<>(languageLiteratePopulation, locale));
            if (best < languageLiteratePopulation) {
                best = languageLiteratePopulation;
                bestLocale = locale;
            }
        }
        // show it
        if (SHOW_OLD_DEFAULT_CONTENTS) {
            for (Pair<Double, String> datum : data) {
                System.out.format(
                    "\tContenders: %s %f (based on literate population)" + CldrUtility.LINE_SEPARATOR,
                    datum.getSecond(), datum.getFirst());
            }
        }
        // Hack to fix English
        // TODO Generalize in the future for other locales with non-primary scripts
        if (bestLocale.startsWith("en_")) {
            defaultLocaleContent.add("en_US");
        } else {
            defaultLocaleContent.add(bestLocale);
        }
    }

    for (String missing : missingData) {
        BadItem.WARNING.show("Missing Data", missing);
    }
}

// private static void printDefaultContent(Set<String> defaultLocaleContent) {
// String sep = Utility.LINE_SEPARATOR + "\t\t\t";
// String broken = Utility.breakLines(join(defaultLocaleContent," "), sep, PatternCache.get("(\\S)\\S*").matcher(""),
// 80);
//
// Log.println("\t\t<defaultContent locales=\"" + broken + "\"");
// Log.println("\t\t/>");
// }

/**
 * Unimplemented stub.
 *
 * @return always null
 */
private static Object getSuppressScript(String languageCode) {
    // TODO Auto-generated method stub
    return null;
}

/**
 * Joins the string forms of the elements, in iteration order, with the separator between them.
 *
 * @param c the elements; an empty collection yields the empty string
 * @param separator text placed between consecutive elements
 * @return the joined text
 */
public static String join(Collection<?> c, String separator) {
    StringBuilder result = new StringBuilder();
    boolean first = true;
    for (Object x : c) {
        if (first) {
            first = false;
        } else {
            result.append(separator);
        }
        result.append(x);
    }
    return result.toString();
}

/**
 * Records the country as the best region for the language in {@code languageToMaxCountry}
 * if its population beats the current entry.
 */
private static void addBestRegion(String languageCode, String countryCode, double languagePopulationRaw) {
    addBest(languageCode, languagePopulationRaw, countryCode, languageToMaxCountry);
}

scriptCode, double languagePopulationRaw)1626 private static void addBestScript(String languageCode, String scriptCode, double languagePopulationRaw) { 1627 addBest(languageCode, languagePopulationRaw, scriptCode, languageToMaxScript); 1628 } 1629 addBest(String languageCode, double languagePopulationRaw, String code, Map<String, CodeAndPopulation> languageToMaxCode)1630 private static void addBest(String languageCode, double languagePopulationRaw, String code, 1631 Map<String, CodeAndPopulation> languageToMaxCode) { 1632 if (languageCode.length() == 0) { 1633 throw new IllegalArgumentException(); 1634 } 1635 CodeAndPopulation best = languageToMaxCode.get(languageCode); 1636 if (best == null) { 1637 languageToMaxCode.put(languageCode, best = new CodeAndPopulation()); 1638 } else if (best.population >= languagePopulationRaw) { 1639 return; 1640 } 1641 best.population = languagePopulationRaw; 1642 best.code = code; 1643 } 1644 1645 static class CodeAndPopulation { 1646 String code = null; 1647 double population = Double.NaN; 1648 1649 @Override toString()1650 public String toString() { 1651 return "{" + code + "," + population + "}"; 1652 } 1653 } 1654 1655 static public class GeneralCollator implements Comparator<String> { 1656 static UTF16.StringComparator cpCompare = new UTF16.StringComparator(true, false, 0); 1657 static RuleBasedCollator UCA = (RuleBasedCollator) Collator 1658 .getInstance(ULocale.ROOT); 1659 static { 1660 UCA.setNumericCollation(true); 1661 } 1662 1663 @Override compare(String s1, String s2)1664 public int compare(String s1, String s2) { 1665 if (s1 == null) { 1666 return s2 == null ? 
0 : -1; 1667 } else if (s2 == null) { 1668 return 1; 1669 } 1670 int result = UCA.compare(s1, s2); 1671 if (result != 0) return result; 1672 return cpCompare.compare(s1, s2); 1673 } 1674 } 1675 1676 public static class InverseComparator<T> implements Comparator<T> { 1677 private Comparator<T> other; 1678 InverseComparator()1679 public InverseComparator() { 1680 this.other = null; 1681 } 1682 InverseComparator(Comparator<T> other)1683 public InverseComparator(Comparator<T> other) { 1684 this.other = other; 1685 } 1686 1687 @Override compare(T a, T b)1688 public int compare(T a, T b) { 1689 return other == null 1690 ? ((Comparable) b).compareTo(a) 1691 : other.compare(b, a); 1692 } 1693 } 1694 1695 static Set<String> languagesNeeded = new TreeSet<>( 1696 Arrays 1697 .asList("ab ba bh bi bo fj fy gd ha ht ik iu ks ku ky lg mi na no rm sa sd sg si sm sn su tg tk to tw vo yi za lb dv chr syr kha sco gv" 1698 .split("\\s"))); 1699 1700 /** 1701 * Not called? 1702 */ 1703 @Deprecated generateIso639_2Data(PrintWriter out)1704 private static void generateIso639_2Data(PrintWriter out) { 1705 for (String languageSubtag : sc.getAvailableCodes("language")) { 1706 String alpha3 = Iso639Data.toAlpha3(languageSubtag); 1707 Type type = Iso639Data.getType(languageSubtag); 1708 Scope scope = Iso639Data.getScope(languageSubtag); 1709 if (type != null || alpha3 != null || scope != null) { 1710 out.println("\t\t<languageCode type=\"" + languageSubtag + "\"" + 1711 (alpha3 == null ? "" : " iso639Alpha3=\"" + alpha3 + "\"") + 1712 (type == null ? "" : " iso639Type=\"" + type + "\"") + 1713 (scope == null ? 
"" : " iso639Scope=\"" + scope + "\"") + 1714 "/>"); 1715 } 1716 1717 } 1718 } 1719 1720 static Relation<String, BasicLanguageData> language2BasicLanguageData = Relation.of(new TreeMap<String, Set<BasicLanguageData>>(), TreeSet.class); 1721 1722 static Map<String, Relation<BasicLanguageData.Type, String>> language_status_scripts; 1723 static Map<Pair<String, String>, String> language_script_references = new TreeMap<>(); 1724 1725 static final Map<String, Map<String, R2<List<String>, String>>> LOCALE_ALIAS_INFO = SupplementalDataInfo 1726 .getInstance().getLocaleAliasInfo(); 1727 getLanguage2Scripts(Set<RowData> sortedInput)1728 static void getLanguage2Scripts(Set<RowData> sortedInput) throws IOException { 1729 language_status_scripts = new TreeMap<>(); 1730 1731 // // get current scripts 1732 // Relation<String,String> languageToDefaultScript = new Relation(new TreeMap(), TreeSet.class); 1733 // Relation<String,String> secondaryLanguageToDefaultScript = new Relation(new TreeMap(), TreeSet.class); 1734 // for (String languageSubtag : language2BasicLanguageData.keySet()) { 1735 // for (BasicLanguageData item : language2BasicLanguageData.getAll(languageSubtag)) { 1736 // for (String script : item.getScripts()) { 1737 // addLanguage2Script(languageSubtag, item.getType(), script); 1738 // } 1739 // } 1740 // } 1741 // System.out.println("Language 2 scripts: " + language_status_scripts); 1742 1743 // #Lcode LanguageName Status Scode ScriptName References 1744 List<List<String>> input = SpreadSheet.convert(CldrUtility.getUTF8Data("language_script_raw.txt")); 1745 System.out.println(CldrUtility.LINE_SEPARATOR + "# Problems in language_script_raw.txt" 1746 + CldrUtility.LINE_SEPARATOR); 1747 //int count = -1; 1748 for (List<String> row : input) { 1749 try { 1750 if (row.size() == 0) continue; 1751 //++count; 1752 String language = row.get(0).trim(); 1753 if (language.length() == 0 || language.startsWith("#")) continue; 1754 BasicLanguageData.Type status = 
BasicLanguageData.Type.valueOf(row.get(2)); 1755 String scripts = row.get(3); 1756 if (!checkCode(LstrType.language, language, row)) continue; 1757 for (String script : scripts.split("\\s+")) { 1758 if (!checkCode(LstrType.script, script, row)) continue; 1759 // if the script is not modern, demote 1760 Info scriptInfo = ScriptMetadata.getInfo(script); 1761 if (scriptInfo == null) { 1762 BadItem.ERROR.toString("illegal script; must be represented in Unicode, remove line or fix", script, row); 1763 continue; 1764 } 1765 IdUsage idUsage = scriptInfo.idUsage; 1766 if (status == BasicLanguageData.Type.primary && idUsage != IdUsage.RECOMMENDED) { 1767 if (idUsage == IdUsage.ASPIRATIONAL || idUsage == IdUsage.LIMITED_USE) { 1768 BadItem.WARNING.toString("Script has unexpected usage; make secondary if a Recommended script is used widely for the langauge", 1769 idUsage + ", " + script + "=" + getULocaleScriptName(script), row); 1770 } else { 1771 BadItem.ERROR.toString("Script is not modern; make secondary", idUsage + ", " + script + "=" + getULocaleScriptName(script), row); 1772 status = BasicLanguageData.Type.secondary; 1773 } 1774 } 1775 1776 // if the language is not modern, demote 1777 if (LOCALE_ALIAS_INFO.get("language").containsKey(language)) { 1778 BadItem.ERROR.toString("Remove/Change deprecated language", language + " " 1779 + getLanguageName(language) + "; " + LOCALE_ALIAS_INFO.get("language").get(language), row); 1780 continue; 1781 } 1782 if (status == BasicLanguageData.Type.primary && !sc.isModernLanguage(language)) { 1783 BadItem.ERROR.toString("Should be secondary, language is not modern", language + " " + getLanguageName(language), row); 1784 status = BasicLanguageData.Type.secondary; 1785 } 1786 1787 addLanguage2Script(language, status, script); 1788 if (row.size() > 5) { 1789 String reference = row.get(5); 1790 if (reference != null && reference.length() == 0) { 1791 language_script_references.put(new Pair<>(language, script), reference); 1792 } 1793 } 
1794 } 1795 } catch (RuntimeException e) { 1796 System.err.println(row); 1797 throw e; 1798 } 1799 } 1800 1801 // System.out.println("Language 2 scripts: " + language_status_scripts); 1802 1803 for (String language : sc.getGoodAvailableCodes("language")) { 1804 if (supplementalData.getDeprecatedInfo("language", language) != null) { 1805 continue; 1806 } 1807 Map<String, String> registryData = sc.getLangData("language", language); 1808 if (registryData != null) { 1809 String suppressScript = registryData.get("Suppress-Script"); 1810 if (suppressScript == null) continue; 1811 if (ScriptMetadata.getInfo(suppressScript) == null) { 1812 // skip, not represented in Unicode 1813 continue; 1814 } 1815 // if there is something already there, we have a problem. 1816 Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language); 1817 if (status_scripts == null) { 1818 System.out 1819 .println("Missing Suppress-Script: " + language + "\tSuppress-Script:\t" + suppressScript); 1820 } else if (!status_scripts.values().contains(suppressScript)) { 1821 System.out.println("Missing Suppress-Script: " + language + "\tSuppress-Script:\t" + suppressScript 1822 + "\tall:\t" + status_scripts.values()); 1823 } else { 1824 // at this point, the suppressScript is in the union of the primary and secondary. 
1825 Set<String> primaryScripts = status_scripts.getAll(BasicLanguageData.Type.primary); 1826 if (primaryScripts != null && !primaryScripts.contains(suppressScript)) { 1827 System.out.println("Suppress-Script is not in primary: " + language + "\tSuppress-Script:\t" 1828 + suppressScript + "\tprimary:\t" 1829 + primaryScripts); 1830 } 1831 } 1832 addLanguage2Script(language, BasicLanguageData.Type.primary, suppressScript); 1833 } 1834 } 1835 1836 // remove primaries from secondaries 1837 // check for primaries for scripts 1838 for (String language : language_status_scripts.keySet()) { 1839 Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language); 1840 Set<String> secondaryScripts = status_scripts.getAll(BasicLanguageData.Type.secondary); 1841 if (secondaryScripts == null) continue; 1842 Set<String> primaryScripts = status_scripts.getAll(BasicLanguageData.Type.primary); 1843 if (primaryScripts == null) { 1844 // status_scripts.putAll(BasicLanguageData.Type.primary, secondaryScripts); 1845 // status_scripts.removeAll(BasicLanguageData.Type.secondary); 1846 if (sc.isModernLanguage(language)) { 1847 BadItem.ERROR.show("modern language without primary script, might need to edit moribund_languages.txt", language + " " 1848 + getLanguageName(language)); 1849 } 1850 } else { 1851 status_scripts.removeAll(BasicLanguageData.Type.secondary, primaryScripts); 1852 } 1853 } 1854 1855 // check that every living language in the row data has a script 1856 Set<String> livingLanguagesWithTerritories = new TreeSet<>(); 1857 for (RowData rowData : sortedInput) { 1858 String language = rowData.languageCode; 1859 if (sc.isModernLanguage(language) && Iso639Data.getSource(language) != Iso639Data.Source.ISO_639_3) { 1860 livingLanguagesWithTerritories.add(language); 1861 } 1862 } 1863 for (String language : livingLanguagesWithTerritories) { 1864 Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language); 1865 if 
(status_scripts != null) { 1866 Set<String> primaryScripts = status_scripts.getAll(BasicLanguageData.Type.primary); 1867 if (primaryScripts != null && primaryScripts.size() > 0) { 1868 continue; 1869 } 1870 } 1871 if (language.equals("tw")) continue; // TODO load aliases and check... 1872 BadItem.WARNING.show("ISO 639-1/2 language in language-territory list without primary script", language + "\t" + getLanguageName(language)); 1873 } 1874 1875 // System.out.println("Language 2 scripts: " + language_status_scripts); 1876 } 1877 checkScript(String script)1878 private static boolean checkScript(String script) { 1879 // TODO Auto-generated method stub 1880 return false; 1881 } 1882 1883 static Validity VALIDITY = Validity.getInstance(); 1884 checkCode(LstrType type, String code, List<String> sourceLine)1885 private static boolean checkCode(LstrType type, String code, List<String> sourceLine) { 1886 Status validity = VALIDITY.getCodeToStatus(type).get(code); 1887 if (validity == Status.regular) { 1888 return true; 1889 } else if (validity == Status.unknown && type == LstrType.region) { 1890 return true; 1891 } 1892 BadItem.ERROR.show("Illegitimate Code", type + ": " + code + " = " + validity, sourceLine); 1893 return false; 1894 } 1895 addLanguage2Script(String language, BasicLanguageData.Type type, String script)1896 private static void addLanguage2Script(String language, BasicLanguageData.Type type, String script) { 1897 Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language); 1898 if (status_scripts == null) 1899 language_status_scripts.put(language, status_scripts = Relation.of(new TreeMap<BasicLanguageData.Type, Set<String>>(), TreeSet.class)); 1900 status_scripts.put(type, script); 1901 } 1902 addLanguageScriptData()1903 static void addLanguageScriptData() throws IOException { 1904 // check to make sure that every language subtag is in 639-3 1905 Set<String> langRegistryCodes = sc.getGoodAvailableCodes("language"); 1906 // 
Set<String> iso639_2_missing = new TreeSet(langRegistryCodes); 1907 // iso639_2_missing.removeAll(Iso639Data.getAvailable()); 1908 // iso639_2_missing.remove("root"); 1909 // if (iso639_2_missing.size() != 0) { 1910 // for (String missing : iso639_2_missing){ 1911 // System.out.println("*ERROR in StandardCodes* Missing Lang/Script data:\t" + missing + ", " + 1912 // sc.getData("language", missing)); 1913 // } 1914 // } 1915 1916 // Map<String, String> nameToTerritoryCode = new TreeMap(); 1917 // for (String territoryCode : sc.getGoodAvailableCodes("territory")) { 1918 // nameToTerritoryCode.put(sc.getData("territory", territoryCode).toLowerCase(), territoryCode); 1919 // } 1920 // nameToTerritoryCode.put("iran", nameToTerritoryCode.get("iran, islamic republic of")); // 1921 1922 //BasicLanguageData languageData = new BasicLanguageData(); 1923 1924 BufferedReader in = CldrUtility.getUTF8Data("extraLanguagesAndScripts.txt"); 1925 while (true) { 1926 String line = in.readLine(); 1927 if (line == null) break; 1928 String[] parts = line.split("\\t"); 1929 String alpha3 = parts[0]; 1930 alpha3 = stripBrackets(alpha3); 1931 String languageSubtag = Iso639Data.fromAlpha3(alpha3); 1932 if (languageSubtag == null) { 1933 if (langRegistryCodes.contains(alpha3)) { 1934 languageSubtag = alpha3; 1935 } else { 1936 BadItem.WARNING.show("Language subtag not found on line", alpha3, line); 1937 continue; 1938 } 1939 } 1940 //String name = parts[1]; 1941 Set<String> names = Iso639Data.getNames(languageSubtag); 1942 if (names == null) { 1943 Map<String, String> name2 = sc.getLangData("language", languageSubtag); 1944 if (name2 != null) { 1945 String name3 = name2.get("Description"); 1946 if (name3 != null) { 1947 names = new TreeSet<>(); 1948 names.add(name3); 1949 } 1950 } 1951 } 1952 // if (names == null || !names.contains(name)) { 1953 // System.out.println("Name <" + name + "> for <" + languageSubtag + "> not found in " + names); 1954 // } 1955 1956 // names all straight, now get 
scripts and territories 1957 // [Cyrl]; [Latn] 1958 Set<String> fullScriptList = sc.getGoodAvailableCodes("script"); 1959 1960 String[] scriptList = parts[2].split("[;,]\\s*"); 1961 Set<String> scripts = new TreeSet<>(); 1962 Set<String> scriptsAlt = new TreeSet<>(); 1963 for (String script : scriptList) { 1964 if (script.length() == 0) continue; 1965 boolean alt = false; 1966 if (script.endsWith("*")) { 1967 alt = true; 1968 script = script.substring(0, script.length() - 1); 1969 } 1970 script = stripBrackets(script); 1971 if (!fullScriptList.contains(script)) { 1972 System.out.println("Script <" + script + "> for <" + languageSubtag + "> not found in " 1973 + fullScriptList); 1974 } else if (alt) { 1975 scriptsAlt.add(script); 1976 } else { 1977 scripts.add(script); 1978 } 1979 } 1980 // now territories 1981 Set<String> territories = new TreeSet<>(); 1982 if (parts.length > 4) { 1983 String[] territoryList = parts[4].split("\\s*[;,-]\\s*"); 1984 for (String territoryName : territoryList) { 1985 if (territoryName.equals("ISO/DIS 639") || territoryName.equals("3")) continue; 1986 String territoryCode = CountryCodeConverter.getCodeFromName(territoryName, true); 1987 if (territoryCode == null) { 1988 BadItem.ERROR.show("no name found for territory", "<" + territoryName + ">", languageSubtag); 1989 } else { 1990 territories.add(territoryCode); 1991 } 1992 } 1993 } 1994 // <language type="de" scripts="Latn" territories="IT" alt="secondary"/> 1995 // we're going to go ahead and set these all to secondary. 
1996 if (scripts.size() != 0) { 1997 language2BasicLanguageData.put(languageSubtag, 1998 new BasicLanguageData().setType(BasicLanguageData.Type.secondary).setScripts(scripts) 1999 .setTerritories(territories)); 2000 } 2001 if (scriptsAlt.size() != 0) { 2002 language2BasicLanguageData.put(languageSubtag, 2003 new BasicLanguageData().setType(BasicLanguageData.Type.secondary).setScripts(scriptsAlt) 2004 .setTerritories(territories)); 2005 } 2006 } 2007 in.close(); 2008 2009 // add other data 2010 for (String languageSubtag : supplementalData.getBasicLanguageDataLanguages()) { 2011 Set<BasicLanguageData> otherData = supplementalData.getBasicLanguageData(languageSubtag); 2012 language2BasicLanguageData.putAll(languageSubtag, otherData); 2013 } 2014 } 2015 2016 // private static void showAllBasicLanguageData(Relation<String, BasicLanguageData> language2basicData, String 2017 // comment) { 2018 // // now print 2019 // Relation<String, String> primaryCombos = new Relation(new TreeMap(), TreeSet.class); 2020 // Relation<String, String> secondaryCombos = new Relation(new TreeMap(), TreeSet.class); 2021 // 2022 // Log.println("\t<languageData>" + (comment == null ? 
"" : " <!-- " + comment + " -->")); 2023 // 2024 // for (String languageSubtag : language2basicData.keySet()) { 2025 // String duplicate = ""; 2026 // // script,territory 2027 // primaryCombos.clear(); 2028 // secondaryCombos.clear(); 2029 // 2030 // for (BasicLanguageData item : language2basicData.getAll(languageSubtag)) { 2031 // Set<String> scripts = item.getScripts(); 2032 // if (scripts.size() == 0) scripts = new TreeSet(Arrays.asList(new String[] { "Zzzz" })); 2033 // for (String script : scripts) { 2034 // Set<String> territories = item.getTerritories(); 2035 // if (territories.size() == 0) territories = new TreeSet(Arrays.asList(new String[] { "ZZ" })); 2036 // for (String territory : territories) { 2037 // if (item.getType().equals(BasicLanguageData.Type.primary)) { 2038 // primaryCombos.put(script, territory); 2039 // } else { 2040 // secondaryCombos.put(script, territory); 2041 // } 2042 // } 2043 // } 2044 // } 2045 // secondaryCombos.removeAll(primaryCombos); 2046 // showBasicLanguageData(languageSubtag, primaryCombos, null, BasicLanguageData.Type.primary); 2047 // showBasicLanguageData(languageSubtag, secondaryCombos, primaryCombos.keySet(), 2048 // BasicLanguageData.Type.secondary); 2049 // // System.out.println(item.toString(languageSubtag) + duplicate); 2050 // // duplicate = " <!-- " + "**" + " -->"; 2051 // } 2052 // Log.println("\t</languageData>"); 2053 // } 2054 showBasicLanguageData(PrintWriter out, String languageSubtag, Relation<String, String> primaryCombos, Set<String> suppressEmptyScripts, BasicLanguageData.Type type)2055 private static void showBasicLanguageData(PrintWriter out, String languageSubtag, Relation<String, String> primaryCombos, 2056 Set<String> suppressEmptyScripts, BasicLanguageData.Type type) { 2057 Set<String> scriptsWithSameTerritories = new TreeSet<>(); 2058 Set<String> lastTerritories = Collections.emptySet(); 2059 for (String script : primaryCombos.keySet()) { 2060 Set<String> territories = 
primaryCombos.getAll(script); 2061 if (lastTerritories == Collections.EMPTY_SET) { 2062 // skip first 2063 } else if (lastTerritories.equals(territories)) { 2064 scriptsWithSameTerritories.add(script); 2065 } else { 2066 showBasicLanguageData2(out, languageSubtag, scriptsWithSameTerritories, suppressEmptyScripts, 2067 lastTerritories, type); 2068 scriptsWithSameTerritories.clear(); 2069 } 2070 lastTerritories = territories; 2071 scriptsWithSameTerritories.add(script); 2072 } 2073 showBasicLanguageData2(out, languageSubtag, scriptsWithSameTerritories, suppressEmptyScripts, lastTerritories, type); 2074 } 2075 showBasicLanguageData2(PrintWriter out, String languageSubtag, Set<String> scripts, Set<String> suppressEmptyScripts, Set<String> territories, BasicLanguageData.Type type)2076 private static void showBasicLanguageData2(PrintWriter out, String languageSubtag, Set<String> scripts, 2077 Set<String> suppressEmptyScripts, Set<String> territories, BasicLanguageData.Type type) { 2078 scripts.remove("Zzzz"); 2079 territories.remove("ZZ"); 2080 if (territories.size() == 0 && suppressEmptyScripts != null) { 2081 scripts.removeAll(suppressEmptyScripts); 2082 } 2083 if (scripts.size() == 0 && territories.size() == 0) return; 2084 out.println("\t\t<language type=\"" + languageSubtag + "\"" + 2085 (scripts.size() == 0 ? "" : " scripts=\"" + CldrUtility.join(scripts, " ") + "\"") + 2086 (territories.size() == 0 ? "" : " territories=\"" + CldrUtility.join(territories, " ") + "\"") + 2087 (type == BasicLanguageData.Type.primary ? "" : " alt=\"" + type + "\"") + 2088 "/>"); 2089 } 2090 2091 /* 2092 * System.out.println( 2093 * "\t\t<language type=\"" + languageSubtag + "\"" + 2094 * " scripts=\"" + Utility.join(scripts," ") + "\"" + 2095 * (territories.size() == 0 ? 
"" : " territories=\"" + Utility.join(territories," ") + "\"") + 2096 * "/>" 2097 * ); 2098 */ 2099 stripBrackets(String alpha3)2100 private static String stripBrackets(String alpha3) { 2101 if (alpha3.startsWith("[") && alpha3.endsWith("]")) { 2102 alpha3 = alpha3.substring(1, alpha3.length() - 1); 2103 } 2104 return alpha3; 2105 } 2106 2107 static NumberFormat nf = NumberFormat.getInstance(ULocale.ENGLISH); 2108 static NumberFormat nf_no_comma = NumberFormat.getInstance(ULocale.ENGLISH); 2109 static { 2110 nf_no_comma.setGroupingUsed(false); 2111 } 2112 static NumberFormat pf = NumberFormat.getPercentInstance(ULocale.ENGLISH); 2113 formatNumber(double original, int roundDigits, boolean xml)2114 public static String formatNumber(double original, int roundDigits, boolean xml) { 2115 double d = original; 2116 if (roundDigits != 0) { 2117 d = CldrUtility.roundToDecimals(original, roundDigits); 2118 } 2119 if (Double.isNaN(d)) { 2120 d = CldrUtility.roundToDecimals(original, roundDigits); 2121 throw new IllegalArgumentException("Double is NaN"); 2122 } 2123 if (xml) { 2124 return nf_no_comma.format(d); 2125 } 2126 return nf.format(d); 2127 } 2128 formatPercent(double d, int roundDigits, boolean xml)2129 public static String formatPercent(double d, int roundDigits, boolean xml) { 2130 if (roundDigits != 0) { 2131 d = CldrUtility.roundToDecimals(d, roundDigits); 2132 } 2133 if (xml) { 2134 nf_no_comma.setMaximumFractionDigits(roundDigits + 2); 2135 return nf_no_comma.format(d * 100.0); 2136 } 2137 pf.setMaximumFractionDigits(roundDigits + 2); 2138 return pf.format(d); 2139 } 2140 2141 static final LanguageTagCanonicalizer languageTagCanonicalizer = new LanguageTagCanonicalizer(); 2142 fixLanguageCode(String languageCodeRaw, List<String> row)2143 private static String fixLanguageCode(String languageCodeRaw, List<String> row) { 2144 String languageCode = languageTagCanonicalizer.transform(languageCodeRaw); 2145 if (DEBUG && !languageCode.equals(languageCodeRaw)) { 2146 
System.out.println("## " + languageCodeRaw + " => " + languageCode); 2147 } 2148 int bar = languageCode.indexOf('_'); 2149 String script = ""; 2150 if (bar >= 0) { 2151 script = languageCode.substring(bar); 2152 languageCode = languageCode.substring(0, bar); 2153 } 2154 R2<List<String>, String> replacement = supplementalData.getLocaleAliasInfo().get("language").get(languageCode); 2155 if (replacement != null) { 2156 String replacementCode = replacement.get0().get(0); 2157 BadItem.ERROR.show("deprecated language code", languageCode + " => " + replacementCode, row); 2158 languageCode = replacementCode; 2159 } 2160 if (!sc.getAvailableCodes("language").contains(languageCode)) { 2161 BadItem.ERROR.show("bad language code", languageCode, row); 2162 } 2163 return languageCode + script; 2164 } 2165 2166 enum BadItem { 2167 ERROR, WARNING, DETAIL; 2168 show(String problem, String details, String... items)2169 void show(String problem, String details, String... items) { 2170 System.out.println(toString(problem, details, items)); 2171 } 2172 show(String problem, String details, List<String> row)2173 void show(String problem, String details, List<String> row) { 2174 System.out.println(toString(problem, details, row)); 2175 } 2176 toString(String problem, String details, String... items)2177 private String toString(String problem, String details, String... items) { 2178 return toString(problem, details, Arrays.asList(items)); 2179 } 2180 toString(String problem, String details, List<String> row)2181 private String toString(String problem, String details, List<String> row) { 2182 return "* " + this 2183 + " *\t" + problem + ":" 2184 + "\t" + details 2185 + (row != null && row.size() > 0 ? 
"\t" + Joiner.on("\t").join(row) : ""); 2186 } 2187 } 2188 fixCountryCode(String countryCode, List<String> row)2189 private static String fixCountryCode(String countryCode, List<String> row) { 2190 R2<List<String>, String> replacement = supplementalData.getLocaleAliasInfo().get("territory").get(countryCode); 2191 if (replacement != null) { 2192 String replacementCode = replacement.get0().get(0); 2193 BadItem.ERROR.show("deprecated territory code", countryCode + " => " + replacementCode, row); 2194 countryCode = replacementCode; 2195 } 2196 if (!sc.getAvailableCodes("territory").contains(countryCode)) { 2197 BadItem.ERROR.show("bad territory code", countryCode, row); 2198 } 2199 return countryCode; 2200 } 2201 getULocaleLocaleName(String languageCode)2202 private static String getULocaleLocaleName(String languageCode) { 2203 return english.getName(languageCode, true); 2204 //return new ULocale(languageCode).getDisplayName(); 2205 } 2206 getULocaleScriptName(String scriptCode)2207 private static String getULocaleScriptName(String scriptCode) { 2208 return english.getName(CLDRFile.SCRIPT_NAME, scriptCode); 2209 // return ULocale.getDisplayScript("und_" + scriptCode, ULocale.ENGLISH); 2210 } 2211 getULocaleCountryName(String countryCode)2212 private static String getULocaleCountryName(String countryCode) { 2213 return english.getName(CLDRFile.TERRITORY_NAME, countryCode); 2214 //return ULocale.getDisplayCountry("und_" + countryCode, ULocale.ENGLISH); 2215 } 2216 } 2217