package org.unicode.cldr.tool;

import java.io.IOException;
import java.io.PrintWriter;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern;

import org.unicode.cldr.tool.GenerateSubdivisions.SubdivisionInfo;
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.ChainedMap;
import org.unicode.cldr.util.ChainedMap.M3;
import org.unicode.cldr.util.DtdType;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.Pair;
import org.unicode.cldr.util.PatternCache;
import org.unicode.cldr.util.StandardCodes;
import org.unicode.cldr.util.StandardCodes.LstrField;
import org.unicode.cldr.util.StandardCodes.LstrType;
import org.unicode.cldr.util.SupplementalDataInfo;
import org.unicode.cldr.util.Validity;
import org.unicode.cldr.util.Validity.Status;
import org.unicode.cldr.util.WikiSubdivisionLanguages;
import org.unicode.cldr.util.XMLFileReader;
import org.unicode.cldr.util.XPathParts;
import org.unicode.cldr.util.XPathParts.Comments.CommentType;

import com.ibm.icu.dev.util.CollectionUtilities;
import com.ibm.icu.impl.Relation;
import com.ibm.icu.impl.Row.R2;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.CaseMap;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.LocaleDisplayNames;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;

/**
 * One node in a tree of subdivision codes. The root of each tree is the world ("001"), its
 * children are regions, and below the regions are the subdivisions read from the ISO source XML.
 * Every node registers itself in its owning {@link SubdivisionSet}.
 */
public class SubdivisionNode {
    static final SupplementalDataInfo SDI = SupplementalDataInfo.getInstance();
    static final Map<String, R2<List<String>, String>> territoryAliases = SDI.getLocaleAliasInfo().get("territory");
    static final Set<String> containment = SDI.getContainers();
    static final Map<String, Map<LstrField, String>> codeToData = StandardCodes.getEnumLstreg().get(LstrType.region);

    static LocaleDisplayNames ENGLISH_ICU = LocaleDisplayNames.getInstance(ULocale.ENGLISH);

    static final CaseMap.Title TO_TITLE_WHOLE_STRING_NO_LOWERCASE = CaseMap.toTitle().wholeString().noLowercase();

    /** English collation with numeric ordering enabled; used wherever codes are kept sorted. */
    static final Comparator<String> ROOT_COL;
    static {
        RuleBasedCollator rootCollator = (RuleBasedCollator) Collator.getInstance(ULocale.ENGLISH);
        rootCollator.setNumericCollation(true);
        rootCollator.freeze();
        // The collator is a Comparator<Object>; narrowing it is safe because only Strings are compared.
        @SuppressWarnings({ "unchecked", "rawtypes" })
        Comparator<String> narrowed = (Comparator) rootCollator;
        ROOT_COL = narrowed;
    }

    static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance();
    static final CLDRFile ENGLISH_CLDR = CLDR_CONFIG.getEnglish();
    static final Normalizer2 nfc = Normalizer2.getNFCInstance();

    /**
     * Converts a code to the form used in CLDR: a region code is upper-cased; anything else has
     * its hyphens removed and is lower-cased (so a subdivision such as "NZ-AUK" becomes "nzauk").
     *
     * @param regionOrSubdivision a region code or a subdivision code
     * @return the CLDR form of the code
     */
    public static String convertToCldr(String regionOrSubdivision) {
        return SubdivisionNames.isRegionCode(regionOrSubdivision)
            ? regionOrSubdivision.toUpperCase(Locale.ROOT)
            : regionOrSubdivision.replace("-", "").toLowerCase(Locale.ROOT);
    }

    final SubdivisionSet sset;
    final String code;
    /** Depth in the tree: -1 for a node without a parent, otherwise the parent's level plus one. */
    final int level;
    final SubdivisionNode parent;
    final Map<String, SubdivisionNode> children = new TreeMap<>(ROOT_COL);

    /**
     * Creates a node and registers it under {@code code} in {@code sset.ID_TO_NODE}. The node is
     * NOT added to the parent's children; callers do that themselves.
     *
     * @param code the code of this node
     * @param parent the parent node, or null for the root
     * @param sset the set that owns this node
     */
    public SubdivisionNode(String code, SubdivisionNode parent, SubdivisionSet sset) {
        this.code = code;
        this.level = parent == null ? -1 : parent.level + 1;
        this.parent = parent;
        this.sset = sset;
        sset.ID_TO_NODE.put(code, this);
    }

    /**
     * Records a name for this node's code in the owning set.
     *
     * @param lang the language of the name
     * @param value the name
     * @return this node, to allow chaining
     */
    public SubdivisionNode addName(String lang, String value) {
        sset.NAMES.put(code, lang, value);
        return this;
    }

    /**
     * Everything read from one ISO subdivision XML file: the node tree plus the name, category and
     * related-country tables built while reading it.
     */
    static class SubdivisionSet {

        /** code (or category id) → language → name. */
        final M3<String, String, String> NAMES = ChainedMap.of(
            new TreeMap<String, Object>(),
            new TreeMap<String, Object>(),
            String.class);
        /** subdivision code → related country id, for targets named by exactly one subdivision. */
        final Map<String, String> TO_COUNTRY_CODE = new TreeMap<String, String>();
        /** category name → subdivision codes in that category. */
        final Relation<String, String> ID_SAMPLE = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
        /** subdivision code → category id. */
        final Map<String, String> SUB_TO_CAT = new TreeMap<>();
        /** region → all subdivision codes (at any depth) in that region. */
        final Relation<String, String> REGION_CONTAINS = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
        final Map<String, SubdivisionNode> ID_TO_NODE = new HashMap<>();

        // Must be declared after NAMES and ID_TO_NODE: the node constructor and addName use both.
        final SubdivisionNode BASE = new SubdivisionNode("001", null, this).addName("en", "World");

        /** Languages tried, in order, when picking a display name from the ISO data. */
        private static final String[] PREFERRED_NAME_LANGUAGES = { "en", "es", "fr" };

        /**
         * Records a name for a code after cleaning it: any trailing
         * "(see also separate country ...)" remark is cut off and asterisks are removed.
         *
         * <p>The country code is deliberately not guessed from that remark; guessing gave wrong
         * results for the BQ and SJ subdivisions (NL-BQ1..3, NO-21, NO-22). The related country is
         * taken from the subdivision-related-country element instead.
         *
         * @param code the subdivision code or category id
         * @param lang the language of the name
         * @param value the raw name from the source file
         */
        public void addName(String code, String lang, String value) {
            int parenPos = value.indexOf("(see also separate country");
            if (parenPos >= 0) {
                value = value.substring(0, parenPos).trim();
            }
            value = value.replace("*", "");
            NAMES.put(code, lang, value);
        }

        /** Category words and country suffixes that {@link #clean(String)} strips from names. */
        static final String[] CRUFT = {
            "Emirate",
            "Parish",
            "County",
            "District",
            "Region",
            "Province of",
            "Province",
            "Republic",
            ", Barbados",
            ", Burkina Faso",
            "Governorate",
            "Department",
            "Canton of",
            "(Région des)",
            "(Région du)",
            "(Région de la)",
            "Autonomous",
            "Archipelago of",
            "Canton",
            "kanton",
            ", Bahamas",
            "province",
            "(Région)",
            "(Région de l')",
            ", Cameroon",
            "State of",
            "State",
            "Metropolitan Borough of",
            "London Borough of",
            "Royal Borough of",
            "Borough of",
            "Borough",
            "Council of",
            "Council",
            "City of",
            ", The",
            "prefecture",
            "Prefecture",
            "municipality"
        };

        // The alternation is grouped so that both word boundaries apply to every alternative, not
        // just to the first and the last one. The entries are used as regex fragments, so the
        // parentheses in entries such as "(Région des)" act as groups, not as literal characters.
        static final Pattern CRUFT_PATTERN = PatternCache.get("(?i)\\b(?:" + CollectionUtilities.join(CRUFT, "|") + ")\\b");
        static final Pattern BRACKETED = PatternCache.get("\\[.*\\]");

        /**
         * Quick-and-dirty normalization of a name: removes bracketed text and the {@link #CRUFT}
         * words, collapses the double spaces that removal leaves behind, drops one trailing comma
         * and finally applies {@link #fixName(String)}.
         *
         * @param input the name to clean, may be null
         * @return the cleaned name, or null if the input was null
         */
        static String clean(String input) {
            if (input == null) {
                return input;
            }
            input = BRACKETED.matcher(input).replaceAll("");
            input = CRUFT_PATTERN.matcher(input).replaceAll("");
            input = input.replace("  ", " ");
            if (input.endsWith(",")) {
                input = input.substring(0, input.length() - 1);
            }
            return fixName(input);
        }

        /**
         * Adds a subdivision name to the file unless the file already has a value for that
         * subdivision; existing values are never overridden.
         *
         * @param fileSubdivisions the (thawed) file to add to
         * @param sdCode the subdivision code, in any form accepted by {@link #convertToCldr(String)}
         * @param name the name to add; nothing happens if null
         * @param level an optional line comment to attach to the new path, or null
         */
        private static void appendName(CLDRFile fileSubdivisions, final String sdCode, String name, String level) throws IOException {
            if (name == null) {
                return;
            }
            String cldrCode = convertToCldr(sdCode);
            String path = "//ldml/localeDisplayNames/subdivisions/subdivision[@type=\"" + cldrCode + "\"]";
            String oldValue = fileSubdivisions.getStringValue(path);
            if (oldValue != null) {
                return; // don't override old values
            }
            fileSubdivisions.add(path, name);
            if (level != null) {
                fileSubdivisions.addComment(path, level, CommentType.LINE);
            }
        }

        /**
         * Tells whether names should be generated for the subdivisions of a region. The world
         * ("001"), aliased territories, containers and private-use codes are rejected.
         *
         * @param regionCode the region to check
         * @return true if the region is a regular region
         * @throws IllegalArgumentException if a rejected region (other than "001") nevertheless
         *             has subdivisions recorded in {@link #REGION_CONTAINS}
         */
        private boolean isKosher(String regionCode) {
            if (regionCode.equals("001")) {
                return false;
            }
            if (territoryAliases.containsKey(regionCode)
                || containment.contains(regionCode)
                || isPrivateUse(regionCode)) {
                Set<String> rc = REGION_CONTAINS.get(regionCode);
                if (rc != null) {
                    throw new IllegalArgumentException("? " + regionCode + ": " + rc);
                }
                return false;
            }
            return true;
        }

        /**
         * Tells whether the registry description of a region mentions "Private use". Regions with
         * no registry data or no description count as not private-use, instead of failing with a
         * NullPointerException.
         */
        private static boolean isPrivateUse(String regionCode) {
            Map<LstrField, String> data = codeToData.get(regionCode);
            if (data == null) {
                return false;
            }
            String description = data.get(LstrField.Description);
            return description != null && description.contains("Private use");
        }

        /**
         * Appends the given nodes, sorted by {@link #ROOT_COL}, to {@code ordered}, and then
         * recursively the descendants of each of them (all siblings come before any of their
         * children).
         */
        private static void addChildren(Set<SubdivisionNode> ordered, Map<String, SubdivisionNode> children2) {
            TreeMap<String, SubdivisionNode> sorted = new TreeMap<>(ROOT_COL);
            sorted.putAll(children2);
            ordered.addAll(sorted.values());
            for (SubdivisionNode node : sorted.values()) {
                if (!node.children.isEmpty()) {
                    addChildren(ordered, node.children);
                }
            }
        }

        /**
         * Manual name overrides, keyed by CLDR subdivision code; consulted first by
         * {@link #getBestName(String, boolean)}. Nothing in this class populates it.
         */
        static Map<String, String> NAME_CORRECTIONS = new HashMap<>();

        /**
         * Picks the best English name for a subdivision. Sources are tried in this order: manual
         * corrections, the territory name of the first replacement in a former alias, the former
         * English subdivision name, the former English name of any old alias of the code, and
         * finally (only if {@code useIso}) the ISO name.
         *
         * @param value the CLDR subdivision code
         * @param useIso whether to fall back to the ISO name
         * @return the name; with {@code useIso} never null ("UNKNOWN" if there is no ISO name),
         *         otherwise null when no source has a name
         */
        private String getBestName(String value, boolean useIso) {
            String cldrName = NAME_CORRECTIONS.get(value);
            if (cldrName != null) {
                return fixName(cldrName);
            }

            R2<List<String>, String> subdivisionAlias = SubdivisionInfo.SUBDIVISION_ALIASES_FORMER.get(value);
            if (subdivisionAlias != null) {
                List<String> replacements = subdivisionAlias.get0();
                if (replacements != null && !replacements.isEmpty()) {
                    String country = replacements.get(0);
                    cldrName = ENGLISH_CLDR.getName(CLDRFile.TERRITORY_NAME, country);
                    if (cldrName != null) {
                        return fixName(cldrName);
                    }
                }
            }

            cldrName = SubdivisionInfo.SUBDIVISION_NAMES_ENGLISH_FORMER.get(value);
            if (cldrName != null) {
                return fixName(cldrName);
            }

            Collection<String> oldAliases = SubdivisionInfo.subdivisionIdToOld.get(value);
            if (oldAliases != null) {
                for (String oldAlias : oldAliases) {
                    cldrName = SubdivisionInfo.SUBDIVISION_NAMES_ENGLISH_FORMER.get(oldAlias);
                    if (cldrName != null) {
                        return fixName(cldrName);
                    }
                }
            }

            if (!useIso) {
                return null;
            }
            cldrName = getIsoName(value);
            if (cldrName == null) {
                cldrName = "UNKNOWN";
            }
            return fixName(cldrName);
        }

        /**
         * Normalizes a name: ASCII apostrophes become U+2019, a double space becomes a single
         * space, surrounding whitespace is trimmed and the result is put into NFC.
         *
         * @param name the name, may be null
         * @return the normalized name, or null if the input was null
         */
        private static String fixName(String name) {
            return name == null ? null : nfc.normalize(name.replace('\'', '\u2019').replace("  ", " ").trim());
        }

        /**
         * Reads the ISO subdivision XML file and builds the node tree and the lookup tables.
         *
         * <p>The relevant parts of the source look like this:
         * <pre>
         * &lt;country id="AD" version="16"&gt;
         *   &lt;category id="262"&gt;
         *     &lt;category-name lang3code="fra" xml:lang="fr"&gt;paroisse&lt;/category-name&gt;
         *     &lt;category-name lang3code="eng" xml:lang="en"&gt;parish&lt;/category-name&gt;
         *   ...
         *   &lt;subdivision category-id="262"&gt;
         *     &lt;subdivision-code footnote="*"&gt;AD-02&lt;/subdivision-code&gt;
         *     &lt;subdivision-locale lang3code="eng" xml:lang="en"&gt;
         *       &lt;subdivision-locale-name&gt;Otago&lt;/subdivision-locale-name&gt;
         *   ...
         *   &lt;subdivision-code footnote="*"&gt;NL-BQ1&lt;/subdivision-code&gt;
         *   &lt;subdivision-related-country country-id="BQ" xml:lang="en"&gt;BONAIRE, SINT EUSTATIUS AND SABA&lt;/subdivision-related-country&gt;
         * </pre>
         * Names and related countries are attached to the most recently seen subdivision code.
         *
         * @param sourceFile the path of the XML file to read
         */
        public SubdivisionSet(String sourceFile) {
            List<Pair<String, String>> pathValues = XMLFileReader.loadPathValues(
                sourceFile,
                new ArrayList<Pair<String, String>>(), false);
            XPathParts parts = new XPathParts();
            SubdivisionNode lastNode = null;
            String lastCode = null;
            Set<String> conflictingTargetCountries = new HashSet<>();

            for (Pair<String, String> pair : pathValues) {
                String path = pair.getFirst();
                boolean code = path.contains("/subdivision-code");
                boolean name = path.contains("/subdivision-locale-name");
                boolean nameCat = path.contains("/category-name");
                boolean relatedCountry = path.contains("/subdivision-related-country");

                if (!code && !name && !nameCat && !relatedCountry) {
                    continue;
                }
                parts.set(path);
                String value = pair.getSecond();
                if (relatedCountry) {
                    final String target = parts.getAttributeValue(-1, "country-id");
                    // A country named by more than one subdivision is ambiguous: drop the mapping
                    // recorded earlier (there can be at most one) and never map to it again.
                    if (TO_COUNTRY_CODE.values().removeIf(existing -> existing.equals(target))) {
                        conflictingTargetCountries.add(target);
                    }
                    if (!conflictingTargetCountries.contains(target)) {
                        TO_COUNTRY_CODE.put(lastCode, target);
                    }
                } else if (name) {
                    // The language sits on the subdivision-locale element, one above the name.
                    addName(lastCode, getLanguage(parts, -2), value);
                } else if (nameCat) {
                    // .../country[@id="AD"]/category[@id="262"]/category-name[@lang3code="fra"][@xml:lang="fr"]
                    String lang = getLanguage(parts, -1);
                    String category = parts.getAttributeValue(-2, "id");
                    addName(category, lang, value);
                } else {
                    int countSubdivision = 0;
                    for (int i = 0; i < parts.size(); ++i) {
                        if (parts.getElement(i).equals("subdivision")) {
                            ++countSubdivision;
                        }
                    }
                    String cldrCode = convertToCldr(value);
                    // A code directly below the country starts a new branch under its region;
                    // a nested one is attached to the current branch node.
                    lastNode = addNode(countSubdivision == 1 ? null : lastNode, cldrCode);
                    lastCode = cldrCode;
                    int subdivisionElement = parts.findElement("subdivision");
                    String id = parts.getAttributeValue(subdivisionElement, "category-id");
                    addIdSample(id, cldrCode);
                }
            }
        }

        /** Returns the xml:lang attribute of the given element, or its lang3code if there is none. */
        private static String getLanguage(XPathParts parts, int elementNum) {
            String lang = parts.getAttributeValue(elementNum, "xml:lang");
            if (lang == null) {
                lang = parts.getAttributeValue(elementNum, "lang3code");
            }
            return lang;
        }

        /**
         * Records the category of a subdivision, and the subdivision as a sample of the category
         * (keyed by the category's ISO name as known at the time of the call).
         *
         * @param id the category id
         * @param value the subdivision code
         */
        public void addIdSample(String id, String value) {
            SUB_TO_CAT.put(value, id);
            ID_SAMPLE.put(getIsoName(id), value);
        }

        /**
         * Adds a subdivision to the tree and to {@link #REGION_CONTAINS}.
         *
         * @param lastSubdivision the node to attach to, or null to attach directly to the region
         *            node (which is created under {@link #BASE} if necessary)
         * @param subdivision the CLDR subdivision code
         * @return the subdivision's own node if it was attached to the region; otherwise
         *         {@code lastSubdivision} itself, so that following nested codes keep attaching
         *         to the same node
         */
        final SubdivisionNode addNode(SubdivisionNode lastSubdivision, String subdivision) {
            String region = SubdivisionNames.getRegionFromSubdivision(subdivision);
            REGION_CONTAINS.put(region, subdivision);
            if (lastSubdivision != null) {
                add(lastSubdivision, subdivision);
                return lastSubdivision;
            }
            SubdivisionNode regionNode = BASE.children.get(region);
            if (regionNode == null) {
                regionNode = new SubdivisionNode(region, BASE, this).addName("en", ENGLISH_ICU.regionDisplayName(region));
                BASE.children.put(region, regionNode);
            }
            return add(regionNode, subdivision);
        }

        /** Returns the child of {@code subdivisionNode1} with the given code, creating it if absent. */
        private SubdivisionNode add(SubdivisionNode subdivisionNode1, String subdivision2) {
            SubdivisionNode child = subdivisionNode1.children.get(subdivision2);
            if (child == null) {
                child = new SubdivisionNode(subdivision2, subdivisionNode1, this);
                subdivisionNode1.children.put(subdivision2, child);
            }
            return child;
        }

        private String getName(SubdivisionNode base2) {
            return getIsoName(base2.code);
        }

        /**
         * Returns the name recorded for a code in {@link #NAMES}, preferring English, then
         * Spanish, then French, then whatever language comes first.
         *
         * @param code the code or category id, may be null
         * @return null for a null code, "???" if no name is recorded, otherwise the name
         */
        private String getIsoName(String code) {
            if (code == null) {
                return null;
            }
            Map<String, String> map = NAMES.get(code);
            if (map == null || map.isEmpty()) {
                return "???";
            }
            for (String lang : PREFERRED_NAME_LANGUAGES) {
                String name = map.get(lang);
                if (name != null) {
                    return name;
                }
            }
            return map.values().iterator().next();
        }

        /**
         * Prints the whole tree, one node per line with its code path and ISO name, followed by
         * the subdivision → related-country table.
         *
         * @param out where to print
         */
        public void print(PrintWriter out) {
            print(out, 0, "", BASE);
            for (Entry<String, String> entry : TO_COUNTRY_CODE.entrySet()) {
                out.println(entry.getKey() + "\t" + entry.getValue());
            }
        }

        /** Prints one node (prefixed by the tab-separated codes of its ancestors) and its subtree. */
        private void print(PrintWriter out, int indent, String prefix, SubdivisionNode base2) {
            if (!prefix.isEmpty()) {
                prefix += "\t";
            }
            prefix += base2.code;
            // Pad so that the name column lines up regardless of depth.
            final String indentString = Utility.repeat("\t", 4 - indent);
            out.println(prefix + indentString + getName(base2));
            for (SubdivisionNode child : base2.children.values()) {
                print(out, indent + 1, prefix, child);
            }
        }
    }

    /**
     * Writes the data of a {@link SubdivisionSet} in various output formats, comparing against
     * the data of the former release where needed.
     */
    static class SubDivisionExtractor {
        final SubdivisionSet sdset;
        final Validity validityFormer;
        final Map<String, R2<List<String>, String>> subdivisionAliasesFormer;
        final Relation<String, String> formerRegionToSubdivisions;

        /**
         * @param sdset the freshly read subdivision data
         * @param validityFormer the validity data of the former release
         * @param subdivisionAliasesFormer the subdivision aliases of the former release
         * @param formerRegionToSubdivisions region → subdivisions in the former release
         */
        public SubDivisionExtractor(SubdivisionSet sdset,
            Validity validityFormer,
            Map<String, R2<List<String>, String>> subdivisionAliasesFormer,
            Relation<String, String> formerRegionToSubdivisions) {
            this.sdset = sdset;
            this.validityFormer = validityFormer;
            this.subdivisionAliasesFormer = subdivisionAliasesFormer;
            this.formerRegionToSubdivisions = formerRegionToSubdivisions;
        }

        /**
         * Writes the complete subdivisionContainment supplemental data file: header, one
         * subgroup element per node that has children, and the closing tags.
         *
         * @param output where to write
         */
        void printXml(Appendable output) throws IOException {
            output.append(
                DtdType.supplementalData.header(MethodHandles.lookup().lookupClass())
                    + "\t<version number=\"$Revision" /*hack to stop SVN changing this*/ + "$\"/>\n"
                    + "\t<subdivisionContainment>\n");
            printXml(output, sdset.BASE, 0);
            output.append("\t</subdivisionContainment>\n</supplementalData>\n");
        }

        /**
         * Writes subdivisionAlias elements: first for every subdivision that has a related
         * country, then for every code that was known (with a status other than unknown) in the
         * former release but is no longer present.
         *
         * @param output where to write
         */
        void printAliases(Appendable output) throws IOException {
            addAliases(output, sdset.TO_COUNTRY_CODE.keySet());

            // Get the old validity data
            Map<Status, Set<String>> oldSubdivisionData = validityFormer.getStatusToCodes(LstrType.subdivision);
            Set<String> missing = new TreeSet<>(ROOT_COL);
            Set<String> nowValid = sdset.ID_TO_NODE.keySet();
            for (Entry<Status, Set<String>> e : oldSubdivisionData.entrySet()) {
                if (e.getKey() == Status.unknown) {
                    continue;
                }
                for (String sdcode : e.getValue()) {
                    if (!nowValid.contains(sdcode)) {
                        missing.add(sdcode);
                    }
                }
            }
            // Codes with a related country were already written above.
            missing.removeAll(sdset.TO_COUNTRY_CODE.keySet());
            addAliases(output, missing);
        }

        /**
         * Writes one alias per code. The replacement comes from the former aliases if there is
         * one, otherwise from the related country (reason "overlong"); if neither exists the
         * alias is written commented out.
         */
        private void addAliases(Appendable output, Set<String> missing) throws IOException {
            for (String toReplace : missing) {
                List<String> replaceBy = null;
                String reason = "deprecated";
                R2<List<String>, String> aliasInfo = subdivisionAliasesFormer.get(toReplace);
                if (aliasInfo != null) {
                    replaceBy = aliasInfo.get0();
                    reason = aliasInfo.get1();
                    System.out.println("Adding former alias: " + toReplace + " => " + replaceBy);
                } else {
                    String replacement = sdset.TO_COUNTRY_CODE.get(toReplace);
                    if (replacement != null) {
                        replaceBy = Collections.singletonList(replacement);
                        reason = "overlong";
                        System.out.println("Adding country code alias: " + toReplace + " => " + replaceBy);
                    }
                }
                addAlias(output, toReplace, replaceBy, reason);
            }
        }

        /**
         * Writes a single subdivisionAlias line followed by a comment with the names involved.
         * Without a replacement the whole element is wrapped in an XML comment, and the inner
         * comment opener is written as "&lt;!- -" so that the comments do not nest.
         */
        private void addAlias(Appendable output, final String toReplace, final List<String> replaceBy, final String reason) throws IOException {
            // <languageAlias type="art_lojban" replacement="jbo" reason="deprecated"/> <!-- Lojban -->
            output.append("\t\t\t");
            if (replaceBy == null) {
                output.append("<!-- ");
            }
            output.append("<subdivisionAlias"
                + " type=\"" + toReplace + "\""
                + " replacement=\"" + (replaceBy == null ? toReplace.substring(0, 2) + "?" : CollectionUtilities.join(replaceBy, " ")) + "\""
                + " reason=\"" + reason + "\"/>"
                + (replaceBy == null ? " <!- - " : " <!-- ")
                + sdset.getBestName(toReplace, true) + " => " + (replaceBy == null ? "??" : getBestName(replaceBy, true)) + " -->"
                + "\n");
        }

        /** Returns the comma-separated names of the given region or subdivision codes. */
        private String getBestName(List<String> replaceBy, boolean useIso) {
            StringBuilder result = new StringBuilder();
            for (String s : replaceBy) {
                if (result.length() != 0) {
                    result.append(", ");
                }
                if (SubdivisionNames.isRegionCode(s)) {
                    result.append(ENGLISH_CLDR.getName(CLDRFile.TERRITORY_NAME, s));
                } else {
                    result.append(sdset.getBestName(s, useIso));
                }
            }
            return result.toString();
        }

        /**
         * Writes a subgroup element listing the children of {@code base2} (nothing is written for
         * the root itself), then recurses into the children. Leaf nodes produce no output.
         * The {@code indent} argument is passed along but not used.
         */
        private void printXml(Appendable output, SubdivisionNode base2, int indent) throws IOException {
            if (base2.children.isEmpty()) {
                return;
            }
            if (base2 != sdset.BASE) {
                String type = convertToCldr(base2.code);
                output.append("\t\t" + "<subgroup"
                    + " type=\"" + type + "\""
                    + " contains=\"");
                boolean first = true;
                for (String child : base2.children.keySet()) {
                    if (first) {
                        first = false;
                    } else {
                        output.append(' ');
                    }
                    output.append(convertToCldr(child));
                }
                output.append("\"/>\n");
            }
            for (SubdivisionNode child : base2.children.values()) {
                printXml(output, child, indent);
            }
        }

        /**
         * Writes, for every category name, one sample subdivision per region.
         *
         * @param pw where to write
         */
        public void printSamples(Appendable pw) throws IOException {
            Set<String> seen = new HashSet<>();
            for (Entry<String, Set<String>> entry : sdset.ID_SAMPLE.keyValuesSet()) {
                pw.append(entry.getKey());
                seen.clear();
                for (String sample : entry.getValue()) {
                    // NOTE(review): samples are stored in CLDR form, so this prefix is presumably
                    // lower case and assumes a two-letter region; confirm regionDisplayName copes.
                    String region = sample.substring(0, 2);
                    if (!seen.add(region)) {
                        continue;
                    }
                    pw.append(";\t" + ENGLISH_ICU.regionDisplayName(region) + ": " + sdset.getIsoName(sample)
                        + " (" + sample + ")");
                }
                pw.append(System.lineSeparator());
            }
        }

        /**
         * Writes a comparison of CLDR, ISO and Wikidata names for every subdivision whose ISO and
         * Wikidata names differ; the ones that agree are only counted and summarized per region.
         *
         * @param output where to write
         */
        public void printEnglishComp(Appendable output) throws IOException {
            Set<String> countEqual = new TreeSet<>();
            String lastCC = null;
            output.append("Country\tMID\tSubdivision\tCLDR\tISO\tWikidata\tEqual\n");
            for (Entry<String, Set<String>> entry : sdset.REGION_CONTAINS.keyValuesSet()) {
                final String countryCode = entry.getKey();
                if (!countryCode.equals(lastCC)) {
                    if (lastCC != null && countEqual.size() != 0) {
                        output.append(ENGLISH_ICU.regionDisplayName(lastCC) + "\t\t\tEquals:\t" + countEqual.size() + "\t" + countEqual + "\n");
                    }
                    countEqual.clear();
                    lastCC = countryCode;
                }
                for (String value : entry.getValue()) {
                    String cldrName = sdset.getBestName(value, false);
                    String wiki = WikiSubdivisionLanguages.getBestWikiEnglishName(value);
                    final String iso = sdset.getIsoName(value);
                    if (iso.equals(wiki)) {
                        countEqual.add(iso);
                        continue;
                    }
                    // NOTE(review): no MID column is written, and the CLDR name precedes the code
                    // although the header lists "Subdivision" before "CLDR"; confirm the intent.
                    output.append(
                        ENGLISH_ICU.regionDisplayName(countryCode)
                            + "\t" + cldrName
                            + "\t" + value
                            + "\t" + iso
                            + "\t" + wiki
                            + "\n");
                }
            }
            if (countEqual.size() != 0) {
                output.append(ENGLISH_ICU.regionDisplayName(lastCC) + "\t\t\tEquals:\t" + countEqual.size() + "\t" + countEqual + "\n");
            }
        }

        /**
         * Writes the CLDR, ISO and Wikidata names of every subdivision, one line each.
         *
         * @param output where to write
         */
        public void printEnglishCompFull(Appendable output) throws IOException {
            output.append("Country\tMID\tSubdivision\tCLDR\tISO\tWikidata\n");
            for (Entry<String, Set<String>> entry : sdset.REGION_CONTAINS.keyValuesSet()) {
                final String countryCode = entry.getKey();
                for (String value : entry.getValue()) {
                    String cldrName = sdset.getBestName(value, false);
                    String wiki = WikiSubdivisionLanguages.getBestWikiEnglishName(value);
                    final String iso = sdset.getIsoName(value);
                    output.append(
                        ENGLISH_ICU.regionDisplayName(countryCode)
                            + "\t" + value
                            + "\t" + cldrName
                            + "\t" + iso
                            + "\t" + wiki
                            + "\n");
                }
            }
        }

        /**
         * Writes the English subdivision names file. Starts from the existing "en" subdivisions
         * file and adds a name for every subdivision that has none yet: first the current
         * subdivisions of each regular region, then the region's former subdivisions that are no
         * longer present (marked deprecated). Also reports suspicious names (all upper case, or
         * not title case) and the skipped regions on standard output.
         *
         * @param output where to write the resulting file
         */
        public void printEnglish(PrintWriter output) throws IOException {
            TreeSet<String> allRegions = new TreeSet<>();
            allRegions.addAll(codeToData.keySet());
            allRegions.addAll(formerRegionToSubdivisions.keySet()); // override

            Factory cldrFactorySubdivisions = Factory.make(CLDRPaths.SUBDIVISIONS_DIRECTORY, ".*");
            CLDRFile oldFileSubdivisions = cldrFactorySubdivisions.make("en", false);
            CLDRFile fileSubdivisions = oldFileSubdivisions.cloneAsThawed();

            Set<String> skipped = new LinkedHashSet<>();

            for (String regionCode : allRegions) {
                if (!sdset.isKosher(regionCode)) {
                    // Three-character (numeric) regions are skipped silently.
                    if (regionCode.length() != 3) {
                        skipped.add(regionCode);
                    }
                    continue;
                }
                SubdivisionNode regionNode = sdset.ID_TO_NODE.get(regionCode);
                if (regionNode == null) {
                    continue; // no subdivisions
                }

                // Former subdivisions of the region that the current data does not cover.
                Set<String> former = formerRegionToSubdivisions.get(regionCode);
                Set<String> remainder = former == null ? new LinkedHashSet<String>() : new LinkedHashSet<>(former);

                Set<SubdivisionNode> ordered = new LinkedHashSet<>();
                SubdivisionSet.addChildren(ordered, regionNode.children);

                for (SubdivisionNode node : ordered) {
                    final String sdCode = node.code;
                    String name = sdset.getBestName(sdCode, true);
                    String upper = UCharacter.toUpperCase(name);
                    String title = SubdivisionNode.TO_TITLE_WHOLE_STRING_NO_LOWERCASE.apply(Locale.ROOT, null, name);
                    if (name.equals(upper) || !name.equals(title)) {
                        System.out.println("Suspicious name: " + name);
                    }
                    SubdivisionSet.appendName(fileSubdivisions, sdCode, name, null);
                    remainder.remove(sdCode);
                }
                for (String sdCode : remainder) {
                    String name = sdset.getBestName(sdCode, true);
                    if (!name.equals("???")) {
                        SubdivisionSet.appendName(fileSubdivisions, sdCode, name, "\t<!-- deprecated -->");
                    }
                }
            }
            System.out.println("Skipping: " + skipped);
            fileSubdivisions.write(output);
        }

        /**
         * Currently does nothing; kept so that existing callers continue to compile.
         *
         * @param pw unused
         */
        public void printMissingMIDs(PrintWriter pw) {
            // Intentionally empty.
        }
    }
}