1 package org.unicode.cldr.tool; 2 3 import java.io.IOException; 4 import java.io.PrintWriter; 5 import java.lang.invoke.MethodHandles; 6 import java.util.ArrayList; 7 import java.util.Collection; 8 import java.util.Collections; 9 import java.util.Comparator; 10 import java.util.HashMap; 11 import java.util.HashSet; 12 import java.util.LinkedHashSet; 13 import java.util.List; 14 import java.util.Locale; 15 import java.util.Map; 16 import java.util.Map.Entry; 17 import java.util.Set; 18 import java.util.TreeMap; 19 import java.util.TreeSet; 20 import java.util.regex.Pattern; 21 22 import org.unicode.cldr.tool.GenerateSubdivisions.SubdivisionInfo; 23 import org.unicode.cldr.util.CLDRConfig; 24 import org.unicode.cldr.util.CLDRFile; 25 import org.unicode.cldr.util.CLDRPaths; 26 import org.unicode.cldr.util.ChainedMap; 27 import org.unicode.cldr.util.ChainedMap.M3; 28 import org.unicode.cldr.util.DtdType; 29 import org.unicode.cldr.util.Factory; 30 import org.unicode.cldr.util.Pair; 31 import org.unicode.cldr.util.PatternCache; 32 import org.unicode.cldr.util.StandardCodes; 33 import org.unicode.cldr.util.StandardCodes.LstrField; 34 import org.unicode.cldr.util.StandardCodes.LstrType; 35 import org.unicode.cldr.util.SupplementalDataInfo; 36 import org.unicode.cldr.util.Validity; 37 import org.unicode.cldr.util.Validity.Status; 38 import org.unicode.cldr.util.XMLFileReader; 39 import org.unicode.cldr.util.XPathParts; 40 import org.unicode.cldr.util.XPathParts.Comments.CommentType; 41 42 import com.google.common.base.Joiner; 43 import com.ibm.icu.impl.Relation; 44 import com.ibm.icu.impl.Row.R2; 45 import com.ibm.icu.impl.Utility; 46 import com.ibm.icu.lang.UCharacter; 47 import com.ibm.icu.text.CaseMap; 48 import com.ibm.icu.text.LocaleDisplayNames; 49 import com.ibm.icu.text.Normalizer2; 50 import com.ibm.icu.util.ULocale; 51 52 public class SubdivisionNode { 53 private static final Comparator<String> COMPARATOR_ROOT = CLDRConfig.getInstance().getComparatorRoot(); 54 static final SupplementalDataInfo SDI = SupplementalDataInfo.getInstance(); 55 static final Map<String, R2<List<String>, String>> territoryAliases = SDI.getLocaleAliasInfo().get("territory"); 56 static final Set<String> containment = SDI.getContainers(); 57 static final Map<String, Map<LstrField, String>> codeToData = StandardCodes.getEnumLstreg().get(LstrType.region); 58 59 static LocaleDisplayNames ENGLISH_ICU = LocaleDisplayNames.getInstance(ULocale.ENGLISH); 60 61 static final CaseMap.Title TO_TITLE_WHOLE_STRING_NO_LOWERCASE = CaseMap.toTitle().wholeString().noLowercase(); 62 static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance(); 63 static final CLDRFile ENGLISH_CLDR = CLDR_CONFIG.getEnglish(); 64 static final Normalizer2 nfc = Normalizer2.getNFCInstance(); 65 convertToCldr(String regionOrSubdivision)66 public static String convertToCldr(String regionOrSubdivision) { 67 return SubdivisionNames.isRegionCode(regionOrSubdivision) ? regionOrSubdivision.toUpperCase(Locale.ROOT) 68 : regionOrSubdivision.replace("-", "").toLowerCase(Locale.ROOT); 69 } 70 71 final SubdivisionSet sset; 72 final String code; 73 final int level; 74 final SubdivisionNode parent; 75 final Map<String, SubdivisionNode> children = new TreeMap<>(COMPARATOR_ROOT); 76 SubdivisionNode(String code, SubdivisionNode parent, SubdivisionSet sset)77 public SubdivisionNode(String code, SubdivisionNode parent, SubdivisionSet sset) { 78 this.code = code; 79 this.level = parent == null ? -1 : parent.level + 1; 80 this.parent = parent; 81 this.sset = sset; 82 sset.ID_TO_NODE.put(code, this); 83 } 84 addName(String lang, String value)85 public SubdivisionNode addName(String lang, String value) { 86 sset.NAMES.put(code, lang, value); 87 return this; 88 } 89 90 static class SubdivisionSet { 91 92 final M3<String, String, String> NAMES = ChainedMap.of( 93 new TreeMap<String, Object>(), 94 new TreeMap<String, Object>(), 95 String.class); 96 final Map<String, String> TO_COUNTRY_CODE = new TreeMap<>(); 97 final Relation<String, String> ID_SAMPLE = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class); 98 final Map<String, String> SUB_TO_CAT = new TreeMap<>(); 99 final Relation<String, String> REGION_CONTAINS = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class); 100 final Map<String, SubdivisionNode> ID_TO_NODE = new HashMap<>(); 101 102 final SubdivisionNode BASE = new SubdivisionNode("001", null, this).addName("en", "World"); 103 addName(String code, String lang, String value)104 public void addName(String code, String lang, String value) { 105 int parenPos = value.indexOf("(see also separate country"); 106 if (parenPos >= 0) { 107 /* 108 Error: (TestSubdivisions.java:66) : country BQ = subdivisionNL-BQ1: expected "Caribbean Netherlands", got "Bonaire" 109 Error: (TestSubdivisions.java:66) : country BQ = subdivisionNL-BQ2: expected "Caribbean Netherlands", got "Saba" 110 Error: (TestSubdivisions.java:66) : country BQ = subdivisionNL-BQ3: expected "Caribbean Netherlands", got "Sint Eustatius" 111 Error: (TestSubdivisions.java:66) : country SJ = subdivisionNO-21: expected "Svalbard & Jan Mayen", got "Svalbard" 112 Error: (TestSubdivisions.java:66) : country SJ = subdivisionNO-22: expected "Svalbard & Jan Mayen", got "Jan Mayen" 113 */ 114 // OLD code to guess country from comment 115 // String paren = value.substring(value.length() - 3, value.length() - 1); 116 // if (!paren.equals("BQ") && !paren.equals("SJ")) { 117 // String old = TO_COUNTRY_CODE.get(code); 118 // if (old != null) { 119 // System.err.println("Duplicate: " + code + "\t" + old + "\t" + paren); 120 // } 121 // TO_COUNTRY_CODE.put(code, paren); 122 // } 123 value = value.substring(0, parenPos).trim(); 124 } 125 value = value.replace("*", ""); 126 NAMES.put(code, lang, value); 127 } 128 129 130 131 132 static final String[] CRUFT = { 133 "Emirate", 134 "Parish", 135 "County", 136 "District", 137 "Region", 138 "Province of", 139 "Province", 140 "Republic", 141 ", Barbados", 142 ", Burkina Faso", 143 "Governorate", 144 "Department", 145 "Canton of", 146 "(Région des)", 147 "(Région du)", 148 "(Région de la)", 149 "Autonomous", 150 "Archipelago of", 151 "Canton", 152 "kanton", 153 ", Bahamas", 154 "province", 155 "(Région)", 156 "(Région de l')", 157 ", Cameroon", 158 "State of", 159 "State", 160 "Metropolitan Borough of", 161 "London Borough of", 162 "Royal Borough of", 163 "Borough of", 164 "Borough", 165 "Council of", 166 "Council", 167 "City of", 168 ", The", 169 "prefecture", 170 "Prefecture", 171 "municipality" 172 }; 173 174 static final Pattern CRUFT_PATTERN = PatternCache.get("(?i)\\b" + String.join("|", CRUFT) + "\\b"); 175 static final Pattern BRACKETED = PatternCache.get("\\[.*\\]"); 176 clean(String input)177 static String clean(String input) { 178 if (input == null) { 179 return input; 180 } 181 // Quick & dirty 182 input = BRACKETED.matcher(input).replaceAll(""); 183 input = CRUFT_PATTERN.matcher(input).replaceAll(""); 184 // for (String cruft : CRUFT) { 185 // int pos = input.indexOf(cruft); 186 // if (pos >= 0) { 187 // input = input.substring(0,pos) + input.substring(pos + cruft.length()); 188 // } 189 // } 190 input = input.replace(" ", " "); 191 if (input.endsWith(",")) { 192 input = input.substring(0, input.length() - 1); 193 } 194 return fixName(input); 195 } 196 197 198 appendName(CLDRFile fileSubdivisions, final String sdCode, String name, String level)199 private static void appendName(CLDRFile fileSubdivisions, final String sdCode, String name, String level) throws IOException { 200 if (name == null) { 201 return; 202 } 203 String cldrCode = convertToCldr(sdCode); 204 String path = "//ldml/localeDisplayNames/subdivisions/subdivision[@type=\"" + cldrCode + "\"]"; 205 String oldValue = fileSubdivisions.getStringValue(path); 206 if (oldValue != null) { 207 return; // don't override old values 208 } 209 fileSubdivisions.add(path, name); 210 if (level != null) { 211 fileSubdivisions.addComment(path, level, CommentType.LINE); 212 } 213 } 214 isKosher(String regionCode)215 private boolean isKosher(String regionCode) { 216 if (regionCode.equals("001")) { 217 return false; 218 } 219 if (territoryAliases.containsKey(regionCode) 220 || containment.contains(regionCode) 221 || codeToData.get(regionCode).get(LstrField.Description).contains("Private use")) { 222 Set<String> rc = REGION_CONTAINS.get(regionCode); 223 if (rc != null) { 224 throw new IllegalArgumentException("? " + regionCode + ": " + rc); 225 } 226 return false; 227 } 228 return true; 229 } 230 addChildren(Set<SubdivisionNode> ordered, Map<String, SubdivisionNode> children2)231 private static void addChildren(Set<SubdivisionNode> ordered, Map<String, SubdivisionNode> children2) { 232 TreeMap<String, SubdivisionNode> temp = new TreeMap<>(COMPARATOR_ROOT); 233 temp.putAll(children2); 234 ordered.addAll(temp.values()); 235 for (SubdivisionNode n : temp.values()) { 236 if (!n.children.isEmpty()) { 237 addChildren(ordered, n.children); 238 } 239 } 240 } 241 242 static Map<String, String> NAME_CORRECTIONS = new HashMap<>(); 243 getBestName(String value, boolean useIso)244 private String getBestName(String value, boolean useIso) { 245 String cldrName = null; 246 cldrName = NAME_CORRECTIONS.get(value); 247 if (cldrName != null) { 248 return fixName(cldrName); 249 } 250 R2<List<String>, String> subdivisionAlias = SubdivisionInfo.SUBDIVISION_ALIASES_FORMER.get(value); 251 if (subdivisionAlias != null) { 252 String country = subdivisionAlias.get0().get(0); 253 cldrName = ENGLISH_CLDR.getName(CLDRFile.TERRITORY_NAME, country); 254 if (cldrName != null) { 255 return fixName(cldrName); 256 } 257 } 258 259 260 cldrName = SubdivisionInfo.SUBDIVISION_NAMES_ENGLISH_FORMER.get(value); 261 if (cldrName != null) { 262 return fixName(cldrName); 263 } 264 265 Collection<String> oldAliases = SubdivisionInfo.subdivisionIdToOld.get(value); 266 if (oldAliases != null) { 267 for (String oldAlias : oldAliases) { 268 cldrName = SubdivisionInfo.SUBDIVISION_NAMES_ENGLISH_FORMER.get(oldAlias); 269 if (cldrName != null) { 270 return fixName(cldrName); 271 } 272 } 273 } 274 275 if (useIso) { 276 cldrName = getIsoName(value); 277 if (cldrName == null) { 278 cldrName = "UNKNOWN"; 279 //throw new IllegalArgumentException("Failed to find name: " + value); 280 } 281 return fixName(cldrName); 282 } 283 return null; 284 } 285 fixName(String name)286 private static String fixName(String name) { 287 return name == null ? null : nfc.normalize(name.replace('\'', '’').replace(" ", " ").trim()); 288 } 289 SubdivisionSet(String sourceFile)290 public SubdivisionSet(String sourceFile) { 291 292 // <country id="AD" version="16"> 293 // <subdivision-code footnote="*">AD-02</subdivision-code> 294 // <subdivision-locale lang3code="eng" xml:lang="en"> 295 // <subdivision-locale-name>Otago</subdivision-locale-name> 296 297 List<Pair<String, String>> pathValues = XMLFileReader.loadPathValues( 298 sourceFile, 299 new ArrayList<Pair<String, String>>(), false); 300 int maxIndent = 0; 301 SubdivisionNode lastNode = null; 302 String lastCode = null; 303 Set<String> conflictingTargetCountries = new HashSet<>(); 304 305 for (Pair<String, String> pair : pathValues) { 306 String path = pair.getFirst(); 307 boolean code = path.contains("/subdivision-code"); 308 boolean name = path.contains("/subdivision-locale-name"); 309 boolean nameCat = path.contains("/category-name"); 310 boolean relatedCountry = path.contains("/subdivision-related-country"); 311 312 // <country id="AD" version="16"> 313 // <category id="262"> 314 // <category-name lang3code="fra" xml:lang="fr">paroisse</category-name> 315 // <category-name lang3code="eng" xml:lang="en">parish</category-name> 316 // also languages in region... 317 318 // new XML from ISO, so we don't have to guess the country code: 319 // <subdivision-code footnote="*">NL-BQ1</subdivision-code> 320 // <subdivision-related-country country-id="BQ" xml:lang="en">BONAIRE, SINT EUSTATIUS AND SABA</subdivision-related-country> 321 322 if (!code && !name && !nameCat && !relatedCountry) { 323 continue; 324 } 325 XPathParts parts = XPathParts.getFrozenInstance(path); 326 String value = pair.getSecond(); 327 if (relatedCountry) { 328 String target = parts.getAttributeValue(-1, "country-id"); 329 // remove conflicting target countries 330 for (Entry<String, String> entry : TO_COUNTRY_CODE.entrySet()) { 331 if (entry.getValue().equals(target)) { 332 conflictingTargetCountries.add(target); 333 TO_COUNTRY_CODE.remove(entry.getKey(), target); // there can be at most one 334 break; 335 } 336 } 337 if (!conflictingTargetCountries.contains(target)) { 338 TO_COUNTRY_CODE.put(lastCode, target); 339 //System.out.println(lastCode + " => " + target); 340 } 341 } else if (name) { 342 int elementNum = -2; 343 String lang = parts.getAttributeValue(elementNum, "xml:lang"); 344 if (lang == null) { 345 lang = parts.getAttributeValue(elementNum, "lang3code"); 346 } 347 addName(lastCode, lang, value); 348 //output.println(count + Utility.repeat("\t", indent) + "\tlang=" + lang + ":\t«" + value + "»\t"); 349 } else if (nameCat) { 350 //country-codes[@generated="2015-05-04T15:40:13.424465+02:00"]/country[@id="AD"][@version="16"]/category[@id="262"]/category-name[@lang3code="fra"][@xml:lang="fr"] 351 int elementNum = -1; 352 String lang = parts.getAttributeValue(elementNum, "xml:lang"); 353 if (lang == null) { 354 lang = parts.getAttributeValue(elementNum, "lang3code"); 355 } 356 String category = parts.getAttributeValue(-2, "id"); 357 addName(category, lang, value); 358 //output.println(count + Utility.repeat("\t", indent) + "\tlang=" + lang + ":\t«" + value + "»\t"); 359 } else { 360 int countSubdivision = 0; 361 for (int i = 0; i < parts.size(); ++i) { 362 if (parts.getElement(i).equals("subdivision")) { 363 ++countSubdivision; 364 } 365 } 366 if (maxIndent < countSubdivision) { 367 maxIndent = countSubdivision; 368 } 369 value = convertToCldr(value); 370 if (countSubdivision == 1) { 371 lastNode = addNode(null, value); 372 } else { 373 lastNode = addNode(lastNode, value); 374 } 375 lastCode = value; 376 int subdivisionElement = parts.findElement("subdivision"); 377 String id = parts.getAttributeValue(subdivisionElement, "category-id"); 378 addIdSample(id, value); 379 //<subdivision category-id="262">//<subdivision-code footnote="*">AD-06</subdivision-code> 380 // <subdivision category-id="262"> 381 //output.println(++count + Utility.repeat("\t", indent) + "code=" + value); 382 } 383 } 384 } 385 addIdSample(String id, String value)386 public void addIdSample(String id, String value) { 387 SUB_TO_CAT.put(value, id); 388 ID_SAMPLE.put(getIsoName(id), value); 389 } 390 addNode(SubdivisionNode lastSubdivision, String subdivision)391 final SubdivisionNode addNode(SubdivisionNode lastSubdivision, String subdivision) { 392 // "NZ-S", x 393 String region = SubdivisionNames.getRegionFromSubdivision(subdivision); 394 REGION_CONTAINS.put(region, subdivision); 395 if (lastSubdivision == null) { 396 lastSubdivision = BASE.children.get(region); 397 if (lastSubdivision == null) { 398 lastSubdivision = new SubdivisionNode(region, BASE, this).addName("en", ENGLISH_ICU.regionDisplayName(region)); 399 BASE.children.put(region, lastSubdivision); 400 } 401 return add(lastSubdivision, subdivision); 402 } 403 add(lastSubdivision, subdivision); 404 return lastSubdivision; 405 } 406 add(SubdivisionNode subdivisionNode1, String subdivision2)407 private SubdivisionNode add(SubdivisionNode subdivisionNode1, String subdivision2) { 408 SubdivisionNode subdivisionNode2 = subdivisionNode1.children.get(subdivision2); 409 if (subdivisionNode2 == null) { 410 subdivisionNode2 = new SubdivisionNode(subdivision2, subdivisionNode1, this); 411 } 412 subdivisionNode1.children.put(subdivision2, subdivisionNode2); 413 return subdivisionNode2; 414 } 415 getName(SubdivisionNode base2)416 private String getName(SubdivisionNode base2) { 417 return getIsoName(base2.code); 418 } 419 getIsoName(String code)420 private String getIsoName(String code) { 421 if (code == null) { 422 return null; 423 } 424 Map<String, String> map = NAMES.get(code); 425 if (map == null) { 426 return "???"; 427 } 428 String name = map.get("en"); 429 if (name != null) { 430 return name; 431 } 432 name = map.get("es"); 433 if (name != null) { 434 return name; 435 } 436 name = map.get("fr"); 437 if (name != null) { 438 return name; 439 } 440 if (name == null) { 441 name = map.entrySet().iterator().next().getValue(); 442 } 443 return name; 444 } print(PrintWriter out)445 public void print(PrintWriter out) { 446 print(out, 0, "", BASE); 447 for (Entry<String, String> entry : TO_COUNTRY_CODE.entrySet()) { 448 out.println(entry.getKey() + "\t" + entry.getValue()); 449 } 450 } print(PrintWriter out, int indent, String prefix, SubdivisionNode base2)451 private void print(PrintWriter out, int indent, String prefix, SubdivisionNode base2) { 452 if (!prefix.isEmpty()) { 453 prefix += "\t"; 454 } 455 prefix += base2.code; 456 final String indentString = Utility.repeat("\t", 4-indent); 457 out.println(prefix + indentString + getName(base2)); 458 if (base2.children.isEmpty()) { 459 return; 460 } 461 for (SubdivisionNode child : base2.children.values()) { 462 print(out, indent + 1, prefix, child); 463 } 464 } 465 } 466 467 static class SubDivisionExtractor { 468 final SubdivisionSet sdset; 469 final Validity validityFormer; 470 final Map<String, R2<List<String>, String>> subdivisionAliasesFormer; 471 final Relation<String, String> formerRegionToSubdivisions; 472 SubDivisionExtractor(SubdivisionSet sdset, Validity validityFormer, Map<String, R2<List<String>, String>> subdivisionAliasesFormer, Relation<String, String> formerRegionToSubdivisions)473 public SubDivisionExtractor(SubdivisionSet sdset, 474 Validity validityFormer, 475 Map<String, R2<List<String>, String>> subdivisionAliasesFormer, 476 Relation<String, String> formerRegionToSubdivisions) { 477 this.sdset = sdset; 478 this.validityFormer = validityFormer; 479 this.subdivisionAliasesFormer = subdivisionAliasesFormer; 480 this.formerRegionToSubdivisions = formerRegionToSubdivisions; 481 } 482 printXml(Appendable output)483 void printXml(Appendable output) throws IOException { 484 485 /* 486 <subdivisionContainment> 487 <group type="NZ" category="island" contains="NZ-N NZ-S"/> <!-- New Zealand --> 488 <group type="NZ" category="special island authority" contains="NZ-CIT"/> <!-- New Zealand --> 489 <group type="NZ-N" contains="NZ-AUK NZ-BOP NZ-GIS NZ-HKB NZ-MWT NZ-NTL NZ-AUK NZ-TKI NZ-WGN NZ-WKO"/> <!-- North Island --> 490 <group type="NZ-S" contains="NZ-CAN NZ-MBH NZ-STL NZ-NSN NZ-OTA NZ-TAS NZ-WTC"/> <!-- South Island --> 491 </subdivisionContainment> 492 */ 493 output.append( 494 DtdType.supplementalData.header(MethodHandles.lookup().lookupClass()) 495 + "\t<version number=\"$Revision" + "$\"/>\n" 496 + "\t<subdivisionContainment>\n"); 497 printXml(output, sdset.BASE, 0); 498 output.append("\t</subdivisionContainment>\n</supplementalData>\n"); 499 } 500 printAliases(Appendable output)501 void printAliases(Appendable output) throws IOException { 502 addAliases(output, sdset.TO_COUNTRY_CODE.keySet()); 503 504 // Get the old validity data 505 Map<Status, Set<String>> oldSubdivisionData = validityFormer.getStatusToCodes(LstrType.subdivision); 506 Set<String> missing = new TreeSet<>(COMPARATOR_ROOT); 507 missing.addAll(sdset.TO_COUNTRY_CODE.keySet()); 508 Set<String> nowValid = sdset.ID_TO_NODE.keySet(); 509 for (Entry<Status, Set<String>> e : oldSubdivisionData.entrySet()) { 510 Status v = e.getKey(); 511 if (v == Status.unknown) { 512 continue; 513 } 514 Set<String> set = e.getValue(); 515 for (String sdcodeRaw : set) { 516 String sdcode = sdcodeRaw; // .toUpperCase(Locale.ROOT); 517 // sdcode = sdcode.substring(0,2) + "-" + sdcode.substring(2); 518 if (!nowValid.contains(sdcode)) { 519 missing.add(sdcode); 520 } 521 } 522 } 523 missing.removeAll(sdset.TO_COUNTRY_CODE.keySet()); 524 addAliases(output, missing); 525 } 526 addAliases(Appendable output, Set<String> missing)527 private void addAliases(Appendable output, Set<String> missing) throws IOException { 528 for (String toReplace : missing) { 529 List<String> replaceBy = null; 530 String reason = "deprecated"; 531 R2<List<String>, String> aliasInfo = subdivisionAliasesFormer.get(toReplace); 532 if (aliasInfo != null) { 533 replaceBy = aliasInfo.get0(); 534 reason = aliasInfo.get1(); 535 System.out.println("Adding former alias: " + toReplace + " => " + replaceBy); 536 } else { 537 String replacement = sdset.TO_COUNTRY_CODE.get(toReplace); 538 if (replacement != null) { 539 replaceBy = Collections.singletonList(replacement); 540 reason = "overlong"; 541 System.out.println("Adding country code alias: " + toReplace + " => " + replaceBy); 542 } 543 } 544 addAlias(output, toReplace, replaceBy, reason); 545 } 546 } 547 addAlias(Appendable output, final String toReplace, final List<String> replaceBy, final String reason)548 private void addAlias(Appendable output, final String toReplace, final List<String> replaceBy, final String reason) throws IOException { 549 // <languageAlias type="art_lojban" replacement="jbo" reason="deprecated"/> <!-- Lojban --> 550 output.append("\t\t\t"); 551 if (replaceBy == null) { 552 output.append("<!-- "); 553 } 554 output.append("<subdivisionAlias" 555 + " type=\"" + toReplace + "\"" 556 + " replacement=\"" + (replaceBy == null ? toReplace.substring(0, 2) + "?" : 557 Joiner.on(" ").join(replaceBy)) + "\"" 558 + " reason=\"" + reason + "\"/>" 559 + (replaceBy == null ? " <!- - " : " <!-- ") 560 + sdset.getBestName(toReplace, true) + " => " + (replaceBy == null ? "??" : getBestName(replaceBy, true)) + " -->" 561 + "\n"); 562 } 563 getBestName(List<String> replaceBy, boolean useIso)564 private String getBestName(List<String> replaceBy, boolean useIso) { 565 StringBuilder result = new StringBuilder(); 566 for (String s : replaceBy) { 567 if (result.length() != 0) { 568 result.append(", "); 569 } 570 if (SubdivisionNames.isRegionCode(s)) { 571 result.append(ENGLISH_CLDR.getName(CLDRFile.TERRITORY_NAME, s)); 572 } else { 573 result.append(sdset.getBestName(s, useIso)); 574 } 575 } 576 return result.toString(); 577 } 578 printXml(Appendable output, SubdivisionNode base2, int indent)579 private void printXml(Appendable output, SubdivisionNode base2, int indent) throws IOException { 580 if (base2.children.isEmpty()) { 581 return; 582 } 583 String type = base2.code; 584 if (base2 != sdset.BASE) { 585 type = convertToCldr(type); 586 output.append("\t\t" + "<subgroup" 587 + " type=\"" + type + "\"" 588 + " contains=\""); 589 boolean first = true; 590 for (String child : base2.children.keySet()) { 591 if (first) { 592 first = false; 593 } else { 594 output.append(' '); 595 } 596 String subregion = convertToCldr(child); 597 output.append(subregion); 598 } 599 output.append("\"/>\n"); 600 } 601 for (SubdivisionNode child : base2.children.values()) { 602 printXml(output, child, indent); 603 } 604 } 605 printSamples(Appendable pw)606 public void printSamples(Appendable pw) throws IOException { 607 Set<String> seen = new HashSet<>(); 608 for (Entry<String, Set<String>> entry : sdset.ID_SAMPLE.keyValuesSet()) { 609 pw.append(entry.getKey()); 610 //int max = 10; 611 seen.clear(); 612 for (String sample : entry.getValue()) { 613 String region = sample.substring(0, 2); 614 if (seen.contains(region)) { 615 continue; 616 } 617 seen.add(region); 618 pw.append(";\t" + ENGLISH_ICU.regionDisplayName(region) + ": " + sdset.getIsoName(sample) 619 + " (" + sample + ")"); 620 //if (--max < 0) break; 621 } 622 pw.append(System.lineSeparator()); 623 } 624 } 625 printEnglishComp(Appendable output)626 public void printEnglishComp(Appendable output) throws IOException { 627 Set<String> countEqual = new TreeSet<>(); 628 String lastCC = null; 629 output.append("Country\tMID\tSubdivision\tCLDR\tISO\tWikidata\tEqual\n"); 630 for (Entry<String, Set<String>> entry : sdset.REGION_CONTAINS.keyValuesSet()) { 631 final String countryCode = entry.getKey(); 632 if (!countryCode.equals(lastCC)) { 633 if (lastCC != null && countEqual.size() != 0) { 634 output.append(ENGLISH_ICU.regionDisplayName(lastCC) + "\t\t\tEquals:\t" + countEqual.size() + "\t" + countEqual + "\n"); 635 } 636 countEqual.clear(); 637 638 lastCC = countryCode; 639 } 640 for (String value : entry.getValue()) { 641 String cldrName = sdset.getBestName(value, false); 642 String wiki = WikiSubdivisionLanguages.getBestWikiEnglishName(value); 643 final String iso = sdset.getIsoName(value); 644 if (iso.equals(wiki)) { 645 countEqual.add(iso); 646 continue; 647 } 648 output.append( 649 ENGLISH_ICU.regionDisplayName(countryCode) 650 // + "\t" + WikiSubdivisionLanguages.WIKIDATA_TO_MID.get(value) 651 + "\t" + cldrName 652 + "\t" + value 653 + "\t" + iso 654 + "\t" + wiki 655 + "\n"); 656 } 657 } 658 if (countEqual.size() != 0) { 659 output.append(ENGLISH_ICU.regionDisplayName(lastCC) + "\t\t\tEquals:\t" + countEqual.size() + "\t" + countEqual + "\n"); 660 } 661 } 662 printEnglishCompFull(Appendable output)663 public void printEnglishCompFull(Appendable output) throws IOException { 664 output.append("Country\tMID\tSubdivision\tCLDR\tISO\tWikidata\n"); 665 for (Entry<String, Set<String>> entry : sdset.REGION_CONTAINS.keyValuesSet()) { 666 final String countryCode = entry.getKey(); 667 for (String value : entry.getValue()) { 668 String cldrName = sdset.getBestName(value, false); 669 //getBestName(value); 670 String wiki = WikiSubdivisionLanguages.getBestWikiEnglishName(value); 671 final String iso = sdset.getIsoName(value); 672 output.append( 673 ENGLISH_ICU.regionDisplayName(countryCode) 674 // + "\t" + WikiSubdivisionLanguages.WIKIDATA_TO_MID.get(value) 675 + "\t" + value 676 + "\t" + cldrName 677 + "\t" + iso 678 + "\t" + wiki 679 + "\n"); 680 } 681 } 682 } 683 printEnglish(PrintWriter output)684 public void printEnglish(PrintWriter output) throws IOException { 685 TreeSet<String> allRegions = new TreeSet<>(); 686 allRegions.addAll(codeToData.keySet()); 687 allRegions.addAll(formerRegionToSubdivisions.keySet()); // override 688 689 Factory cldrFactorySubdivisions = Factory.make(CLDRPaths.SUBDIVISIONS_DIRECTORY, ".*"); 690 CLDRFile oldFileSubdivisions = cldrFactorySubdivisions.make("en", false); 691 CLDRFile fileSubdivisions = oldFileSubdivisions.cloneAsThawed(); 692 693 Set<String> skipped = new LinkedHashSet<>(); 694 695 for (String regionCode : allRegions) { 696 if (!sdset.isKosher(regionCode)) { 697 if (regionCode.length() != 3) { 698 skipped.add(regionCode); 699 } 700 continue; 701 } 702 Set<String> remainder = formerRegionToSubdivisions.get(regionCode); 703 remainder = remainder == null ? Collections.emptySet() : new LinkedHashSet<>(remainder); 704 705 SubdivisionNode regionNode = sdset.ID_TO_NODE.get(regionCode); 706 if (regionNode == null) { 707 continue; 708 } 709 710 Set<SubdivisionNode> ordered = new LinkedHashSet<>(); 711 SubdivisionSet.addChildren(ordered, regionNode.children); 712 713 for (SubdivisionNode node : ordered) { 714 final String sdCode = node.code; 715 String name = sdset.getBestName(sdCode, true); 716 String upper = UCharacter.toUpperCase(name); 717 String title = SubdivisionNode.TO_TITLE_WHOLE_STRING_NO_LOWERCASE.apply(Locale.ROOT, null, name); 718 if (name.equals(upper) || !name.equals(title)) { 719 System.out.println("Suspicious name: " + name); 720 } 721 SubdivisionSet.appendName(fileSubdivisions, sdCode, name, null); 722 remainder.remove(sdCode); 723 } 724 for (String sdCode : remainder) { 725 String name = sdset.getBestName(sdCode, true); 726 if (!name.equals("???")) { 727 SubdivisionSet.appendName(fileSubdivisions, sdCode, name, "\t<!-- deprecated -->"); 728 } 729 } 730 } 731 System.out.println("Skipping: " + skipped); 732 fileSubdivisions.write(output); 733 } 734 printMissingMIDs(PrintWriter pw)735 public void printMissingMIDs(PrintWriter pw) { 736 // for (Entry<String, String> entry : WikiSubdivisionLanguages.WIKIDATA_TO_MID.entrySet()) { 737 // String mid = entry.getValue(); 738 // if (!mid.isEmpty()) { 739 // continue; 740 // } 741 // String subCode = entry.getKey(); 742 // String wiki = clean(getWikiName(subCode)); 743 // String iso = clean(getIsoName(subCode)); 744 // String countryCode = subCode.substring(0, 2); 745 // String cat = SUB_TO_CAT.get(subCode); 746 // String catName = getIsoName(cat); 747 // pw.append( 748 // ENGLISH_ICU.regionDisplayName(countryCode) 749 // + "\t" + mid 750 // + "\t" + subCode 751 // + "\t" + catName 752 // + "\t" + wiki 753 // + "\t" + iso 754 // + "\n" 755 // ); 756 // } 757 } 758 } 759 }