• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.tool;
2 
3 import java.io.IOException;
4 import java.io.PrintWriter;
5 import java.lang.invoke.MethodHandles;
6 import java.util.ArrayList;
7 import java.util.Collection;
8 import java.util.Collections;
9 import java.util.Comparator;
10 import java.util.HashMap;
11 import java.util.HashSet;
12 import java.util.LinkedHashSet;
13 import java.util.List;
14 import java.util.Locale;
15 import java.util.Map;
16 import java.util.Map.Entry;
17 import java.util.Set;
18 import java.util.TreeMap;
19 import java.util.TreeSet;
20 import java.util.regex.Pattern;
21 
22 import org.unicode.cldr.tool.GenerateSubdivisions.SubdivisionInfo;
23 import org.unicode.cldr.util.CLDRConfig;
24 import org.unicode.cldr.util.CLDRFile;
25 import org.unicode.cldr.util.CLDRPaths;
26 import org.unicode.cldr.util.ChainedMap;
27 import org.unicode.cldr.util.ChainedMap.M3;
28 import org.unicode.cldr.util.DtdType;
29 import org.unicode.cldr.util.Factory;
30 import org.unicode.cldr.util.Pair;
31 import org.unicode.cldr.util.PatternCache;
32 import org.unicode.cldr.util.StandardCodes;
33 import org.unicode.cldr.util.StandardCodes.LstrField;
34 import org.unicode.cldr.util.StandardCodes.LstrType;
35 import org.unicode.cldr.util.SupplementalDataInfo;
36 import org.unicode.cldr.util.Validity;
37 import org.unicode.cldr.util.Validity.Status;
38 import org.unicode.cldr.util.XMLFileReader;
39 import org.unicode.cldr.util.XPathParts;
40 import org.unicode.cldr.util.XPathParts.Comments.CommentType;
41 
42 import com.google.common.base.Joiner;
43 import com.ibm.icu.impl.Relation;
44 import com.ibm.icu.impl.Row.R2;
45 import com.ibm.icu.impl.Utility;
46 import com.ibm.icu.lang.UCharacter;
47 import com.ibm.icu.text.CaseMap;
48 import com.ibm.icu.text.LocaleDisplayNames;
49 import com.ibm.icu.text.Normalizer2;
50 import com.ibm.icu.util.ULocale;
51 
52 public class SubdivisionNode {
53     private static final Comparator<String> COMPARATOR_ROOT = CLDRConfig.getInstance().getComparatorRoot();
54     static final SupplementalDataInfo SDI = SupplementalDataInfo.getInstance();
55     static final Map<String, R2<List<String>, String>> territoryAliases = SDI.getLocaleAliasInfo().get("territory");
56     static final Set<String> containment = SDI.getContainers();
57     static final Map<String, Map<LstrField, String>> codeToData = StandardCodes.getEnumLstreg().get(LstrType.region);
58 
59     static LocaleDisplayNames ENGLISH_ICU = LocaleDisplayNames.getInstance(ULocale.ENGLISH);
60 
61     static final CaseMap.Title TO_TITLE_WHOLE_STRING_NO_LOWERCASE = CaseMap.toTitle().wholeString().noLowercase();
62     static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance();
63     static final CLDRFile ENGLISH_CLDR = CLDR_CONFIG.getEnglish();
64     static final Normalizer2 nfc = Normalizer2.getNFCInstance();
65 
convertToCldr(String regionOrSubdivision)66     public static String convertToCldr(String regionOrSubdivision) {
67         return SubdivisionNames.isRegionCode(regionOrSubdivision) ? regionOrSubdivision.toUpperCase(Locale.ROOT)
68             : regionOrSubdivision.replace("-", "").toLowerCase(Locale.ROOT);
69     }
70 
71     final SubdivisionSet sset;
72     final String code;
73     final int level;
74     final SubdivisionNode parent;
75     final Map<String, SubdivisionNode> children = new TreeMap<>(COMPARATOR_ROOT);
76 
SubdivisionNode(String code, SubdivisionNode parent, SubdivisionSet sset)77     public SubdivisionNode(String code, SubdivisionNode parent, SubdivisionSet sset) {
78         this.code = code;
79         this.level = parent == null ? -1 : parent.level + 1;
80         this.parent = parent;
81         this.sset = sset;
82         sset.ID_TO_NODE.put(code, this);
83     }
84 
addName(String lang, String value)85     public SubdivisionNode addName(String lang, String value) {
86         sset.NAMES.put(code, lang, value);
87         return this;
88     }
89 
90     static class SubdivisionSet {
91 
92 		final M3<String, String, String> NAMES = ChainedMap.of(
93             new TreeMap<String, Object>(),
94             new TreeMap<String, Object>(),
95             String.class);
96         final Map<String, String> TO_COUNTRY_CODE = new TreeMap<>();
97         final Relation<String, String> ID_SAMPLE = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
98         final Map<String, String> SUB_TO_CAT = new TreeMap<>();
99         final Relation<String, String> REGION_CONTAINS = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
100         final Map<String, SubdivisionNode> ID_TO_NODE = new HashMap<>();
101 
102         final SubdivisionNode BASE = new SubdivisionNode("001", null, this).addName("en", "World");
103 
addName(String code, String lang, String value)104         public void addName(String code, String lang, String value) {
105             int parenPos = value.indexOf("(see also separate country");
106             if (parenPos >= 0) {
107                 /*
108                 Error: (TestSubdivisions.java:66) : country BQ = subdivisionNL-BQ1: expected "Caribbean Netherlands", got "Bonaire"
109                 Error: (TestSubdivisions.java:66) : country BQ = subdivisionNL-BQ2: expected "Caribbean Netherlands", got "Saba"
110                 Error: (TestSubdivisions.java:66) : country BQ = subdivisionNL-BQ3: expected "Caribbean Netherlands", got "Sint Eustatius"
111                 Error: (TestSubdivisions.java:66) : country SJ = subdivisionNO-21: expected "Svalbard & Jan Mayen", got "Svalbard"
112                 Error: (TestSubdivisions.java:66) : country SJ = subdivisionNO-22: expected "Svalbard & Jan Mayen", got "Jan Mayen"
113                  */
114                 // OLD code to guess country from comment
115 //              String paren = value.substring(value.length() - 3, value.length() - 1);
116 //                if (!paren.equals("BQ") && !paren.equals("SJ")) {
117 //                    String old = TO_COUNTRY_CODE.get(code);
118 //                    if (old != null) {
119 //                        System.err.println("Duplicate: " + code + "\t" + old + "\t" + paren);
120 //                    }
121 //                    TO_COUNTRY_CODE.put(code, paren);
122 //                }
123                 value = value.substring(0, parenPos).trim();
124             }
125             value = value.replace("*", "");
126             NAMES.put(code, lang, value);
127         }
128 
129 
130 
131 
132         static final String[] CRUFT = {
133             "Emirate",
134             "Parish",
135             "County",
136             "District",
137             "Region",
138             "Province of",
139             "Province",
140             "Republic",
141             ", Barbados",
142             ", Burkina Faso",
143             "Governorate",
144             "Department",
145             "Canton of",
146             "(Région des)",
147             "(Région du)",
148             "(Région de la)",
149             "Autonomous",
150             "Archipelago of",
151             "Canton",
152             "kanton",
153             ", Bahamas",
154             "province",
155             "(Région)",
156             "(Région de l')",
157             ", Cameroon",
158             "State of",
159             "State",
160             "Metropolitan Borough of",
161             "London Borough of",
162             "Royal Borough of",
163             "Borough of",
164             "Borough",
165             "Council of",
166             "Council",
167             "City of",
168             ", The",
169             "prefecture",
170             "Prefecture",
171             "municipality"
172         };
173 
174         static final Pattern CRUFT_PATTERN = PatternCache.get("(?i)\\b" + String.join("|", CRUFT) + "\\b");
175         static final Pattern BRACKETED = PatternCache.get("\\[.*\\]");
176 
clean(String input)177         static String clean(String input) {
178             if (input == null) {
179                 return input;
180             }
181             // Quick & dirty
182             input = BRACKETED.matcher(input).replaceAll("");
183             input = CRUFT_PATTERN.matcher(input).replaceAll("");
184 //            for (String cruft : CRUFT) {
185 //                int pos = input.indexOf(cruft);
186 //                if (pos >= 0) {
187 //                    input = input.substring(0,pos) + input.substring(pos + cruft.length());
188 //                }
189 //            }
190             input = input.replace("  ", " ");
191             if (input.endsWith(",")) {
192                 input = input.substring(0, input.length() - 1);
193             }
194             return fixName(input);
195         }
196 
197 
198 
appendName(CLDRFile fileSubdivisions, final String sdCode, String name, String level)199         private static void appendName(CLDRFile fileSubdivisions, final String sdCode, String name, String level) throws IOException {
200             if (name == null) {
201                 return;
202             }
203             String cldrCode = convertToCldr(sdCode);
204             String path = "//ldml/localeDisplayNames/subdivisions/subdivision[@type=\"" + cldrCode + "\"]";
205             String oldValue = fileSubdivisions.getStringValue(path);
206             if (oldValue != null) {
207                 return; // don't override old values
208             }
209             fileSubdivisions.add(path, name);
210             if (level != null) {
211                 fileSubdivisions.addComment(path, level, CommentType.LINE);
212             }
213         }
214 
isKosher(String regionCode)215         private boolean isKosher(String regionCode) {
216             if (regionCode.equals("001")) {
217                 return false;
218             }
219             if (territoryAliases.containsKey(regionCode)
220                 || containment.contains(regionCode)
221                 || codeToData.get(regionCode).get(LstrField.Description).contains("Private use")) {
222                 Set<String> rc = REGION_CONTAINS.get(regionCode);
223                 if (rc != null) {
224                     throw new IllegalArgumentException("? " + regionCode + ": " + rc);
225                 }
226                 return false;
227             }
228             return true;
229         }
230 
addChildren(Set<SubdivisionNode> ordered, Map<String, SubdivisionNode> children2)231         private static void addChildren(Set<SubdivisionNode> ordered, Map<String, SubdivisionNode> children2) {
232             TreeMap<String, SubdivisionNode> temp = new TreeMap<>(COMPARATOR_ROOT);
233             temp.putAll(children2);
234             ordered.addAll(temp.values());
235             for (SubdivisionNode n : temp.values()) {
236                 if (!n.children.isEmpty()) {
237                     addChildren(ordered, n.children);
238                 }
239             }
240         }
241 
242         static Map<String, String> NAME_CORRECTIONS = new HashMap<>();
243 
getBestName(String value, boolean useIso)244         private String getBestName(String value, boolean useIso) {
245             String cldrName = null;
246             cldrName = NAME_CORRECTIONS.get(value);
247             if (cldrName != null) {
248                 return fixName(cldrName);
249             }
250             R2<List<String>, String> subdivisionAlias = SubdivisionInfo.SUBDIVISION_ALIASES_FORMER.get(value);
251             if (subdivisionAlias != null) {
252                 String country = subdivisionAlias.get0().get(0);
253                 cldrName = ENGLISH_CLDR.getName(CLDRFile.TERRITORY_NAME, country);
254                 if (cldrName != null) {
255                     return fixName(cldrName);
256                 }
257             }
258 
259 
260             cldrName = SubdivisionInfo.SUBDIVISION_NAMES_ENGLISH_FORMER.get(value);
261             if (cldrName != null) {
262                 return fixName(cldrName);
263             }
264 
265             Collection<String> oldAliases = SubdivisionInfo.subdivisionIdToOld.get(value);
266             if (oldAliases != null) {
267                 for (String oldAlias : oldAliases) {
268                     cldrName = SubdivisionInfo.SUBDIVISION_NAMES_ENGLISH_FORMER.get(oldAlias);
269                     if (cldrName != null) {
270                         return fixName(cldrName);
271                     }
272                 }
273             }
274 
275             if (useIso) {
276                 cldrName = getIsoName(value);
277                 if (cldrName == null) {
278                     cldrName = "UNKNOWN";
279                     //throw new IllegalArgumentException("Failed to find name: " + value);
280                 }
281                 return fixName(cldrName);
282             }
283             return null;
284         }
285 
fixName(String name)286         private static String fixName(String name) {
287             return name == null ? null : nfc.normalize(name.replace('\'', '’').replace("  ", " ").trim());
288         }
289 
SubdivisionSet(String sourceFile)290         public SubdivisionSet(String sourceFile) {
291 
292             //    <country id="AD" version="16">
293             //           <subdivision-code footnote="*">AD-02</subdivision-code>
294             //             <subdivision-locale lang3code="eng" xml:lang="en">
295             //                  <subdivision-locale-name>Otago</subdivision-locale-name>
296 
297             List<Pair<String, String>> pathValues = XMLFileReader.loadPathValues(
298                 sourceFile,
299                 new ArrayList<Pair<String, String>>(), false);
300             int maxIndent = 0;
301             SubdivisionNode lastNode = null;
302             String lastCode = null;
303             Set<String> conflictingTargetCountries = new HashSet<>();
304 
305             for (Pair<String, String> pair : pathValues) {
306                 String path = pair.getFirst();
307                 boolean code = path.contains("/subdivision-code");
308                 boolean name = path.contains("/subdivision-locale-name");
309                 boolean nameCat = path.contains("/category-name");
310                 boolean relatedCountry = path.contains("/subdivision-related-country");
311 
312                 //    <country id="AD" version="16">
313                 //       <category id="262">
314                 //  <category-name lang3code="fra" xml:lang="fr">paroisse</category-name>
315                 //  <category-name lang3code="eng" xml:lang="en">parish</category-name>
316                 // also languages in region...
317 
318                 // new XML from ISO, so we don't have to guess the country code:
319                 //            <subdivision-code footnote="*">NL-BQ1</subdivision-code>
320                 //            <subdivision-related-country country-id="BQ" xml:lang="en">BONAIRE, SINT EUSTATIUS AND SABA</subdivision-related-country>
321 
322                 if (!code && !name && !nameCat && !relatedCountry) {
323                     continue;
324                 }
325                 XPathParts parts = XPathParts.getFrozenInstance(path);
326                 String value = pair.getSecond();
327                 if (relatedCountry) {
328                     String target = parts.getAttributeValue(-1, "country-id");
329                     // remove conflicting target countries
330                     for (Entry<String, String> entry : TO_COUNTRY_CODE.entrySet()) {
331                         if (entry.getValue().equals(target)) {
332                             conflictingTargetCountries.add(target);
333                             TO_COUNTRY_CODE.remove(entry.getKey(), target); // there can be at most one
334                             break;
335                         }
336                     }
337                     if (!conflictingTargetCountries.contains(target)) {
338                         TO_COUNTRY_CODE.put(lastCode, target);
339                         //System.out.println(lastCode + " => " + target);
340                     }
341                 } else if (name) {
342                     int elementNum = -2;
343                     String lang = parts.getAttributeValue(elementNum, "xml:lang");
344                     if (lang == null) {
345                         lang = parts.getAttributeValue(elementNum, "lang3code");
346                     }
347                     addName(lastCode, lang, value);
348                     //output.println(count + Utility.repeat("\t", indent) + "\tlang=" + lang + ":\t«" + value + "»\t");
349                 } else if (nameCat) {
350                     //country-codes[@generated="2015-05-04T15:40:13.424465+02:00"]/country[@id="AD"][@version="16"]/category[@id="262"]/category-name[@lang3code="fra"][@xml:lang="fr"]
351                     int elementNum = -1;
352                     String lang = parts.getAttributeValue(elementNum, "xml:lang");
353                     if (lang == null) {
354                         lang = parts.getAttributeValue(elementNum, "lang3code");
355                     }
356                     String category = parts.getAttributeValue(-2, "id");
357                     addName(category, lang, value);
358                     //output.println(count + Utility.repeat("\t", indent) + "\tlang=" + lang + ":\t«" + value + "»\t");
359                 } else {
360                     int countSubdivision = 0;
361                     for (int i = 0; i < parts.size(); ++i) {
362                         if (parts.getElement(i).equals("subdivision")) {
363                             ++countSubdivision;
364                         }
365                     }
366                     if (maxIndent < countSubdivision) {
367                         maxIndent = countSubdivision;
368                     }
369                     value = convertToCldr(value);
370                     if (countSubdivision == 1) {
371                         lastNode = addNode(null, value);
372                     } else {
373                         lastNode = addNode(lastNode, value);
374                     }
375                     lastCode = value;
376                     int subdivisionElement = parts.findElement("subdivision");
377                     String id = parts.getAttributeValue(subdivisionElement, "category-id");
378                     addIdSample(id, value);
379                     //<subdivision category-id="262">//<subdivision-code footnote="*">AD-06</subdivision-code>
380                     // <subdivision category-id="262">
381                     //output.println(++count + Utility.repeat("\t", indent) + "code=" + value);
382                 }
383             }
384         }
385 
addIdSample(String id, String value)386         public void addIdSample(String id, String value) {
387             SUB_TO_CAT.put(value, id);
388             ID_SAMPLE.put(getIsoName(id), value);
389         }
390 
addNode(SubdivisionNode lastSubdivision, String subdivision)391         final SubdivisionNode addNode(SubdivisionNode lastSubdivision, String subdivision) {
392             // "NZ-S", x
393             String region = SubdivisionNames.getRegionFromSubdivision(subdivision);
394             REGION_CONTAINS.put(region, subdivision);
395             if (lastSubdivision == null) {
396                 lastSubdivision = BASE.children.get(region);
397                 if (lastSubdivision == null) {
398                     lastSubdivision = new SubdivisionNode(region, BASE, this).addName("en", ENGLISH_ICU.regionDisplayName(region));
399                     BASE.children.put(region, lastSubdivision);
400                 }
401                 return add(lastSubdivision, subdivision);
402             }
403             add(lastSubdivision, subdivision);
404             return lastSubdivision;
405         }
406 
add(SubdivisionNode subdivisionNode1, String subdivision2)407         private SubdivisionNode add(SubdivisionNode subdivisionNode1, String subdivision2) {
408             SubdivisionNode subdivisionNode2 = subdivisionNode1.children.get(subdivision2);
409             if (subdivisionNode2 == null) {
410                 subdivisionNode2 = new SubdivisionNode(subdivision2, subdivisionNode1, this);
411             }
412             subdivisionNode1.children.put(subdivision2, subdivisionNode2);
413             return subdivisionNode2;
414         }
415 
getName(SubdivisionNode base2)416         private String getName(SubdivisionNode base2) {
417             return getIsoName(base2.code);
418         }
419 
getIsoName(String code)420         private String getIsoName(String code) {
421             if (code == null) {
422                 return null;
423             }
424             Map<String, String> map = NAMES.get(code);
425             if (map == null) {
426                 return "???";
427             }
428             String name = map.get("en");
429             if (name != null) {
430                 return name;
431             }
432             name = map.get("es");
433             if (name != null) {
434                 return name;
435             }
436             name = map.get("fr");
437             if (name != null) {
438                 return name;
439             }
440             if (name == null) {
441                 name = map.entrySet().iterator().next().getValue();
442             }
443             return name;
444         }
print(PrintWriter out)445         public void print(PrintWriter out) {
446             print(out, 0, "", BASE);
447             for (Entry<String, String> entry : TO_COUNTRY_CODE.entrySet()) {
448                 out.println(entry.getKey() + "\t" + entry.getValue());
449             }
450         }
print(PrintWriter out, int indent, String prefix, SubdivisionNode base2)451         private void print(PrintWriter out, int indent, String prefix, SubdivisionNode base2) {
452             if (!prefix.isEmpty()) {
453                 prefix += "\t";
454             }
455             prefix += base2.code;
456             final String indentString = Utility.repeat("\t", 4-indent);
457             out.println(prefix + indentString + getName(base2));
458             if (base2.children.isEmpty()) {
459                 return;
460             }
461             for (SubdivisionNode child : base2.children.values()) {
462                 print(out, indent + 1, prefix, child);
463             }
464         }
465     }
466 
467     static class SubDivisionExtractor {
468         final SubdivisionSet sdset;
469         final Validity validityFormer;
470         final Map<String, R2<List<String>, String>> subdivisionAliasesFormer;
471         final Relation<String, String> formerRegionToSubdivisions;
472 
SubDivisionExtractor(SubdivisionSet sdset, Validity validityFormer, Map<String, R2<List<String>, String>> subdivisionAliasesFormer, Relation<String, String> formerRegionToSubdivisions)473         public SubDivisionExtractor(SubdivisionSet sdset,
474             Validity validityFormer,
475             Map<String, R2<List<String>, String>> subdivisionAliasesFormer,
476             Relation<String, String> formerRegionToSubdivisions) {
477             this.sdset = sdset;
478             this.validityFormer = validityFormer;
479             this.subdivisionAliasesFormer = subdivisionAliasesFormer;
480             this.formerRegionToSubdivisions = formerRegionToSubdivisions;
481         }
482 
printXml(Appendable output)483         void printXml(Appendable output) throws IOException {
484 
485             /*
486             <subdivisionContainment>
487             <group type="NZ" category="island" contains="NZ-N NZ-S"/> <!-- New Zealand -->
488             <group type="NZ" category="special island authority" contains="NZ-CIT"/> <!-- New Zealand -->
489             <group type="NZ-N" contains="NZ-AUK NZ-BOP NZ-GIS NZ-HKB NZ-MWT NZ-NTL NZ-AUK NZ-TKI NZ-WGN NZ-WKO"/> <!-- North Island -->
490             <group type="NZ-S" contains="NZ-CAN NZ-MBH NZ-STL NZ-NSN NZ-OTA NZ-TAS NZ-WTC"/> <!-- South Island -->
491             </subdivisionContainment>
492              */
493             output.append(
494                 DtdType.supplementalData.header(MethodHandles.lookup().lookupClass())
495                 + "\t<version number=\"$Revision" + "$\"/>\n"
496                 + "\t<subdivisionContainment>\n");
497             printXml(output, sdset.BASE, 0);
498             output.append("\t</subdivisionContainment>\n</supplementalData>\n");
499         }
500 
printAliases(Appendable output)501         void printAliases(Appendable output) throws IOException {
502             addAliases(output, sdset.TO_COUNTRY_CODE.keySet());
503 
504             // Get the old validity data
505             Map<Status, Set<String>> oldSubdivisionData = validityFormer.getStatusToCodes(LstrType.subdivision);
506             Set<String> missing = new TreeSet<>(COMPARATOR_ROOT);
507             missing.addAll(sdset.TO_COUNTRY_CODE.keySet());
508             Set<String> nowValid = sdset.ID_TO_NODE.keySet();
509             for (Entry<Status, Set<String>> e : oldSubdivisionData.entrySet()) {
510                 Status v = e.getKey();
511                 if (v == Status.unknown) {
512                     continue;
513                 }
514                 Set<String> set = e.getValue();
515                 for (String sdcodeRaw : set) {
516                     String sdcode = sdcodeRaw; // .toUpperCase(Locale.ROOT);
517 //                  sdcode = sdcode.substring(0,2) + "-" + sdcode.substring(2);
518                     if (!nowValid.contains(sdcode)) {
519                         missing.add(sdcode);
520                     }
521                 }
522             }
523             missing.removeAll(sdset.TO_COUNTRY_CODE.keySet());
524             addAliases(output, missing);
525         }
526 
addAliases(Appendable output, Set<String> missing)527         private void addAliases(Appendable output, Set<String> missing) throws IOException {
528             for (String toReplace : missing) {
529                 List<String> replaceBy = null;
530                 String reason = "deprecated";
531                 R2<List<String>, String> aliasInfo = subdivisionAliasesFormer.get(toReplace);
532                 if (aliasInfo != null) {
533                     replaceBy = aliasInfo.get0();
534                     reason = aliasInfo.get1();
535                     System.out.println("Adding former alias: " + toReplace + " => " + replaceBy);
536                 } else {
537                     String replacement = sdset.TO_COUNTRY_CODE.get(toReplace);
538                     if (replacement != null) {
539                         replaceBy = Collections.singletonList(replacement);
540                         reason = "overlong";
541                         System.out.println("Adding country code alias: " + toReplace + " => " + replaceBy);
542                     }
543                 }
544                 addAlias(output, toReplace, replaceBy, reason);
545             }
546         }
547 
addAlias(Appendable output, final String toReplace, final List<String> replaceBy, final String reason)548         private void addAlias(Appendable output, final String toReplace, final List<String> replaceBy, final String reason) throws IOException {
549             // <languageAlias type="art_lojban" replacement="jbo" reason="deprecated"/> <!-- Lojban -->
550             output.append("\t\t\t");
551             if (replaceBy == null) {
552                 output.append("<!-- ");
553             }
554             output.append("<subdivisionAlias"
555                 + " type=\"" + toReplace + "\""
556                 + " replacement=\"" + (replaceBy == null ? toReplace.substring(0, 2) + "?" :
557                 Joiner.on(" ").join(replaceBy)) + "\""
558                 + " reason=\"" + reason + "\"/>"
559                 + (replaceBy == null ? " <!- - " : " <!-- ")
560                 + sdset.getBestName(toReplace, true) + " => " + (replaceBy == null ? "??" : getBestName(replaceBy, true)) + " -->"
561                 + "\n");
562         }
563 
getBestName(List<String> replaceBy, boolean useIso)564         private String getBestName(List<String> replaceBy, boolean useIso) {
565             StringBuilder result = new StringBuilder();
566             for (String s : replaceBy) {
567                 if (result.length() != 0) {
568                     result.append(", ");
569                 }
570                 if (SubdivisionNames.isRegionCode(s)) {
571                     result.append(ENGLISH_CLDR.getName(CLDRFile.TERRITORY_NAME, s));
572                 } else {
573                     result.append(sdset.getBestName(s, useIso));
574                 }
575             }
576             return result.toString();
577         }
578 
printXml(Appendable output, SubdivisionNode base2, int indent)579         private void printXml(Appendable output, SubdivisionNode base2, int indent) throws IOException {
580             if (base2.children.isEmpty()) {
581                 return;
582             }
583             String type = base2.code;
584             if (base2 != sdset.BASE) {
585                 type = convertToCldr(type);
586                 output.append("\t\t" + "<subgroup"
587                     + " type=\"" + type + "\""
588                     + " contains=\"");
589                 boolean first = true;
590                 for (String child : base2.children.keySet()) {
591                     if (first) {
592                         first = false;
593                     } else {
594                         output.append(' ');
595                     }
596                     String subregion = convertToCldr(child);
597                     output.append(subregion);
598                 }
599                 output.append("\"/>\n");
600             }
601             for (SubdivisionNode child : base2.children.values()) {
602                 printXml(output, child, indent);
603             }
604         }
605 
printSamples(Appendable pw)606         public void printSamples(Appendable pw) throws IOException {
607             Set<String> seen = new HashSet<>();
608             for (Entry<String, Set<String>> entry : sdset.ID_SAMPLE.keyValuesSet()) {
609                 pw.append(entry.getKey());
610                 //int max = 10;
611                 seen.clear();
612                 for (String sample : entry.getValue()) {
613                     String region = sample.substring(0, 2);
614                     if (seen.contains(region)) {
615                         continue;
616                     }
617                     seen.add(region);
618                     pw.append(";\t" + ENGLISH_ICU.regionDisplayName(region) + ": " + sdset.getIsoName(sample)
619                     + " (" + sample + ")");
620                     //if (--max < 0) break;
621                 }
622                 pw.append(System.lineSeparator());
623             }
624         }
625 
printEnglishComp(Appendable output)626         public void printEnglishComp(Appendable output) throws IOException {
627             Set<String> countEqual = new TreeSet<>();
628             String lastCC = null;
629             output.append("Country\tMID\tSubdivision\tCLDR\tISO\tWikidata\tEqual\n");
630             for (Entry<String, Set<String>> entry : sdset.REGION_CONTAINS.keyValuesSet()) {
631                 final String countryCode = entry.getKey();
632                 if (!countryCode.equals(lastCC)) {
633                     if (lastCC != null && countEqual.size() != 0) {
634                         output.append(ENGLISH_ICU.regionDisplayName(lastCC) + "\t\t\tEquals:\t" + countEqual.size() + "\t" + countEqual + "\n");
635                     }
636                     countEqual.clear();
637 
638                     lastCC = countryCode;
639                 }
640                 for (String value : entry.getValue()) {
641                     String cldrName = sdset.getBestName(value, false);
642                     String wiki = WikiSubdivisionLanguages.getBestWikiEnglishName(value);
643                     final String iso = sdset.getIsoName(value);
644                     if (iso.equals(wiki)) {
645                         countEqual.add(iso);
646                         continue;
647                     }
648                     output.append(
649                         ENGLISH_ICU.regionDisplayName(countryCode)
650 //                        + "\t" + WikiSubdivisionLanguages.WIKIDATA_TO_MID.get(value)
651                         + "\t" + cldrName
652                         + "\t" + value
653                         + "\t" + iso
654                         + "\t" + wiki
655                         + "\n");
656                 }
657             }
658             if (countEqual.size() != 0) {
659                 output.append(ENGLISH_ICU.regionDisplayName(lastCC) + "\t\t\tEquals:\t" + countEqual.size() + "\t" + countEqual + "\n");
660             }
661         }
662 
printEnglishCompFull(Appendable output)663         public void printEnglishCompFull(Appendable output) throws IOException {
664             output.append("Country\tMID\tSubdivision\tCLDR\tISO\tWikidata\n");
665             for (Entry<String, Set<String>> entry : sdset.REGION_CONTAINS.keyValuesSet()) {
666                 final String countryCode = entry.getKey();
667                 for (String value : entry.getValue()) {
668                     String cldrName = sdset.getBestName(value, false);
669                     //getBestName(value);
670                     String wiki = WikiSubdivisionLanguages.getBestWikiEnglishName(value);
671                     final String iso = sdset.getIsoName(value);
672                     output.append(
673                         ENGLISH_ICU.regionDisplayName(countryCode)
674 //                        + "\t" + WikiSubdivisionLanguages.WIKIDATA_TO_MID.get(value)
675                         + "\t" + value
676                         + "\t" + cldrName
677                         + "\t" + iso
678                         + "\t" + wiki
679                         + "\n");
680                 }
681             }
682         }
683 
printEnglish(PrintWriter output)684         public void printEnglish(PrintWriter output) throws IOException {
685             TreeSet<String> allRegions = new TreeSet<>();
686             allRegions.addAll(codeToData.keySet());
687             allRegions.addAll(formerRegionToSubdivisions.keySet()); // override
688 
689             Factory cldrFactorySubdivisions = Factory.make(CLDRPaths.SUBDIVISIONS_DIRECTORY, ".*");
690             CLDRFile oldFileSubdivisions = cldrFactorySubdivisions.make("en", false);
691             CLDRFile fileSubdivisions = oldFileSubdivisions.cloneAsThawed();
692 
693             Set<String> skipped = new LinkedHashSet<>();
694 
695             for (String regionCode : allRegions) {
696                 if (!sdset.isKosher(regionCode)) {
697                     if (regionCode.length() != 3) {
698                         skipped.add(regionCode);
699                     }
700                     continue;
701                 }
702                 Set<String> remainder = formerRegionToSubdivisions.get(regionCode);
703                 remainder = remainder == null ? Collections.emptySet() : new LinkedHashSet<>(remainder);
704 
705                 SubdivisionNode regionNode = sdset.ID_TO_NODE.get(regionCode);
706                 if (regionNode == null) {
707                     continue;
708                 }
709 
710                 Set<SubdivisionNode> ordered = new LinkedHashSet<>();
711                 SubdivisionSet.addChildren(ordered, regionNode.children);
712 
713                 for (SubdivisionNode node : ordered) {
714                     final String sdCode = node.code;
715                     String name = sdset.getBestName(sdCode, true);
716                     String upper = UCharacter.toUpperCase(name);
717                     String title = SubdivisionNode.TO_TITLE_WHOLE_STRING_NO_LOWERCASE.apply(Locale.ROOT, null, name);
718                     if (name.equals(upper) || !name.equals(title)) {
719                         System.out.println("Suspicious name: " + name);
720                     }
721                     SubdivisionSet.appendName(fileSubdivisions, sdCode, name, null);
722                     remainder.remove(sdCode);
723                 }
724                 for (String sdCode : remainder) {
725                     String name = sdset.getBestName(sdCode, true);
726                     if (!name.equals("???")) {
727                         SubdivisionSet.appendName(fileSubdivisions, sdCode, name, "\t<!-- deprecated -->");
728                     }
729                 }
730             }
731             System.out.println("Skipping: " + skipped);
732             fileSubdivisions.write(output);
733         }
734 
printMissingMIDs(PrintWriter pw)735         public void printMissingMIDs(PrintWriter pw) {
736 //          for (Entry<String, String> entry : WikiSubdivisionLanguages.WIKIDATA_TO_MID.entrySet()) {
737 //              String mid = entry.getValue();
738 //              if (!mid.isEmpty()) {
739 //                  continue;
740 //              }
741 //              String subCode = entry.getKey();
742 //              String wiki = clean(getWikiName(subCode));
743 //              String iso = clean(getIsoName(subCode));
744 //              String countryCode = subCode.substring(0, 2);
745 //              String cat = SUB_TO_CAT.get(subCode);
746 //              String catName = getIsoName(cat);
747 //              pw.append(
748 //                  ENGLISH_ICU.regionDisplayName(countryCode)
749 //                  + "\t" + mid
750 //                  + "\t" + subCode
751 //                  + "\t" + catName
752 //                  + "\t" + wiki
753 //                  + "\t" + iso
754 //                  + "\n"
755 //                  );
756 //          }
757         }
758     }
759 }