• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.util;
2 
3 import java.io.File;
4 import java.io.IOException;
5 import java.io.PrintWriter;
6 import java.util.Arrays;
7 import java.util.Collection;
8 import java.util.HashMap;
9 import java.util.HashSet;
10 import java.util.Iterator;
11 import java.util.List;
12 import java.util.Locale;
13 import java.util.Map;
14 import java.util.Map.Entry;
15 import java.util.Set;
16 import java.util.TreeMap;
17 import java.util.TreeSet;
18 
19 import org.unicode.cldr.draft.FileUtilities;
20 import org.unicode.cldr.test.DisplayAndInputProcessor;
21 import org.unicode.cldr.tool.SubdivisionNode;
22 import org.unicode.cldr.util.CLDRFile.NumberingSystem;
23 import org.unicode.cldr.util.CLDRFile.WinningChoice;
24 import org.unicode.cldr.util.ChainedMap.M4;
25 import org.unicode.cldr.util.StandardCodes.LstrType;
26 import org.unicode.cldr.util.Validity.Status;
27 
28 import com.google.common.base.Splitter;
29 import com.google.common.collect.LinkedHashMultimap;
30 import com.google.common.collect.Multimap;
31 import com.google.common.collect.TreeMultimap;
32 import com.ibm.icu.impl.Row.R2;
33 import com.ibm.icu.impl.Row.R3;
34 import com.ibm.icu.impl.Row.R4;
35 import com.ibm.icu.impl.Utility;
36 import com.ibm.icu.lang.UProperty;
37 import com.ibm.icu.lang.UScript;
38 import com.ibm.icu.text.Normalizer2;
39 import com.ibm.icu.text.UTF16;
40 import com.ibm.icu.text.UnicodeSet;
41 import com.ibm.icu.util.ICUUncheckedIOException;
42 import com.ibm.icu.util.ULocale;
43 
44 public final class WikiSubdivisionLanguages {
45     static final SupplementalDataInfo SDI = SupplementalDataInfo.getInstance();
46     static final Set<String> regularSubdivisions = Validity.getInstance().getStatusToCodes(LstrType.subdivision).get(Status.regular);
47 
48     static final Map<String, R2<List<String>, String>> SUBDIVISION_ALIASES = SDI.getLocaleAliasInfo().get("subdivision");
49 
50     private static final boolean DEBUG_CONSOLE = false;
51     private static final String DEBUG_LANG_FILTER = null; // "az";
52 
53     private static final String BEFORE_TYPE = "//ldml/localeDisplayNames/subdivisions/subdivision[@type=\"";
54 
55     private static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance();
56     private static final Normalizer2 NFC = Normalizer2.getNFCInstance();
57 
58     enum Items {
59         // http://www.wikidata.org/entity/Q24260    كانيلو  AD-02   ar
60         wid, translation, subdivisionId, languageId
61     }
62 
63     private static ChainedMap.M3<String, String, String> SUB_LANG_NAME = ChainedMap.of(new TreeMap<String, Object>(), new TreeMap<String, Object>(),
64         String.class);
65     private static ChainedMap.M3<String, String, String> LANG_SUB_NAME = ChainedMap.of(new TreeMap<String, Object>(), new TreeMap<String, Object>(),
66         String.class);
67     private static Set<String> bogus = new TreeSet<>();
68     private static Multimap<Status, String> bogusStatus = TreeMultimap.create();
69 
getSubdivisionName(String subdivisionId, String languageId)70     public static String getSubdivisionName(String subdivisionId, String languageId) {
71         return WikiSubdivisionLanguages.LANG_SUB_NAME.get(languageId, subdivisionId);
72     }
73 
getBestWikiEnglishName(String subdivisionId)74     public static String getBestWikiEnglishName(String subdivisionId) {
75         String languageId = "en";
76         String name = WikiSubdivisionLanguages.getSubdivisionName(subdivisionId, languageId);
77         if (name != null) {
78             return name;
79         }
80         name = WikiSubdivisionLanguages.getSubdivisionName(subdivisionId, "es");
81         if (name != null) {
82             return name;
83         }
84         name = WikiSubdivisionLanguages.getSubdivisionName(subdivisionId, "fr");
85         if (name != null) {
86             return name;
87         }
88         Map<String, String> data = WikiSubdivisionLanguages.SUB_LANG_NAME.get(subdivisionId);
89         // try Spanish, then French, then first other
90         if (data != null) {
91             return data.entrySet().iterator().next().getValue(); // get first
92         }
93         return null;
94     }
95 
96     //static Map<String, String> WIKIDATA_TO_MID = new TreeMap<>();
97     static {
98         Splitter TAB = Splitter.on('\t').trimResults();
99         File file = new File("data/external", "wikiSubdivisionLanguages.tsv");
100         try {
file.getCanonicalFile()101             System.out.println(file.getCanonicalFile());
102         } catch (IOException e) {
103             e.printStackTrace();
104         }
105         Map<String, Status> codeToStatus = Validity.getInstance().getCodeToStatus(LstrType.subdivision);
106 
107         for (String line : FileUtilities.in(WikiSubdivisionLanguages.class, "data/external/wikiSubdivisionLanguages.tsv")) {
108 
109             List<String> data = TAB.splitToList(line);
110             String subdivision = SubdivisionNode.convertToCldr(data.get(Items.subdivisionId.ordinal()));
111             if (!regularSubdivisions.contains(subdivision)) {
112                 Status status = codeToStatus.get(subdivision);
113                 if (status == null) {
114                     bogus.add(subdivision);
115                 } else {
bogusStatus.put(status, subdivision)116                     bogusStatus.put(status, subdivision);
117                 }
118                 continue;
119             }
120             String lang = data.get(Items.languageId.ordinal());
121             if (DEBUG_LANG_FILTER != null && !DEBUG_LANG_FILTER.equals(lang)) {
122                 continue;
123             }
124             String name = NFC.normalize(data.get(Items.translation.ordinal()));
SUB_LANG_NAME.put(subdivision, lang, name)125             SUB_LANG_NAME.put(subdivision, lang, name);
126 //                WIKIDATA_TO_MID.put(subdivision, data.get(2));
LANG_SUB_NAME.put(lang, subdivision, name)127             LANG_SUB_NAME.put(lang, subdivision, name);
128         }
129         // postprocess
130         String oldLang = null;
131         DisplayAndInputProcessor daip = null;
132         Exception[] internalException = { null };
133 
134         for (R3<String, String, String> row : LANG_SUB_NAME.rows()) {
135             String lang = row.get0();
136             String subdivision = row.get1();
137             String name = row.get2();
138             if (!lang.equals(oldLang)) {
139                 oldLang = lang;
140                 daip = new DisplayAndInputProcessor(new ULocale(lang));
141             }
142             String path = getSubdivisionPath(subdivision);
143             String name2 = daip.processInput(
144                 path,
145                 name.replace("\u00AD", ""),
146                 internalException);
147             if (name2.contains("'")) {
148                 int debug = 0;
149             }
150             // TODO remove soft hyphen in DAIP
151             if (internalException[0] != null) {
152                 throw new IllegalArgumentException(lang + "\t" + subdivision + "\t" + name, internalException[0]);
153             } else if (!name.equals(name2)) {
154                 //System.out.println(lang + "\t" + subdivision + "\t" + name + "\t" + name2);
SUB_LANG_NAME.put(subdivision, lang, name2)155                 SUB_LANG_NAME.put(subdivision, lang, name2);
LANG_SUB_NAME.put(lang, subdivision, name2)156                 LANG_SUB_NAME.put(lang, subdivision, name2);
157             }
158         }
159 
160     }
161 
getSubdivisionPath(String subdivision)162     private static String getSubdivisionPath(String subdivision) {
163         return BEFORE_TYPE + subdivision + "\"][@draft=\"contributed\"]";
164     }
165 
getSubdivisionFromPath(String path)166     private static String getSubdivisionFromPath(String path) {
167         return path.substring(BEFORE_TYPE.length(), path.indexOf('"', BEFORE_TYPE.length()));
168     }
169 
main(String[] args)170     public static void main(String[] args) {
171         Counter<String> counter = new Counter<>();
172         Factory cldrFactory = CLDR_CONFIG.getCldrFactory();
173         Factory cldrFactorySubdivisions = Factory.make(CLDRPaths.SUBDIVISIONS_DIRECTORY, ".*");
174         CLDRFile file = null;
175         UnicodeSet exemplars = null;
176 
177         ChainedMap.M4<Integer, String, String, String> exemplarFailureLangSubdivisionName = ChainedMap.of(
178             new TreeMap<Integer, Object>(),
179             new TreeMap<String, Object>(),
180             new TreeMap<String, Object>(),
181             String.class);
182 
183         for (Entry<String, Map<String, String>> entry : LANG_SUB_NAME) {
184             String lang = entry.getKey();
185             file = cldrFactory.make(lang, true);
186 
187             CLDRFile oldFileSubdivisions;
188             try {
189                 oldFileSubdivisions = cldrFactorySubdivisions.make(lang, false);
190             } catch (Exception e) {
191                 oldFileSubdivisions = new CLDRFile(new SimpleXMLSource(lang)).freeze();
192             }
193 
194             Multimap<String, String> inverse = LinkedHashMultimap.create();
195             CLDRFile fileSubdivisions = fixedFile(oldFileSubdivisions, inverse);
196 
197             UnicodeSet main = file.getExemplarSet("", WinningChoice.WINNING, 0);
198             UnicodeSet auxiliary = file.getExemplarSet("auxiliary", WinningChoice.WINNING);
199             UnicodeSet punctuation = file.getExemplarSet("punctuation", WinningChoice.WINNING);
200             UnicodeSet numbers = file.getExemplarsNumeric(NumberingSystem.defaultSystem);
201             exemplars = new UnicodeSet()
202                 .addAll(main)
203                 .addAll(auxiliary)
204                 .addAll(scriptsFor(main)) // broad test,...
205                 .addAll(punctuation)
206                 .addAll(numbers)
207                 .addAll(new UnicodeSet("[\\ ]")).freeze();
208 
209             for (Entry<String, String> entry2 : entry.getValue().entrySet()) {
210                 String subdivision = entry2.getKey();
211                 String name = entry2.getValue();
212                 if (name.equals("Böyük Britaniya")) {
213                     int debug = 0;
214                 }
215                 String path = getSubdivisionPath(subdivision);
216                 String oldName = fileSubdivisions.getStringValue(path);
217                 if (oldName != null) {
218                     if (!oldName.equals(name)) {
219                         //System.out.println("Already has translation\t" + lang + "\t" + subdivision + "\t" + name + "\t" + oldName);
220                     }
221                     continue;
222                 }
223                 if (!exemplars.containsAll(name)) {
224                     UnicodeSet exemplarFailures = new UnicodeSet().addAll(name).removeAll(exemplars);
225                     addExemplarFailures(exemplarFailureLangSubdivisionName, exemplarFailures, lang, subdivision, name);
226                     continue;
227                 }
228                 fileSubdivisions.add(path, name);
229                 inverse.put(name, path);
230                 counter.add(lang, 1);
231             }
232 
233             // We now fix collisions
234             for (Entry<String, Collection<String>> entry3 : inverse.asMap().entrySet()) {
235                 String name = entry3.getKey();
236                 if (name.isEmpty()) {
237                     continue;
238                 }
239                 if (name.equals("Böyük Britaniya")) {
240                     int debug = 0;
241                 }
242                 Collection<String> paths = entry3.getValue();
243                 if (paths.size() <= 1) {
244                     continue;
245                 }
246                 if (paths.size() > 3) {
247                     int debug = 0;
248                 }
249                 // we only care about collisions *within* a region.
250                 // so group them together
251                 Multimap<String, String> regionToPaths = LinkedHashMultimap.create();
252                 for (String path : paths) {
253                     String sdId = getSubdivisionFromPath(path);
254                     String region = sdId.substring(0, 2).toUpperCase(Locale.ROOT);
255                     regionToPaths.put(region, path);
256                 }
257 
258                 // Now fix as necessary
259                 for (Entry<String, Collection<String>> regionAndPaths : regionToPaths.asMap().entrySet()) {
260                     Collection<String> paths2 = regionAndPaths.getValue();
261                     int markerIndex = 0;
262                     if (paths2.size() <= 1) {
263                         continue;
264                     }
265 
266                     // find if any of the paths are deprecated
267                     for (Iterator<String> it = paths2.iterator(); it.hasNext();) {
268                         String path = it.next();
269                         String sdId = getSubdivisionFromPath(path);
270                         if (!regularSubdivisions.contains(sdId)) { // deprecated
271                             fileSubdivisions.remove(path);
272                             it.remove();
273                             fail("Duplicate, not regular ", lang, getSubdivisionFromPath(path), "REMOVING", -1);
274                         }
275                     }
276                     if (paths2.size() <= 1) {
277                         continue;
278                     }
279 
280                     String otherId = null;
281                     for (String path : paths2) {
282 //                    if (nuke) {
283 //                        if (oldFileSubdivisions.getStringValue(path) == null) {
284 //                            fileSubdivisions.remove(path); // get rid of new ones
285 //                            System.out.println("Removing colliding " + lang + "\t" + path + "\t" + name);
286 //                        }
287                         if (markerIndex == 0) {
288                             otherId = getSubdivisionFromPath(path);
289                         } else {
290                             String fixedName = name + MARKERS.get(markerIndex);
291                             fail("Superscripting ", lang + "\t(" + otherId +")", getSubdivisionFromPath(path), fixedName, -1);
292                             //System.out.println("Superscripting colliding:\t" + lang + "\t" + path + "\t" + fixedName);
293                             fileSubdivisions.add(path, fixedName); // overwrite with superscripted
294                         }
295                         ++markerIndex;
296                     }
297                 }
298             }
299 
300             if (DEBUG_CONSOLE) {
301                 PrintWriter pw = new PrintWriter(System.out);
302                 fileSubdivisions.write(new PrintWriter(System.out));
303                 pw.flush();
304             } else {
305                 try (PrintWriter out = FileUtilities.openUTF8Writer(CLDRPaths.SUBDIVISIONS_DIRECTORY, lang + ".xml")) {
306                     fileSubdivisions.write(out);
307                 } catch (Exception e) {
308                     throw new ICUUncheckedIOException(e);
309                 }
310             }
311         }
312         fail("ExemplarFailures", exemplarFailureLangSubdivisionName);
313 
314         for (String lang : counter.getKeysetSortedByKey()) {
315             fail("Superscripting", lang, String.valueOf(counter.get(lang)), null, -1);
316         }
317         System.out.println("Bogus subdivisionIds:\t" + "*" + "\t" + bogus.size() + "\t" + bogus);
318         for (Entry<Status, Collection<String>> entry : bogusStatus.asMap().entrySet()) {
319             System.out.println("SubdivisionId:\t\t"
320                 + ":\t" + entry.getKey() + "\t" + entry.getValue().size() + "\t" + entry.getValue());
321         }
322     }
323 
fixedFile(CLDRFile oldFileSubdivisions, Multimap<String, String> inverse)324     private static CLDRFile fixedFile(CLDRFile oldFileSubdivisions, Multimap<String, String> inverse) {
325         CLDRFile fileSubdivisions = oldFileSubdivisions.cloneAsThawed();
326 
327         // for fixing collisions
328         // we first add existing items
329         Set<String> toRemove = new HashSet<>();
330         Map<String,String> toAdd = new HashMap<>();
331 
332         for (String path : fileSubdivisions) {
333             XPathParts parts = XPathParts.getFrozenInstance(path);
334             if (!"subdivision".equals(parts.getElement(-1))) {
335                 continue;
336             }
337             String name = fileSubdivisions.getStringValue(path);
338             if (name.equals("Böyük Britaniya")) {
339                 int debug = 0;
340             }
341             // handle aliases also
342             String type = parts.getAttributeValue(-1, "type");
343             R2<List<String>, String> replacement = SUBDIVISION_ALIASES.get(type);
344             if (replacement != null) {
345                 String fullPath = oldFileSubdivisions.getFullXPath(path);
346                 XPathParts parts2 = XPathParts.getInstance(fullPath);
347                 for (String replacementType : replacement.get0()) {
348                     parts2.setAttribute(-1, "type", replacementType);
349                     toRemove.add(path);
350                     path = parts2.toString();
351                     toAdd.put(path, name);
352                     System.out.println("Adding alias: " + replacementType + "«" + name + "»");
353                     break;
354                 }
355             }
356             inverse.put(name, path);
357         }
358         fileSubdivisions.removeAll(toRemove, false);
359         for (Entry<String, String> entry2 : toAdd.entrySet()) {
360             fileSubdivisions.add(entry2.getKey(), entry2.getValue());
361         }
362         return fileSubdivisions;
363     }
364 
addExemplarFailures(M4<Integer, String, String, String> exemplarFailureLangSubdivisionName, UnicodeSet exemplarFailures, String language, String subdivision, String name)365     private static void addExemplarFailures(M4<Integer, String, String, String> exemplarFailureLangSubdivisionName, UnicodeSet exemplarFailures,
366         String language, String subdivision, String name) {
367         for (String s : exemplarFailures) {
368             exemplarFailureLangSubdivisionName.put(s.codePointAt(0), language, subdivision, name);
369         }
370     }
371 
fail(String title, M4<Integer, String, String, String> exemplarFailureLangSubdivisionName)372     private static void fail(String title, M4<Integer, String, String, String> exemplarFailureLangSubdivisionName) {
373         for (R4<Integer, String, String, String> entry : exemplarFailureLangSubdivisionName.rows()) {
374             fail(title, entry.get1(), entry.get2(), entry.get3(), entry.get0());
375         }
376     }
377 
fail(String title, String lang, String subdivision, String name, int exemplarFailure)378     private static void fail(String title, String lang, String subdivision, String name, int exemplarFailure) {
379         System.out.println(title
380             + ":\t" + lang
381             + "\t" + subdivision
382             + "\t" + (exemplarFailure < 0 ? "" : "«" + UTF16.valueOf(exemplarFailure) + "»")
383             + "\t" + (exemplarFailure < 0 ? "" : "U+" + Utility.hex(exemplarFailure))
384             + "\t" + CldrUtility.ifNull(getBestWikiEnglishName(subdivision), "")
385             + "\t" + CldrUtility.ifNull(name, "").replace("\"", "&quot;"));
386     }
387 
388     static final List<String> MARKERS = Arrays.asList("¹", "²", "³"); // if there are more than 3 of the same kind, throw exception
389 
scriptsFor(UnicodeSet main)390     private static UnicodeSet scriptsFor(UnicodeSet main) {
391         UnicodeSet result = UnicodeSet.EMPTY;
392         for (String s : main) {
393             int scriptCode = UScript.getScript(s.codePointAt(0));
394             if (scriptCode != UScript.COMMON || scriptCode != UScript.INHERITED) {
395                 result = new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, scriptCode);
396                 if (scriptCode == UScript.LATIN) {
397                     result.addAll("ʻ’&");
398                 }
399                 break;
400             }
401         }
402         return result;
403     }
404 }