• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.tool;
2 
3 import java.io.File;
4 import java.io.IOException;
5 import java.io.PrintWriter;
6 import java.util.Arrays;
7 import java.util.Collection;
8 import java.util.HashMap;
9 import java.util.HashSet;
10 import java.util.Iterator;
11 import java.util.List;
12 import java.util.Locale;
13 import java.util.Map;
14 import java.util.Map.Entry;
15 import java.util.Set;
16 import java.util.TreeMap;
17 import java.util.TreeSet;
18 
19 import org.apache.jena.query.QuerySolution;
20 import org.apache.jena.query.ResultSet;
21 import org.unicode.cldr.draft.FileUtilities;
22 import org.unicode.cldr.rdf.QueryClient;
23 import org.unicode.cldr.rdf.TsvWriter;
24 import org.unicode.cldr.test.DisplayAndInputProcessor;
25 import org.unicode.cldr.tool.SubdivisionNode;
26 import org.unicode.cldr.util.CLDRConfig;
27 import org.unicode.cldr.util.CLDRFile;
28 import org.unicode.cldr.util.CLDRPaths;
29 import org.unicode.cldr.util.ChainedMap;
30 import org.unicode.cldr.util.CldrUtility;
31 import org.unicode.cldr.util.Counter;
32 import org.unicode.cldr.util.Factory;
33 import org.unicode.cldr.util.SimpleXMLSource;
34 import org.unicode.cldr.util.SupplementalDataInfo;
35 import org.unicode.cldr.util.Validity;
36 import org.unicode.cldr.util.XPathParts;
37 import org.unicode.cldr.util.CLDRFile.NumberingSystem;
38 import org.unicode.cldr.util.CLDRFile.WinningChoice;
39 import org.unicode.cldr.util.ChainedMap.M3;
40 import org.unicode.cldr.util.ChainedMap.M4;
41 import org.unicode.cldr.util.StandardCodes.LstrType;
42 import org.unicode.cldr.util.Validity.Status;
43 
44 import com.google.common.base.Splitter;
45 import com.google.common.collect.LinkedHashMultimap;
46 import com.google.common.collect.Multimap;
47 import com.google.common.collect.TreeMultimap;
48 import com.ibm.icu.impl.Row.R2;
49 import com.ibm.icu.impl.Row.R3;
50 import com.ibm.icu.impl.Row.R4;
51 import com.ibm.icu.impl.Utility;
52 import com.ibm.icu.lang.UProperty;
53 import com.ibm.icu.lang.UScript;
54 import com.ibm.icu.text.Normalizer2;
55 import com.ibm.icu.text.UTF16;
56 import com.ibm.icu.text.UnicodeSet;
57 import com.ibm.icu.util.ICUUncheckedIOException;
58 import com.ibm.icu.util.ULocale;
59 
60 public final class WikiSubdivisionLanguages {
61     private static final String WIKI_SUBDIVISION_LANGUAGES_TSV = "wikiSubdivisionLanguages.tsv";
62     static final SupplementalDataInfo SDI = SupplementalDataInfo.getInstance();
63     static final Set<String> regularSubdivisions = Validity.getInstance().getStatusToCodes(LstrType.subdivision).get(Status.regular);
64 
65     static final Map<String, R2<List<String>, String>> SUBDIVISION_ALIASES = SDI.getLocaleAliasInfo().get("subdivision");
66 
67     private static final boolean DEBUG_CONSOLE = false;
68     private static final String DEBUG_LANG_FILTER = null; // "az";
69 
70     private static final String BEFORE_TYPE = "//ldml/localeDisplayNames/subdivisions/subdivision[@type=\"";
71 
72     private static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance();
73     private static final Normalizer2 NFC = Normalizer2.getNFCInstance();
74 
75     private static ChainedMap.M3<String, String, String> SUB_LANG_NAME = ChainedMap.of(new TreeMap<String, Object>(), new TreeMap<String, Object>(),
76         String.class);
77     private static ChainedMap.M3<String, String, String> LANG_SUB_NAME = ChainedMap.of(new TreeMap<String, Object>(), new TreeMap<String, Object>(),
78         String.class);
79     private static Set<String> bogus = new TreeSet<>();
80     private static Multimap<Status, String> bogusStatus = TreeMultimap.create();
81 
getSubdivisionName(String subdivisionId, String languageId)82     public static String getSubdivisionName(String subdivisionId, String languageId) {
83         return WikiSubdivisionLanguages.LANG_SUB_NAME.get(languageId, subdivisionId);
84     }
85 
getBestWikiEnglishName(String subdivisionId)86     public static String getBestWikiEnglishName(String subdivisionId) {
87         String languageId = "en";
88         String name = WikiSubdivisionLanguages.getSubdivisionName(subdivisionId, languageId);
89         if (name != null) {
90             return name;
91         }
92         name = WikiSubdivisionLanguages.getSubdivisionName(subdivisionId, "es");
93         if (name != null) {
94             return name;
95         }
96         name = WikiSubdivisionLanguages.getSubdivisionName(subdivisionId, "fr");
97         if (name != null) {
98             return name;
99         }
100         Map<String, String> data = WikiSubdivisionLanguages.SUB_LANG_NAME.get(subdivisionId);
101         // try Spanish, then French, then first other
102         if (data != null) {
103             return data.entrySet().iterator().next().getValue(); // get first
104         }
105         return null;
106     }
107 
108     private static final String QUERY_NAME = "wikidata-wikisubdivisionLanguages";
109 
110     //static Map<String, String> WIKIDATA_TO_MID = new TreeMap<>();
init()111     static void init() throws IOException {
112 
113         QueryClient queryClient = QueryClient.getInstance();
114 
115         System.out.println("QUERY: " + QUERY_NAME);
116         ResultSet rs = queryClient.execSelectFromSparql(QUERY_NAME, QueryClient.WIKIDATA_SPARQL_SERVER);
117 
118         Map<String, Status> codeToStatus = Validity.getInstance().getCodeToStatus(LstrType.subdivision);
119         try(PrintWriter tsv = FileUtilities.openUTF8Writer(TsvWriter.getTsvDir(), WIKI_SUBDIVISION_LANGUAGES_TSV)) {
120             TsvWriter.writeRow(tsv, "item", "label", "code", "codeLabel");
121             for (;rs.hasNext();) {
122                 final QuerySolution qs = rs.next();
123 
124                 String item = QueryClient.getResourceOrNull(qs, "item");
125                 String label = NFC.normalize(QueryClient.getStringOrNull(qs, "label"));
126                 String code = QueryClient.getStringOrNull(qs,  "code");
127                 String codeLabel = QueryClient.getStringOrNull(qs, "codeLabel");
128 
129                 TsvWriter.writeRow(tsv, item, label, code, codeLabel);
130 
131                 String subdivision = SubdivisionNode.convertToCldr(code);
132                 if (!regularSubdivisions.contains(subdivision)) {
133                     Status status = codeToStatus.get(subdivision);
134                     if (status == null) {
135                         bogus.add(subdivision);
136                     } else {
137                         bogusStatus.put(status, subdivision);
138                     }
139                     continue;
140                 }
141                 if (DEBUG_LANG_FILTER != null && !DEBUG_LANG_FILTER.equals(codeLabel)) {
142                     continue;
143                 }
144                 SUB_LANG_NAME.put(subdivision, codeLabel, label);
145     //                WIKIDATA_TO_MID.put(subdivision, data.get(2));
146                 LANG_SUB_NAME.put(codeLabel, subdivision, label);
147             }
148             System.out.println("Queried " + QUERY_NAME + " at row count " + rs.getRowNumber());
149         }
150         System.out.println("Wrote to " + WIKI_SUBDIVISION_LANGUAGES_TSV);
151         // postprocess
152         String oldLang = null;
153         DisplayAndInputProcessor daip = null;
154         Exception[] internalException = { null };
155 
156         for (R3<String, String, String> row : LANG_SUB_NAME.rows()) {
157             String lang = row.get0();
158             String subdivision = row.get1();
159             String name = row.get2();
160             if (!lang.equals(oldLang)) {
161                 oldLang = lang;
162                 daip = new DisplayAndInputProcessor(new ULocale(lang));
163             }
164             String path = getSubdivisionPath(subdivision);
165             String name2 = daip.processInput(
166                 path,
167                 name.replace("\u00AD", ""),
168                 internalException);
169             if (name2.contains("'")) {
170                 int debug = 0;
171             }
172             // TODO remove soft hyphen in DAIP
173             if (internalException[0] != null) {
174                 throw new IllegalArgumentException(lang + "\t" + subdivision + "\t" + name, internalException[0]);
175             } else if (!name.equals(name2)) {
176                 //System.out.println(lang + "\t" + subdivision + "\t" + name + "\t" + name2);
177                 SUB_LANG_NAME.put(subdivision, lang, name2);
178                 LANG_SUB_NAME.put(lang, subdivision, name2);
179             }
180         }
181 
182     }
183 
getSubdivisionPath(String subdivision)184     private static String getSubdivisionPath(String subdivision) {
185         return BEFORE_TYPE + subdivision + "\"][@draft=\"contributed\"]";
186     }
187 
getSubdivisionFromPath(String path)188     private static String getSubdivisionFromPath(String path) {
189         return path.substring(BEFORE_TYPE.length(), path.indexOf('"', BEFORE_TYPE.length()));
190     }
191 
main(String[] args)192     public static void main(String[] args) throws IOException {
193         init();
194 
195         Counter<String> counter = new Counter<>();
196         Factory cldrFactory = CLDR_CONFIG.getCldrFactory();
197         Factory cldrFactorySubdivisions = Factory.make(CLDRPaths.SUBDIVISIONS_DIRECTORY, ".*");
198         CLDRFile file = null;
199         UnicodeSet exemplars = null;
200 
201         ChainedMap.M4<Integer, String, String, String> exemplarFailureLangSubdivisionName = ChainedMap.of(
202             new TreeMap<Integer, Object>(),
203             new TreeMap<String, Object>(),
204             new TreeMap<String, Object>(),
205             String.class);
206 
207         for (Entry<String, Map<String, String>> entry : LANG_SUB_NAME) {
208             String lang = entry.getKey();
209             file = cldrFactory.make(lang, true);
210 
211             CLDRFile oldFileSubdivisions;
212             try {
213                 oldFileSubdivisions = cldrFactorySubdivisions.make(lang, false);
214             } catch (Exception e) {
215                 oldFileSubdivisions = new CLDRFile(new SimpleXMLSource(lang)).freeze();
216             }
217 
218             Multimap<String, String> inverse = LinkedHashMultimap.create();
219             CLDRFile fileSubdivisions = fixedFile(oldFileSubdivisions, inverse);
220 
221             UnicodeSet main = file.getExemplarSet("", WinningChoice.WINNING, 0);
222             UnicodeSet auxiliary = file.getExemplarSet("auxiliary", WinningChoice.WINNING);
223             UnicodeSet punctuation = file.getExemplarSet("punctuation", WinningChoice.WINNING);
224             UnicodeSet numbers = file.getExemplarsNumeric(NumberingSystem.defaultSystem);
225             exemplars = new UnicodeSet()
226                 .addAll(main)
227                 .addAll(auxiliary)
228                 .addAll(scriptsFor(main)) // broad test,...
229                 .addAll(punctuation)
230                 .addAll(numbers)
231                 .addAll(new UnicodeSet("[\\ ]")).freeze();
232 
233             for (Entry<String, String> entry2 : entry.getValue().entrySet()) {
234                 String subdivision = entry2.getKey();
235                 String name = entry2.getValue();
236                 if (name.equals("Böyük Britaniya")) {
237                     int debug = 0;
238                 }
239                 String path = getSubdivisionPath(subdivision);
240                 String oldName = fileSubdivisions.getStringValue(path);
241                 if (oldName != null) {
242                     if (!oldName.equals(name)) {
243                         //System.out.println("Already has translation\t" + lang + "\t" + subdivision + "\t" + name + "\t" + oldName);
244                     }
245                     continue;
246                 }
247                 if (!exemplars.containsAll(name)) {
248                     UnicodeSet exemplarFailures = new UnicodeSet().addAll(name).removeAll(exemplars);
249                     addExemplarFailures(exemplarFailureLangSubdivisionName, exemplarFailures, lang, subdivision, name);
250                     continue;
251                 }
252                 fileSubdivisions.add(path, name);
253                 inverse.put(name, path);
254                 counter.add(lang, 1);
255             }
256 
257             // We now fix collisions
258             for (Entry<String, Collection<String>> entry3 : inverse.asMap().entrySet()) {
259                 String name = entry3.getKey();
260                 if (name.isEmpty()) {
261                     continue;
262                 }
263                 if (name.equals("Böyük Britaniya")) {
264                     int debug = 0;
265                 }
266                 Collection<String> paths = entry3.getValue();
267                 if (paths.size() <= 1) {
268                     continue;
269                 }
270                 if (paths.size() > 3) {
271                     int debug = 0;
272                 }
273                 // we only care about collisions *within* a region.
274                 // so group them together
275                 Multimap<String, String> regionToPaths = LinkedHashMultimap.create();
276                 for (String path : paths) {
277                     String sdId = getSubdivisionFromPath(path);
278                     String region = sdId.substring(0, 2).toUpperCase(Locale.ROOT);
279                     regionToPaths.put(region, path);
280                 }
281 
282                 // Now fix as necessary
283                 for (Entry<String, Collection<String>> regionAndPaths : regionToPaths.asMap().entrySet()) {
284                     Collection<String> paths2 = regionAndPaths.getValue();
285                     int markerIndex = 0;
286                     if (paths2.size() <= 1) {
287                         continue;
288                     }
289 
290                     // find if any of the paths are deprecated
291                     for (Iterator<String> it = paths2.iterator(); it.hasNext();) {
292                         String path = it.next();
293                         String sdId = getSubdivisionFromPath(path);
294                         if (!regularSubdivisions.contains(sdId)) { // deprecated
295                             fileSubdivisions.remove(path);
296                             it.remove();
297                             fail("Duplicate, not regular ", lang, getSubdivisionFromPath(path), "REMOVING", -1);
298                         }
299                     }
300                     if (paths2.size() <= 1) {
301                         continue;
302                     }
303 
304                     String otherId = null;
305                     for (String path : paths2) {
306 //                    if (nuke) {
307 //                        if (oldFileSubdivisions.getStringValue(path) == null) {
308 //                            fileSubdivisions.remove(path); // get rid of new ones
309 //                            System.out.println("Removing colliding " + lang + "\t" + path + "\t" + name);
310 //                        }
311                         if (markerIndex == 0) {
312                             otherId = getSubdivisionFromPath(path);
313                         } else {
314                             String fixedName = name + MARKERS.get(markerIndex);
315                             fail("Superscripting ", lang + "\t(" + otherId +")", getSubdivisionFromPath(path), fixedName, -1);
316                             //System.out.println("Superscripting colliding:\t" + lang + "\t" + path + "\t" + fixedName);
317                             fileSubdivisions.add(path, fixedName); // overwrite with superscripted
318                         }
319                         ++markerIndex;
320                     }
321                 }
322             }
323 
324             if (DEBUG_CONSOLE) {
325                 PrintWriter pw = new PrintWriter(System.out);
326                 fileSubdivisions.write(new PrintWriter(System.out));
327                 pw.flush();
328             } else {
329                 try (PrintWriter out = FileUtilities.openUTF8Writer(CLDRPaths.SUBDIVISIONS_DIRECTORY, lang + ".xml")) {
330                     fileSubdivisions.write(out);
331                 } catch (Exception e) {
332                     throw new ICUUncheckedIOException(e);
333                 }
334             }
335         }
336         fail("ExemplarFailures", exemplarFailureLangSubdivisionName);
337 
338         for (String lang : counter.getKeysetSortedByKey()) {
339             fail("Superscripting", lang, String.valueOf(counter.get(lang)), null, -1);
340         }
341         System.out.println("Bogus subdivisionIds:\t" + "*" + "\t" + bogus.size() + "\t" + bogus);
342         for (Entry<Status, Collection<String>> entry : bogusStatus.asMap().entrySet()) {
343             System.out.println("SubdivisionId:\t\t"
344                 + ":\t" + entry.getKey() + "\t" + entry.getValue().size() + "\t" + entry.getValue());
345         }
346     }
347 
fixedFile(CLDRFile oldFileSubdivisions, Multimap<String, String> inverse)348     private static CLDRFile fixedFile(CLDRFile oldFileSubdivisions, Multimap<String, String> inverse) {
349         CLDRFile fileSubdivisions = oldFileSubdivisions.cloneAsThawed();
350 
351         // for fixing collisions
352         // we first add existing items
353         Set<String> toRemove = new HashSet<>();
354         Map<String,String> toAdd = new HashMap<>();
355 
356         for (String path : fileSubdivisions) {
357             XPathParts parts = XPathParts.getFrozenInstance(path);
358             if (!"subdivision".equals(parts.getElement(-1))) {
359                 continue;
360             }
361             String name = fileSubdivisions.getStringValue(path);
362             if (name.equals("Böyük Britaniya")) {
363                 int debug = 0;
364             }
365             // handle aliases also
366             String type = parts.getAttributeValue(-1, "type");
367             R2<List<String>, String> replacement = SUBDIVISION_ALIASES.get(type);
368             if (replacement != null) {
369                 String fullPath = oldFileSubdivisions.getFullXPath(path);
370                 XPathParts parts2 = XPathParts.getFrozenInstance(fullPath).cloneAsThawed();
371                 for (String replacementType : replacement.get0()) {
372                     parts2.setAttribute(-1, "type", replacementType);
373                     toRemove.add(path);
374                     path = parts2.toString();
375                     toAdd.put(path, name);
376                     System.out.println("Adding alias: " + replacementType + "«" + name + "»");
377                     break;
378                 }
379             }
380             inverse.put(name, path);
381         }
382         fileSubdivisions.removeAll(toRemove, false);
383         for (Entry<String, String> entry2 : toAdd.entrySet()) {
384             fileSubdivisions.add(entry2.getKey(), entry2.getValue());
385         }
386         return fileSubdivisions;
387     }
388 
addExemplarFailures(M4<Integer, String, String, String> exemplarFailureLangSubdivisionName, UnicodeSet exemplarFailures, String language, String subdivision, String name)389     private static void addExemplarFailures(M4<Integer, String, String, String> exemplarFailureLangSubdivisionName, UnicodeSet exemplarFailures,
390         String language, String subdivision, String name) {
391         for (String s : exemplarFailures) {
392             exemplarFailureLangSubdivisionName.put(s.codePointAt(0), language, subdivision, name);
393         }
394     }
395 
fail(String title, M4<Integer, String, String, String> exemplarFailureLangSubdivisionName)396     private static void fail(String title, M4<Integer, String, String, String> exemplarFailureLangSubdivisionName) {
397         for (R4<Integer, String, String, String> entry : exemplarFailureLangSubdivisionName.rows()) {
398             fail(title, entry.get1(), entry.get2(), entry.get3(), entry.get0());
399         }
400     }
401 
fail(String title, String lang, String subdivision, String name, int exemplarFailure)402     private static void fail(String title, String lang, String subdivision, String name, int exemplarFailure) {
403         System.out.println(title
404             + ":\t" + lang
405             + "\t" + subdivision
406             + "\t" + (exemplarFailure < 0 ? "" : "«" + UTF16.valueOf(exemplarFailure) + "»")
407             + "\t" + (exemplarFailure < 0 ? "" : "U+" + Utility.hex(exemplarFailure))
408             + "\t" + CldrUtility.ifNull(getBestWikiEnglishName(subdivision), "")
409             + "\t" + CldrUtility.ifNull(name, "").replace("\"", "&quot;"));
410     }
411 
412     static final List<String> MARKERS = Arrays.asList("¹", "²", "³"); // if there are more than 3 of the same kind, throw exception
413 
scriptsFor(UnicodeSet main)414     private static UnicodeSet scriptsFor(UnicodeSet main) {
415         UnicodeSet result = UnicodeSet.EMPTY;
416         for (String s : main) {
417             int scriptCode = UScript.getScript(s.codePointAt(0));
418             if (scriptCode != UScript.COMMON || scriptCode != UScript.INHERITED) {
419                 result = new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, scriptCode);
420                 if (scriptCode == UScript.LATIN) {
421                     result.addAll("ʻ’&");
422                 }
423                 break;
424             }
425         }
426         return result;
427     }
428 }