• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.tool;
2 
3 import java.io.IOException;
4 import java.io.PrintWriter;
5 import java.util.Arrays;
6 import java.util.Collection;
7 import java.util.HashMap;
8 import java.util.HashSet;
9 import java.util.Iterator;
10 import java.util.List;
11 import java.util.Locale;
12 import java.util.Map;
13 import java.util.Map.Entry;
14 import java.util.Set;
15 import java.util.TreeMap;
16 import java.util.TreeSet;
17 
18 import org.apache.jena.query.QuerySolution;
19 import org.apache.jena.query.ResultSet;
20 import org.unicode.cldr.draft.FileUtilities;
21 import org.unicode.cldr.rdf.QueryClient;
22 import org.unicode.cldr.rdf.TsvWriter;
23 import org.unicode.cldr.test.DisplayAndInputProcessor;
24 import org.unicode.cldr.util.CLDRConfig;
25 import org.unicode.cldr.util.CLDRFile;
26 import org.unicode.cldr.util.CLDRFile.NumberingSystem;
27 import org.unicode.cldr.util.CLDRFile.WinningChoice;
28 import org.unicode.cldr.util.CLDRPaths;
29 import org.unicode.cldr.util.ChainedMap;
30 import org.unicode.cldr.util.ChainedMap.M4;
31 import org.unicode.cldr.util.CldrUtility;
32 import org.unicode.cldr.util.Counter;
33 import org.unicode.cldr.util.Factory;
34 import org.unicode.cldr.util.SimpleXMLSource;
35 import org.unicode.cldr.util.StandardCodes.LstrType;
36 import org.unicode.cldr.util.SupplementalDataInfo;
37 import org.unicode.cldr.util.Validity;
38 import org.unicode.cldr.util.Validity.Status;
39 import org.unicode.cldr.util.XPathParts;
40 
41 import com.google.common.collect.LinkedHashMultimap;
42 import com.google.common.collect.Multimap;
43 import com.google.common.collect.TreeMultimap;
44 import com.ibm.icu.impl.Row.R2;
45 import com.ibm.icu.impl.Row.R3;
46 import com.ibm.icu.impl.Row.R4;
47 import com.ibm.icu.impl.Utility;
48 import com.ibm.icu.lang.UProperty;
49 import com.ibm.icu.lang.UScript;
50 import com.ibm.icu.text.Normalizer2;
51 import com.ibm.icu.text.UTF16;
52 import com.ibm.icu.text.UnicodeSet;
53 import com.ibm.icu.util.ICUUncheckedIOException;
54 import com.ibm.icu.util.ULocale;
55 
56 public final class WikiSubdivisionLanguages {
57     private static final String WIKI_SUBDIVISION_LANGUAGES_TSV = "wikiSubdivisionLanguages.tsv";
58     static final SupplementalDataInfo SDI = SupplementalDataInfo.getInstance();
59     static final Set<String> regularSubdivisions = Validity.getInstance().getStatusToCodes(LstrType.subdivision).get(Status.regular);
60 
61     static final Map<String, R2<List<String>, String>> SUBDIVISION_ALIASES = SDI.getLocaleAliasInfo().get("subdivision");
62 
63     private static final boolean DEBUG_CONSOLE = false;
64     private static final String DEBUG_LANG_FILTER = null; // "az";
65 
66     private static final String BEFORE_TYPE = "//ldml/localeDisplayNames/subdivisions/subdivision[@type=\"";
67 
68     private static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance();
69     private static final Normalizer2 NFC = Normalizer2.getNFCInstance();
70 
71     private static ChainedMap.M3<String, String, String> SUB_LANG_NAME = ChainedMap.of(new TreeMap<String, Object>(), new TreeMap<String, Object>(),
72         String.class);
73     private static ChainedMap.M3<String, String, String> LANG_SUB_NAME = ChainedMap.of(new TreeMap<String, Object>(), new TreeMap<String, Object>(),
74         String.class);
75     private static Set<String> bogus = new TreeSet<>();
76     private static Multimap<Status, String> bogusStatus = TreeMultimap.create();
77 
getSubdivisionName(String subdivisionId, String languageId)78     public static String getSubdivisionName(String subdivisionId, String languageId) {
79         return WikiSubdivisionLanguages.LANG_SUB_NAME.get(languageId, subdivisionId);
80     }
81 
getBestWikiEnglishName(String subdivisionId)82     public static String getBestWikiEnglishName(String subdivisionId) {
83         String languageId = "en";
84         String name = WikiSubdivisionLanguages.getSubdivisionName(subdivisionId, languageId);
85         if (name != null) {
86             return name;
87         }
88         name = WikiSubdivisionLanguages.getSubdivisionName(subdivisionId, "es");
89         if (name != null) {
90             return name;
91         }
92         name = WikiSubdivisionLanguages.getSubdivisionName(subdivisionId, "fr");
93         if (name != null) {
94             return name;
95         }
96         Map<String, String> data = WikiSubdivisionLanguages.SUB_LANG_NAME.get(subdivisionId);
97         // try Spanish, then French, then first other
98         if (data != null) {
99             return data.entrySet().iterator().next().getValue(); // get first
100         }
101         return null;
102     }
103 
    /** Name of the query that init() hands to QueryClient.execSelectFromSparql; also echoed in init()'s progress output. */
    private static final String QUERY_NAME = "wikidata-wikisubdivisionLanguages";
105 
106     //static Map<String, String> WIKIDATA_TO_MID = new TreeMap<>();
init()107     static void init() throws IOException {
108 
109         QueryClient queryClient = QueryClient.getInstance();
110 
111         System.out.println("QUERY: " + QUERY_NAME);
112         ResultSet rs = queryClient.execSelectFromSparql(QUERY_NAME, QueryClient.WIKIDATA_SPARQL_SERVER);
113 
114         Map<String, Status> codeToStatus = Validity.getInstance().getCodeToStatus(LstrType.subdivision);
115         try(PrintWriter tsv = FileUtilities.openUTF8Writer(TsvWriter.getTsvDir(), WIKI_SUBDIVISION_LANGUAGES_TSV)) {
116             TsvWriter.writeRow(tsv, "item", "label", "code", "codeLabel");
117             for (;rs.hasNext();) {
118                 final QuerySolution qs = rs.next();
119 
120                 String item = QueryClient.getResourceOrNull(qs, "item");
121                 String label = NFC.normalize(QueryClient.getStringOrNull(qs, "label"));
122                 String code = QueryClient.getStringOrNull(qs,  "code");
123                 String codeLabel = QueryClient.getStringOrNull(qs, "codeLabel");
124 
125                 TsvWriter.writeRow(tsv, item, label, code, codeLabel);
126 
127                 String subdivision = SubdivisionNode.convertToCldr(code);
128                 if (!regularSubdivisions.contains(subdivision)) {
129                     Status status = codeToStatus.get(subdivision);
130                     if (status == null) {
131                         bogus.add(subdivision);
132                     } else {
133                         bogusStatus.put(status, subdivision);
134                     }
135                     continue;
136                 }
137                 if (DEBUG_LANG_FILTER != null && !DEBUG_LANG_FILTER.equals(codeLabel)) {
138                     continue;
139                 }
140                 SUB_LANG_NAME.put(subdivision, codeLabel, label);
141     //                WIKIDATA_TO_MID.put(subdivision, data.get(2));
142                 LANG_SUB_NAME.put(codeLabel, subdivision, label);
143             }
144             System.out.println("Queried " + QUERY_NAME + " at row count " + rs.getRowNumber());
145         }
146         System.out.println("Wrote to " + WIKI_SUBDIVISION_LANGUAGES_TSV);
147         // postprocess
148         String oldLang = null;
149         DisplayAndInputProcessor daip = null;
150         Exception[] internalException = { null };
151 
152         for (R3<String, String, String> row : LANG_SUB_NAME.rows()) {
153             String lang = row.get0();
154             String subdivision = row.get1();
155             String name = row.get2();
156             if (!lang.equals(oldLang)) {
157                 oldLang = lang;
158                 daip = new DisplayAndInputProcessor(new ULocale(lang));
159             }
160             String path = getSubdivisionPath(subdivision);
161             String name2 = daip.processInput(
162                 path,
163                 name.replace("\u00AD", ""),
164                 internalException);
165             if (name2.contains("'")) {
166                 int debug = 0;
167             }
168             // TODO remove soft hyphen in DAIP
169             if (internalException[0] != null) {
170                 throw new IllegalArgumentException(lang + "\t" + subdivision + "\t" + name, internalException[0]);
171             } else if (!name.equals(name2)) {
172                 //System.out.println(lang + "\t" + subdivision + "\t" + name + "\t" + name2);
173                 SUB_LANG_NAME.put(subdivision, lang, name2);
174                 LANG_SUB_NAME.put(lang, subdivision, name2);
175             }
176         }
177 
178     }
179 
getSubdivisionPath(String subdivision)180     private static String getSubdivisionPath(String subdivision) {
181         return BEFORE_TYPE + subdivision + "\"][@draft=\"contributed\"]";
182     }
183 
getSubdivisionFromPath(String path)184     private static String getSubdivisionFromPath(String path) {
185         return path.substring(BEFORE_TYPE.length(), path.indexOf('"', BEFORE_TYPE.length()));
186     }
187 
main(String[] args)188     public static void main(String[] args) throws IOException {
189         init();
190 
191         Counter<String> counter = new Counter<>();
192         Factory cldrFactory = CLDR_CONFIG.getCldrFactory();
193         Factory cldrFactorySubdivisions = Factory.make(CLDRPaths.SUBDIVISIONS_DIRECTORY, ".*");
194         CLDRFile file = null;
195         UnicodeSet exemplars = null;
196 
197         ChainedMap.M4<Integer, String, String, String> exemplarFailureLangSubdivisionName = ChainedMap.of(
198             new TreeMap<Integer, Object>(),
199             new TreeMap<String, Object>(),
200             new TreeMap<String, Object>(),
201             String.class);
202 
203         for (Entry<String, Map<String, String>> entry : LANG_SUB_NAME) {
204             String lang = entry.getKey();
205             file = cldrFactory.make(lang, true);
206 
207             CLDRFile oldFileSubdivisions;
208             try {
209                 oldFileSubdivisions = cldrFactorySubdivisions.make(lang, false);
210             } catch (Exception e) {
211                 oldFileSubdivisions = new CLDRFile(new SimpleXMLSource(lang)).freeze();
212             }
213 
214             Multimap<String, String> inverse = LinkedHashMultimap.create();
215             CLDRFile fileSubdivisions = fixedFile(oldFileSubdivisions, inverse);
216 
217             UnicodeSet main = file.getExemplarSet("", WinningChoice.WINNING, 0);
218             UnicodeSet auxiliary = file.getExemplarSet("auxiliary", WinningChoice.WINNING);
219             UnicodeSet punctuation = file.getExemplarSet("punctuation", WinningChoice.WINNING);
220             UnicodeSet numbers = file.getExemplarsNumeric(NumberingSystem.defaultSystem);
221             exemplars = new UnicodeSet()
222                 .addAll(main)
223                 .addAll(auxiliary)
224                 .addAll(scriptsFor(main)) // broad test,...
225                 .addAll(punctuation)
226                 .addAll(numbers)
227                 .addAll(new UnicodeSet("[\\ ]")).freeze();
228 
229             for (Entry<String, String> entry2 : entry.getValue().entrySet()) {
230                 String subdivision = entry2.getKey();
231                 String name = entry2.getValue();
232                 if (name.equals("Böyük Britaniya")) {
233                     int debug = 0;
234                 }
235                 String path = getSubdivisionPath(subdivision);
236                 String oldName = fileSubdivisions.getStringValue(path);
237                 if (oldName != null) {
238                     if (!oldName.equals(name)) {
239                         //System.out.println("Already has translation\t" + lang + "\t" + subdivision + "\t" + name + "\t" + oldName);
240                     }
241                     continue;
242                 }
243                 if (!exemplars.containsAll(name)) {
244                     UnicodeSet exemplarFailures = new UnicodeSet().addAll(name).removeAll(exemplars);
245                     addExemplarFailures(exemplarFailureLangSubdivisionName, exemplarFailures, lang, subdivision, name);
246                     continue;
247                 }
248                 fileSubdivisions.add(path, name);
249                 inverse.put(name, path);
250                 counter.add(lang, 1);
251             }
252 
253             // We now fix collisions
254             for (Entry<String, Collection<String>> entry3 : inverse.asMap().entrySet()) {
255                 String name = entry3.getKey();
256                 if (name.isEmpty()) {
257                     continue;
258                 }
259                 if (name.equals("Böyük Britaniya")) {
260                     int debug = 0;
261                 }
262                 Collection<String> paths = entry3.getValue();
263                 if (paths.size() <= 1) {
264                     continue;
265                 }
266                 if (paths.size() > 3) {
267                     int debug = 0;
268                 }
269                 // we only care about collisions *within* a region.
270                 // so group them together
271                 Multimap<String, String> regionToPaths = LinkedHashMultimap.create();
272                 for (String path : paths) {
273                     String sdId = getSubdivisionFromPath(path);
274                     String region = sdId.substring(0, 2).toUpperCase(Locale.ROOT);
275                     regionToPaths.put(region, path);
276                 }
277 
278                 // Now fix as necessary
279                 for (Entry<String, Collection<String>> regionAndPaths : regionToPaths.asMap().entrySet()) {
280                     Collection<String> paths2 = regionAndPaths.getValue();
281                     int markerIndex = 0;
282                     if (paths2.size() <= 1) {
283                         continue;
284                     }
285 
286                     // find if any of the paths are deprecated
287                     for (Iterator<String> it = paths2.iterator(); it.hasNext();) {
288                         String path = it.next();
289                         String sdId = getSubdivisionFromPath(path);
290                         if (!regularSubdivisions.contains(sdId)) { // deprecated
291                             fileSubdivisions.remove(path);
292                             it.remove();
293                             fail("Duplicate, not regular ", lang, getSubdivisionFromPath(path), "REMOVING", -1);
294                         }
295                     }
296                     if (paths2.size() <= 1) {
297                         continue;
298                     }
299 
300                     String otherId = null;
301                     for (String path : paths2) {
302 //                    if (nuke) {
303 //                        if (oldFileSubdivisions.getStringValue(path) == null) {
304 //                            fileSubdivisions.remove(path); // get rid of new ones
305 //                            System.out.println("Removing colliding " + lang + "\t" + path + "\t" + name);
306 //                        }
307                         if (markerIndex == 0) {
308                             otherId = getSubdivisionFromPath(path);
309                         } else {
310                             String fixedName = name + MARKERS.get(markerIndex);
311                             fail("Superscripting ", lang + "\t(" + otherId +")", getSubdivisionFromPath(path), fixedName, -1);
312                             //System.out.println("Superscripting colliding:\t" + lang + "\t" + path + "\t" + fixedName);
313                             fileSubdivisions.add(path, fixedName); // overwrite with superscripted
314                         }
315                         ++markerIndex;
316                     }
317                 }
318             }
319 
320             if (DEBUG_CONSOLE) {
321                 PrintWriter pw = new PrintWriter(System.out);
322                 fileSubdivisions.write(new PrintWriter(System.out));
323                 pw.flush();
324             } else {
325                 try (PrintWriter out = FileUtilities.openUTF8Writer(CLDRPaths.SUBDIVISIONS_DIRECTORY, lang + ".xml")) {
326                     fileSubdivisions.write(out);
327                 } catch (Exception e) {
328                     throw new ICUUncheckedIOException(e);
329                 }
330             }
331         }
332         fail("ExemplarFailures", exemplarFailureLangSubdivisionName);
333 
334         for (String lang : counter.getKeysetSortedByKey()) {
335             fail("Superscripting", lang, String.valueOf(counter.get(lang)), null, -1);
336         }
337         System.out.println("Bogus subdivisionIds:\t" + "*" + "\t" + bogus.size() + "\t" + bogus);
338         for (Entry<Status, Collection<String>> entry : bogusStatus.asMap().entrySet()) {
339             System.out.println("SubdivisionId:\t\t"
340                 + ":\t" + entry.getKey() + "\t" + entry.getValue().size() + "\t" + entry.getValue());
341         }
342     }
343 
fixedFile(CLDRFile oldFileSubdivisions, Multimap<String, String> inverse)344     private static CLDRFile fixedFile(CLDRFile oldFileSubdivisions, Multimap<String, String> inverse) {
345         CLDRFile fileSubdivisions = oldFileSubdivisions.cloneAsThawed();
346 
347         // for fixing collisions
348         // we first add existing items
349         Set<String> toRemove = new HashSet<>();
350         Map<String,String> toAdd = new HashMap<>();
351 
352         for (String path : fileSubdivisions) {
353             XPathParts parts = XPathParts.getFrozenInstance(path);
354             if (!"subdivision".equals(parts.getElement(-1))) {
355                 continue;
356             }
357             String name = fileSubdivisions.getStringValue(path);
358             if (name.equals("Böyük Britaniya")) {
359                 int debug = 0;
360             }
361             // handle aliases also
362             String type = parts.getAttributeValue(-1, "type");
363             R2<List<String>, String> replacement = SUBDIVISION_ALIASES.get(type);
364             if (replacement != null) {
365                 String fullPath = oldFileSubdivisions.getFullXPath(path);
366                 XPathParts parts2 = XPathParts.getFrozenInstance(fullPath).cloneAsThawed();
367                 for (String replacementType : replacement.get0()) {
368                     parts2.setAttribute(-1, "type", replacementType);
369                     toRemove.add(path);
370                     path = parts2.toString();
371                     toAdd.put(path, name);
372                     System.out.println("Adding alias: " + replacementType + "«" + name + "»");
373                     break;
374                 }
375             }
376             inverse.put(name, path);
377         }
378         fileSubdivisions.removeAll(toRemove, false);
379         for (Entry<String, String> entry2 : toAdd.entrySet()) {
380             fileSubdivisions.add(entry2.getKey(), entry2.getValue());
381         }
382         return fileSubdivisions;
383     }
384 
addExemplarFailures(M4<Integer, String, String, String> exemplarFailureLangSubdivisionName, UnicodeSet exemplarFailures, String language, String subdivision, String name)385     private static void addExemplarFailures(M4<Integer, String, String, String> exemplarFailureLangSubdivisionName, UnicodeSet exemplarFailures,
386         String language, String subdivision, String name) {
387         for (String s : exemplarFailures) {
388             exemplarFailureLangSubdivisionName.put(s.codePointAt(0), language, subdivision, name);
389         }
390     }
391 
fail(String title, M4<Integer, String, String, String> exemplarFailureLangSubdivisionName)392     private static void fail(String title, M4<Integer, String, String, String> exemplarFailureLangSubdivisionName) {
393         for (R4<Integer, String, String, String> entry : exemplarFailureLangSubdivisionName.rows()) {
394             fail(title, entry.get1(), entry.get2(), entry.get3(), entry.get0());
395         }
396     }
397 
fail(String title, String lang, String subdivision, String name, int exemplarFailure)398     private static void fail(String title, String lang, String subdivision, String name, int exemplarFailure) {
399         System.out.println(title
400             + ":\t" + lang
401             + "\t" + subdivision
402             + "\t" + (exemplarFailure < 0 ? "" : "«" + UTF16.valueOf(exemplarFailure) + "»")
403             + "\t" + (exemplarFailure < 0 ? "" : "U+" + Utility.hex(exemplarFailure))
404             + "\t" + CldrUtility.ifNull(getBestWikiEnglishName(subdivision), "")
405             + "\t" + CldrUtility.ifNull(name, "").replace("\"", "&quot;"));
406     }
407 
    // Suffixes that main() appends to names colliding within one region. The list is indexed by the
    // position of the path inside its colliding group, and only for positions 1 and up (position 0
    // keeps the plain name), so the second path gets "²", the third gets "³" and "¹" is never applied.
    static final List<String> MARKERS = Arrays.asList("¹", "²", "³"); // if there are more than 3 of the same kind, throw exception
409 
scriptsFor(UnicodeSet main)410     private static UnicodeSet scriptsFor(UnicodeSet main) {
411         UnicodeSet result = UnicodeSet.EMPTY;
412         for (String s : main) {
413             int scriptCode = UScript.getScript(s.codePointAt(0));
414             if (scriptCode != UScript.COMMON || scriptCode != UScript.INHERITED) {
415                 result = new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, scriptCode);
416                 if (scriptCode == UScript.LATIN) {
417                     result.addAll("ʻ’&");
418                 }
419                 break;
420             }
421         }
422         return result;
423     }
424 }