1 package org.unicode.cldr.tool; 2 3 import java.io.File; 4 import java.io.IOException; 5 import java.io.PrintWriter; 6 import java.util.Arrays; 7 import java.util.Collection; 8 import java.util.HashMap; 9 import java.util.HashSet; 10 import java.util.Iterator; 11 import java.util.List; 12 import java.util.Locale; 13 import java.util.Map; 14 import java.util.Map.Entry; 15 import java.util.Set; 16 import java.util.TreeMap; 17 import java.util.TreeSet; 18 19 import org.apache.jena.query.QuerySolution; 20 import org.apache.jena.query.ResultSet; 21 import org.unicode.cldr.draft.FileUtilities; 22 import org.unicode.cldr.rdf.QueryClient; 23 import org.unicode.cldr.rdf.TsvWriter; 24 import org.unicode.cldr.test.DisplayAndInputProcessor; 25 import org.unicode.cldr.tool.SubdivisionNode; 26 import org.unicode.cldr.util.CLDRConfig; 27 import org.unicode.cldr.util.CLDRFile; 28 import org.unicode.cldr.util.CLDRPaths; 29 import org.unicode.cldr.util.ChainedMap; 30 import org.unicode.cldr.util.CldrUtility; 31 import org.unicode.cldr.util.Counter; 32 import org.unicode.cldr.util.Factory; 33 import org.unicode.cldr.util.SimpleXMLSource; 34 import org.unicode.cldr.util.SupplementalDataInfo; 35 import org.unicode.cldr.util.Validity; 36 import org.unicode.cldr.util.XPathParts; 37 import org.unicode.cldr.util.CLDRFile.NumberingSystem; 38 import org.unicode.cldr.util.CLDRFile.WinningChoice; 39 import org.unicode.cldr.util.ChainedMap.M3; 40 import org.unicode.cldr.util.ChainedMap.M4; 41 import org.unicode.cldr.util.StandardCodes.LstrType; 42 import org.unicode.cldr.util.Validity.Status; 43 44 import com.google.common.base.Splitter; 45 import com.google.common.collect.LinkedHashMultimap; 46 import com.google.common.collect.Multimap; 47 import com.google.common.collect.TreeMultimap; 48 import com.ibm.icu.impl.Row.R2; 49 import com.ibm.icu.impl.Row.R3; 50 import com.ibm.icu.impl.Row.R4; 51 import com.ibm.icu.impl.Utility; 52 import com.ibm.icu.lang.UProperty; 53 import com.ibm.icu.lang.UScript; 54 import com.ibm.icu.text.Normalizer2; 55 import com.ibm.icu.text.UTF16; 56 import com.ibm.icu.text.UnicodeSet; 57 import com.ibm.icu.util.ICUUncheckedIOException; 58 import com.ibm.icu.util.ULocale; 59 60 public final class WikiSubdivisionLanguages { 61 private static final String WIKI_SUBDIVISION_LANGUAGES_TSV = "wikiSubdivisionLanguages.tsv"; 62 static final SupplementalDataInfo SDI = SupplementalDataInfo.getInstance(); 63 static final Set<String> regularSubdivisions = Validity.getInstance().getStatusToCodes(LstrType.subdivision).get(Status.regular); 64 65 static final Map<String, R2<List<String>, String>> SUBDIVISION_ALIASES = SDI.getLocaleAliasInfo().get("subdivision"); 66 67 private static final boolean DEBUG_CONSOLE = false; 68 private static final String DEBUG_LANG_FILTER = null; // "az"; 69 70 private static final String BEFORE_TYPE = "//ldml/localeDisplayNames/subdivisions/subdivision[@type=\""; 71 72 private static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance(); 73 private static final Normalizer2 NFC = Normalizer2.getNFCInstance(); 74 75 private static ChainedMap.M3<String, String, String> SUB_LANG_NAME = ChainedMap.of(new TreeMap<String, Object>(), new TreeMap<String, Object>(), 76 String.class); 77 private static ChainedMap.M3<String, String, String> LANG_SUB_NAME = ChainedMap.of(new TreeMap<String, Object>(), new TreeMap<String, Object>(), 78 String.class); 79 private static Set<String> bogus = new TreeSet<>(); 80 private static Multimap<Status, String> bogusStatus = TreeMultimap.create(); 81 getSubdivisionName(String subdivisionId, String languageId)82 public static String getSubdivisionName(String subdivisionId, String languageId) { 83 return WikiSubdivisionLanguages.LANG_SUB_NAME.get(languageId, subdivisionId); 84 } 85 getBestWikiEnglishName(String subdivisionId)86 public static String getBestWikiEnglishName(String subdivisionId) { 87 String languageId = "en"; 88 String name = WikiSubdivisionLanguages.getSubdivisionName(subdivisionId, languageId); 89 if (name != null) { 90 return name; 91 } 92 name = WikiSubdivisionLanguages.getSubdivisionName(subdivisionId, "es"); 93 if (name != null) { 94 return name; 95 } 96 name = WikiSubdivisionLanguages.getSubdivisionName(subdivisionId, "fr"); 97 if (name != null) { 98 return name; 99 } 100 Map<String, String> data = WikiSubdivisionLanguages.SUB_LANG_NAME.get(subdivisionId); 101 // try Spanish, then French, then first other 102 if (data != null) { 103 return data.entrySet().iterator().next().getValue(); // get first 104 } 105 return null; 106 } 107 108 private static final String QUERY_NAME = "wikidata-wikisubdivisionLanguages"; 109 110 //static Map<String, String> WIKIDATA_TO_MID = new TreeMap<>(); init()111 static void init() throws IOException { 112 113 QueryClient queryClient = QueryClient.getInstance(); 114 115 System.out.println("QUERY: " + QUERY_NAME); 116 ResultSet rs = queryClient.execSelectFromSparql(QUERY_NAME, QueryClient.WIKIDATA_SPARQL_SERVER); 117 118 Map<String, Status> codeToStatus = Validity.getInstance().getCodeToStatus(LstrType.subdivision); 119 try(PrintWriter tsv = FileUtilities.openUTF8Writer(TsvWriter.getTsvDir(), WIKI_SUBDIVISION_LANGUAGES_TSV)) { 120 TsvWriter.writeRow(tsv, "item", "label", "code", "codeLabel"); 121 for (;rs.hasNext();) { 122 final QuerySolution qs = rs.next(); 123 124 String item = QueryClient.getResourceOrNull(qs, "item"); 125 String label = NFC.normalize(QueryClient.getStringOrNull(qs, "label")); 126 String code = QueryClient.getStringOrNull(qs, "code"); 127 String codeLabel = QueryClient.getStringOrNull(qs, "codeLabel"); 128 129 TsvWriter.writeRow(tsv, item, label, code, codeLabel); 130 131 String subdivision = SubdivisionNode.convertToCldr(code); 132 if (!regularSubdivisions.contains(subdivision)) { 133 Status status = codeToStatus.get(subdivision); 134 if (status == null) { 135 bogus.add(subdivision); 136 } else { 137 bogusStatus.put(status, subdivision); 138 } 139 continue; 140 } 141 if (DEBUG_LANG_FILTER != null && !DEBUG_LANG_FILTER.equals(codeLabel)) { 142 continue; 143 } 144 SUB_LANG_NAME.put(subdivision, codeLabel, label); 145 // WIKIDATA_TO_MID.put(subdivision, data.get(2)); 146 LANG_SUB_NAME.put(codeLabel, subdivision, label); 147 } 148 System.out.println("Queried " + QUERY_NAME + " at row count " + rs.getRowNumber()); 149 } 150 System.out.println("Wrote to " + WIKI_SUBDIVISION_LANGUAGES_TSV); 151 // postprocess 152 String oldLang = null; 153 DisplayAndInputProcessor daip = null; 154 Exception[] internalException = { null }; 155 156 for (R3<String, String, String> row : LANG_SUB_NAME.rows()) { 157 String lang = row.get0(); 158 String subdivision = row.get1(); 159 String name = row.get2(); 160 if (!lang.equals(oldLang)) { 161 oldLang = lang; 162 daip = new DisplayAndInputProcessor(new ULocale(lang)); 163 } 164 String path = getSubdivisionPath(subdivision); 165 String name2 = daip.processInput( 166 path, 167 name.replace("\u00AD", ""), 168 internalException); 169 if (name2.contains("'")) { 170 int debug = 0; 171 } 172 // TODO remove soft hyphen in DAIP 173 if (internalException[0] != null) { 174 throw new IllegalArgumentException(lang + "\t" + subdivision + "\t" + name, internalException[0]); 175 } else if (!name.equals(name2)) { 176 //System.out.println(lang + "\t" + subdivision + "\t" + name + "\t" + name2); 177 SUB_LANG_NAME.put(subdivision, lang, name2); 178 LANG_SUB_NAME.put(lang, subdivision, name2); 179 } 180 } 181 182 } 183 getSubdivisionPath(String subdivision)184 private static String getSubdivisionPath(String subdivision) { 185 return BEFORE_TYPE + subdivision + "\"][@draft=\"contributed\"]"; 186 } 187 getSubdivisionFromPath(String path)188 private static String getSubdivisionFromPath(String path) { 189 return path.substring(BEFORE_TYPE.length(), path.indexOf('"', BEFORE_TYPE.length())); 190 } 191 main(String[] args)192 public static void main(String[] args) throws IOException { 193 init(); 194 195 Counter<String> counter = new Counter<>(); 196 Factory cldrFactory = CLDR_CONFIG.getCldrFactory(); 197 Factory cldrFactorySubdivisions = Factory.make(CLDRPaths.SUBDIVISIONS_DIRECTORY, ".*"); 198 CLDRFile file = null; 199 UnicodeSet exemplars = null; 200 201 ChainedMap.M4<Integer, String, String, String> exemplarFailureLangSubdivisionName = ChainedMap.of( 202 new TreeMap<Integer, Object>(), 203 new TreeMap<String, Object>(), 204 new TreeMap<String, Object>(), 205 String.class); 206 207 for (Entry<String, Map<String, String>> entry : LANG_SUB_NAME) { 208 String lang = entry.getKey(); 209 file = cldrFactory.make(lang, true); 210 211 CLDRFile oldFileSubdivisions; 212 try { 213 oldFileSubdivisions = cldrFactorySubdivisions.make(lang, false); 214 } catch (Exception e) { 215 oldFileSubdivisions = new CLDRFile(new SimpleXMLSource(lang)).freeze(); 216 } 217 218 Multimap<String, String> inverse = LinkedHashMultimap.create(); 219 CLDRFile fileSubdivisions = fixedFile(oldFileSubdivisions, inverse); 220 221 UnicodeSet main = file.getExemplarSet("", WinningChoice.WINNING, 0); 222 UnicodeSet auxiliary = file.getExemplarSet("auxiliary", WinningChoice.WINNING); 223 UnicodeSet punctuation = file.getExemplarSet("punctuation", WinningChoice.WINNING); 224 UnicodeSet numbers = file.getExemplarsNumeric(NumberingSystem.defaultSystem); 225 exemplars = new UnicodeSet() 226 .addAll(main) 227 .addAll(auxiliary) 228 .addAll(scriptsFor(main)) // broad test,... 229 .addAll(punctuation) 230 .addAll(numbers) 231 .addAll(new UnicodeSet("[\\ ]")).freeze(); 232 233 for (Entry<String, String> entry2 : entry.getValue().entrySet()) { 234 String subdivision = entry2.getKey(); 235 String name = entry2.getValue(); 236 if (name.equals("Böyük Britaniya")) { 237 int debug = 0; 238 } 239 String path = getSubdivisionPath(subdivision); 240 String oldName = fileSubdivisions.getStringValue(path); 241 if (oldName != null) { 242 if (!oldName.equals(name)) { 243 //System.out.println("Already has translation\t" + lang + "\t" + subdivision + "\t" + name + "\t" + oldName); 244 } 245 continue; 246 } 247 if (!exemplars.containsAll(name)) { 248 UnicodeSet exemplarFailures = new UnicodeSet().addAll(name).removeAll(exemplars); 249 addExemplarFailures(exemplarFailureLangSubdivisionName, exemplarFailures, lang, subdivision, name); 250 continue; 251 } 252 fileSubdivisions.add(path, name); 253 inverse.put(name, path); 254 counter.add(lang, 1); 255 } 256 257 // We now fix collisions 258 for (Entry<String, Collection<String>> entry3 : inverse.asMap().entrySet()) { 259 String name = entry3.getKey(); 260 if (name.isEmpty()) { 261 continue; 262 } 263 if (name.equals("Böyük Britaniya")) { 264 int debug = 0; 265 } 266 Collection<String> paths = entry3.getValue(); 267 if (paths.size() <= 1) { 268 continue; 269 } 270 if (paths.size() > 3) { 271 int debug = 0; 272 } 273 // we only care about collisions *within* a region. 274 // so group them together 275 Multimap<String, String> regionToPaths = LinkedHashMultimap.create(); 276 for (String path : paths) { 277 String sdId = getSubdivisionFromPath(path); 278 String region = sdId.substring(0, 2).toUpperCase(Locale.ROOT); 279 regionToPaths.put(region, path); 280 } 281 282 // Now fix as necessary 283 for (Entry<String, Collection<String>> regionAndPaths : regionToPaths.asMap().entrySet()) { 284 Collection<String> paths2 = regionAndPaths.getValue(); 285 int markerIndex = 0; 286 if (paths2.size() <= 1) { 287 continue; 288 } 289 290 // find if any of the paths are deprecated 291 for (Iterator<String> it = paths2.iterator(); it.hasNext();) { 292 String path = it.next(); 293 String sdId = getSubdivisionFromPath(path); 294 if (!regularSubdivisions.contains(sdId)) { // deprecated 295 fileSubdivisions.remove(path); 296 it.remove(); 297 fail("Duplicate, not regular ", lang, getSubdivisionFromPath(path), "REMOVING", -1); 298 } 299 } 300 if (paths2.size() <= 1) { 301 continue; 302 } 303 304 String otherId = null; 305 for (String path : paths2) { 306 // if (nuke) { 307 // if (oldFileSubdivisions.getStringValue(path) == null) { 308 // fileSubdivisions.remove(path); // get rid of new ones 309 // System.out.println("Removing colliding " + lang + "\t" + path + "\t" + name); 310 // } 311 if (markerIndex == 0) { 312 otherId = getSubdivisionFromPath(path); 313 } else { 314 String fixedName = name + MARKERS.get(markerIndex); 315 fail("Superscripting ", lang + "\t(" + otherId +")", getSubdivisionFromPath(path), fixedName, -1); 316 //System.out.println("Superscripting colliding:\t" + lang + "\t" + path + "\t" + fixedName); 317 fileSubdivisions.add(path, fixedName); // overwrite with superscripted 318 } 319 ++markerIndex; 320 } 321 } 322 } 323 324 if (DEBUG_CONSOLE) { 325 PrintWriter pw = new PrintWriter(System.out); 326 fileSubdivisions.write(new PrintWriter(System.out)); 327 pw.flush(); 328 } else { 329 try (PrintWriter out = FileUtilities.openUTF8Writer(CLDRPaths.SUBDIVISIONS_DIRECTORY, lang + ".xml")) { 330 fileSubdivisions.write(out); 331 } catch (Exception e) { 332 throw new ICUUncheckedIOException(e); 333 } 334 } 335 } 336 fail("ExemplarFailures", exemplarFailureLangSubdivisionName); 337 338 for (String lang : counter.getKeysetSortedByKey()) { 339 fail("Superscripting", lang, String.valueOf(counter.get(lang)), null, -1); 340 } 341 System.out.println("Bogus subdivisionIds:\t" + "*" + "\t" + bogus.size() + "\t" + bogus); 342 for (Entry<Status, Collection<String>> entry : bogusStatus.asMap().entrySet()) { 343 System.out.println("SubdivisionId:\t\t" 344 + ":\t" + entry.getKey() + "\t" + entry.getValue().size() + "\t" + entry.getValue()); 345 } 346 } 347 fixedFile(CLDRFile oldFileSubdivisions, Multimap<String, String> inverse)348 private static CLDRFile fixedFile(CLDRFile oldFileSubdivisions, Multimap<String, String> inverse) { 349 CLDRFile fileSubdivisions = oldFileSubdivisions.cloneAsThawed(); 350 351 // for fixing collisions 352 // we first add existing items 353 Set<String> toRemove = new HashSet<>(); 354 Map<String,String> toAdd = new HashMap<>(); 355 356 for (String path : fileSubdivisions) { 357 XPathParts parts = XPathParts.getFrozenInstance(path); 358 if (!"subdivision".equals(parts.getElement(-1))) { 359 continue; 360 } 361 String name = fileSubdivisions.getStringValue(path); 362 if (name.equals("Böyük Britaniya")) { 363 int debug = 0; 364 } 365 // handle aliases also 366 String type = parts.getAttributeValue(-1, "type"); 367 R2<List<String>, String> replacement = SUBDIVISION_ALIASES.get(type); 368 if (replacement != null) { 369 String fullPath = oldFileSubdivisions.getFullXPath(path); 370 XPathParts parts2 = XPathParts.getFrozenInstance(fullPath).cloneAsThawed(); 371 for (String replacementType : replacement.get0()) { 372 parts2.setAttribute(-1, "type", replacementType); 373 toRemove.add(path); 374 path = parts2.toString(); 375 toAdd.put(path, name); 376 System.out.println("Adding alias: " + replacementType + "«" + name + "»"); 377 break; 378 } 379 } 380 inverse.put(name, path); 381 } 382 fileSubdivisions.removeAll(toRemove, false); 383 for (Entry<String, String> entry2 : toAdd.entrySet()) { 384 fileSubdivisions.add(entry2.getKey(), entry2.getValue()); 385 } 386 return fileSubdivisions; 387 } 388 addExemplarFailures(M4<Integer, String, String, String> exemplarFailureLangSubdivisionName, UnicodeSet exemplarFailures, String language, String subdivision, String name)389 private static void addExemplarFailures(M4<Integer, String, String, String> exemplarFailureLangSubdivisionName, UnicodeSet exemplarFailures, 390 String language, String subdivision, String name) { 391 for (String s : exemplarFailures) { 392 exemplarFailureLangSubdivisionName.put(s.codePointAt(0), language, subdivision, name); 393 } 394 } 395 fail(String title, M4<Integer, String, String, String> exemplarFailureLangSubdivisionName)396 private static void fail(String title, M4<Integer, String, String, String> exemplarFailureLangSubdivisionName) { 397 for (R4<Integer, String, String, String> entry : exemplarFailureLangSubdivisionName.rows()) { 398 fail(title, entry.get1(), entry.get2(), entry.get3(), entry.get0()); 399 } 400 } 401 fail(String title, String lang, String subdivision, String name, int exemplarFailure)402 private static void fail(String title, String lang, String subdivision, String name, int exemplarFailure) { 403 System.out.println(title 404 + ":\t" + lang 405 + "\t" + subdivision 406 + "\t" + (exemplarFailure < 0 ? "" : "«" + UTF16.valueOf(exemplarFailure) + "»") 407 + "\t" + (exemplarFailure < 0 ? "" : "U+" + Utility.hex(exemplarFailure)) 408 + "\t" + CldrUtility.ifNull(getBestWikiEnglishName(subdivision), "") 409 + "\t" + CldrUtility.ifNull(name, "").replace("\"", """)); 410 } 411 412 static final List<String> MARKERS = Arrays.asList("¹", "²", "³"); // if there are more than 3 of the same kind, throw exception 413 scriptsFor(UnicodeSet main)414 private static UnicodeSet scriptsFor(UnicodeSet main) { 415 UnicodeSet result = UnicodeSet.EMPTY; 416 for (String s : main) { 417 int scriptCode = UScript.getScript(s.codePointAt(0)); 418 if (scriptCode != UScript.COMMON || scriptCode != UScript.INHERITED) { 419 result = new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, scriptCode); 420 if (scriptCode == UScript.LATIN) { 421 result.addAll("ʻ’&"); 422 } 423 break; 424 } 425 } 426 return result; 427 } 428 }