package org.unicode.cldr.tool;

import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.jena.query.QuerySolution;
import org.apache.jena.query.ResultSet;
import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.rdf.QueryClient;
import org.unicode.cldr.rdf.TsvWriter;
import org.unicode.cldr.test.DisplayAndInputProcessor;
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRFile.NumberingSystem;
import org.unicode.cldr.util.CLDRFile.WinningChoice;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.ChainedMap;
import org.unicode.cldr.util.ChainedMap.M4;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.Counter;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.SimpleXMLSource;
import org.unicode.cldr.util.StandardCodes.LstrType;
import org.unicode.cldr.util.SupplementalDataInfo;
import org.unicode.cldr.util.Validity;
import org.unicode.cldr.util.Validity.Status;
import org.unicode.cldr.util.XPathParts;

import com.google.common.collect.LinkedHashMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.TreeMultimap;
import com.ibm.icu.impl.Row.R2;
import com.ibm.icu.impl.Row.R3;
import com.ibm.icu.impl.Row.R4;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ICUUncheckedIOException;
import com.ibm.icu.util.ULocale;

/**
 * Tool that runs the {@code wikidata-wikisubdivisionLanguages} SPARQL query, records the returned
 * subdivision names per language, and merges names that are not yet present into the per-language
 * files under {@link CLDRPaths#SUBDIVISIONS_DIRECTORY}.
 *
 * <p>All state is held in static tables that are filled by {@link #init()}; the lookup methods
 * return {@code null} until {@code init()} has run.
 */
public final class WikiSubdivisionLanguages {
    private static final String WIKI_SUBDIVISION_LANGUAGES_TSV = "wikiSubdivisionLanguages.tsv";
    static final SupplementalDataInfo SDI = SupplementalDataInfo.getInstance();
    static final Set<String> regularSubdivisions = Validity.getInstance().getStatusToCodes(LstrType.subdivision).get(Status.regular);

    static final Map<String, R2<List<String>, String>> SUBDIVISION_ALIASES = SDI.getLocaleAliasInfo().get("subdivision");

    /** When true, {@link #main} dumps each generated file to stdout instead of writing it to disk. */
    private static final boolean DEBUG_CONSOLE = false;
    /** When non-null, {@link #init()} keeps only query rows whose {@code codeLabel} equals this value. */
    private static final String DEBUG_LANG_FILTER = null; // "az";

    private static final String BEFORE_TYPE = "//ldml/localeDisplayNames/subdivisions/subdivision[@type=\"";

    private static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance();
    private static final Normalizer2 NFC = Normalizer2.getNFCInstance();

    /** subdivision id -> language id -> name. */
    private static final ChainedMap.M3<String, String, String> SUB_LANG_NAME = ChainedMap.of(new TreeMap<String, Object>(), new TreeMap<String, Object>(),
        String.class);
    /** language id -> subdivision id -> name. */
    private static final ChainedMap.M3<String, String, String> LANG_SUB_NAME = ChainedMap.of(new TreeMap<String, Object>(), new TreeMap<String, Object>(),
        String.class);
    /** Subdivision ids from the query that have no validity status at all. */
    private static final Set<String> bogus = new TreeSet<>();
    /** Subdivision ids from the query that have a validity status other than regular, keyed by that status. */
    private static final Multimap<Status, String> bogusStatus = TreeMultimap.create();

    /** Languages consulted, in order, by {@link #getBestWikiEnglishName(String)}. */
    private static final List<String> PREFERRED_NAME_LANGUAGES = Arrays.asList("en", "es", "fr");

    private static final String QUERY_NAME = "wikidata-wikisubdivisionLanguages";

    /**
     * Suffixes appended to colliding names. The first colliding path keeps its plain name, so only
     * indexes 1 and 2 are ever read; a fourth collision within one region makes
     * {@code MARKERS.get} throw {@link IndexOutOfBoundsException}.
     */
    static final List<String> MARKERS = Arrays.asList("¹", "²", "³"); // if there are more than 3 of the same kind, throw exception

    /**
     * Looks up the recorded name of a subdivision in a language.
     *
     * @param subdivisionId subdivision id as stored by {@link #init()}
     * @param languageId language id as stored by {@link #init()}
     * @return the recorded name, or {@code null} if none is recorded
     */
    public static String getSubdivisionName(String subdivisionId, String languageId) {
        return WikiSubdivisionLanguages.LANG_SUB_NAME.get(languageId, subdivisionId);
    }

    /**
     * Returns a name for the subdivision, preferring English, then Spanish, then French, and
     * finally the first name recorded in any other language.
     *
     * @param subdivisionId subdivision id as stored by {@link #init()}
     * @return the best available name, or {@code null} if no name is recorded at all
     */
    public static String getBestWikiEnglishName(String subdivisionId) {
        for (String languageId : PREFERRED_NAME_LANGUAGES) {
            String name = getSubdivisionName(subdivisionId, languageId);
            if (name != null) {
                return name;
            }
        }
        Map<String, String> data = SUB_LANG_NAME.get(subdivisionId);
        if (data != null && !data.isEmpty()) {
            return data.entrySet().iterator().next().getValue(); // get first
        }
        return null;
    }

    /**
     * Runs the SPARQL query, copies every result row to the TSV file, and fills the name tables
     * with the rows whose subdivision is regular. Afterwards every stored name is passed through
     * {@link DisplayAndInputProcessor#processInput} (with soft hyphens removed first) and replaced
     * by the processed form when that differs.
     *
     * @throws IOException declared for the query and file operations performed here
     * @throws IllegalArgumentException if the input processor reports an exception for a name
     */
    static void init() throws IOException {

        QueryClient queryClient = QueryClient.getInstance();

        System.out.println("QUERY: " + QUERY_NAME);
        ResultSet rs = queryClient.execSelectFromSparql(QUERY_NAME, QueryClient.WIKIDATA_SPARQL_SERVER);

        Map<String, Status> codeToStatus = Validity.getInstance().getCodeToStatus(LstrType.subdivision);
        try (PrintWriter tsv = FileUtilities.openUTF8Writer(TsvWriter.getTsvDir(), WIKI_SUBDIVISION_LANGUAGES_TSV)) {
            TsvWriter.writeRow(tsv, "item", "label", "code", "codeLabel");
            while (rs.hasNext()) {
                final QuerySolution qs = rs.next();

                String item = QueryClient.getResourceOrNull(qs, "item");
                String label = NFC.normalize(QueryClient.getStringOrNull(qs, "label"));
                String code = QueryClient.getStringOrNull(qs, "code");
                String codeLabel = QueryClient.getStringOrNull(qs, "codeLabel");

                // Every row goes to the TSV, including the ones filtered out below.
                TsvWriter.writeRow(tsv, item, label, code, codeLabel);

                String subdivision = SubdivisionNode.convertToCldr(code);
                if (!regularSubdivisions.contains(subdivision)) {
                    // Remember why the id was rejected so main() can report it.
                    Status status = codeToStatus.get(subdivision);
                    if (status == null) {
                        bogus.add(subdivision);
                    } else {
                        bogusStatus.put(status, subdivision);
                    }
                    continue;
                }
                if (DEBUG_LANG_FILTER != null && !DEBUG_LANG_FILTER.equals(codeLabel)) {
                    continue;
                }
                SUB_LANG_NAME.put(subdivision, codeLabel, label);
                LANG_SUB_NAME.put(codeLabel, subdivision, label);
            }
            System.out.println("Queried " + QUERY_NAME + " at row count " + rs.getRowNumber());
        }
        System.out.println("Wrote to " + WIKI_SUBDIVISION_LANGUAGES_TSV);

        // postprocess
        String oldLang = null;
        DisplayAndInputProcessor daip = null;
        Exception[] internalException = { null };

        for (R3<String, String, String> row : LANG_SUB_NAME.rows()) {
            String lang = row.get0();
            String subdivision = row.get1();
            String name = row.get2();
            if (!lang.equals(oldLang)) {
                // One processor per language; it is rebuilt whenever the language changes.
                oldLang = lang;
                daip = new DisplayAndInputProcessor(new ULocale(lang));
            }
            String path = getSubdivisionPath(subdivision);
            // TODO remove soft hyphen in DAIP
            String name2 = daip.processInput(
                path,
                name.replace("\u00AD", ""),
                internalException);
            if (internalException[0] != null) {
                throw new IllegalArgumentException(lang + "\t" + subdivision + "\t" + name, internalException[0]);
            } else if (!name.equals(name2)) {
                SUB_LANG_NAME.put(subdivision, lang, name2);
                LANG_SUB_NAME.put(lang, subdivision, name2);
            }
        }
    }

    /** Builds the path under which this tool stores the name of {@code subdivision}. */
    private static String getSubdivisionPath(String subdivision) {
        return BEFORE_TYPE + subdivision + "\"][@draft=\"contributed\"]";
    }

    /** Extracts the {@code type} attribute value from a path that starts with {@link #BEFORE_TYPE}. */
    private static String getSubdivisionFromPath(String path) {
        return path.substring(BEFORE_TYPE.length(), path.indexOf('"', BEFORE_TYPE.length()));
    }

    /**
     * Entry point: loads the names via {@link #init()}, then for each language adds the names that
     * are missing from the existing subdivisions file and pass the exemplar check, disambiguates
     * names that collide within a region, writes the file, and prints a report.
     *
     * @param args unused
     * @throws IOException propagated from {@link #init()}
     */
    public static void main(String[] args) throws IOException {
        init();

        Counter<String> counter = new Counter<>();
        Factory cldrFactory = CLDR_CONFIG.getCldrFactory();
        Factory cldrFactorySubdivisions = Factory.make(CLDRPaths.SUBDIVISIONS_DIRECTORY, ".*");

        ChainedMap.M4<Integer, String, String, String> exemplarFailureLangSubdivisionName = ChainedMap.of(
            new TreeMap<Integer, Object>(),
            new TreeMap<String, Object>(),
            new TreeMap<String, Object>(),
            String.class);

        for (Entry<String, Map<String, String>> entry : LANG_SUB_NAME) {
            String lang = entry.getKey();
            CLDRFile file = cldrFactory.make(lang, true);

            CLDRFile oldFileSubdivisions;
            try {
                oldFileSubdivisions = cldrFactorySubdivisions.make(lang, false);
            } catch (Exception e) {
                // The factory could not produce a file for this language: start from an empty one.
                oldFileSubdivisions = new CLDRFile(new SimpleXMLSource(lang)).freeze();
            }

            // name -> paths carrying that name; used afterwards to detect collisions
            Multimap<String, String> inverse = LinkedHashMultimap.create();
            CLDRFile fileSubdivisions = fixedFile(oldFileSubdivisions, inverse);

            UnicodeSet exemplars = buildExemplars(file);

            for (Entry<String, String> entry2 : entry.getValue().entrySet()) {
                String subdivision = entry2.getKey();
                String name = entry2.getValue();
                String path = getSubdivisionPath(subdivision);
                if (fileSubdivisions.getStringValue(path) != null) {
                    continue; // a value is already present; it is never overwritten
                }
                if (!exemplars.containsAll(name)) {
                    UnicodeSet exemplarFailures = new UnicodeSet().addAll(name).removeAll(exemplars);
                    addExemplarFailures(exemplarFailureLangSubdivisionName, exemplarFailures, lang, subdivision, name);
                    continue;
                }
                fileSubdivisions.add(path, name);
                inverse.put(name, path);
                counter.add(lang, 1);
            }

            fixCollisions(lang, fileSubdivisions, inverse);
            writeSubdivisions(lang, fileSubdivisions);
        }
        fail("ExemplarFailures", exemplarFailureLangSubdivisionName);

        // NOTE(review): the counter holds the number of names added per language, yet the line is
        // titled "Superscripting" and the count is passed in the subdivision column - confirm intent.
        for (String lang : counter.getKeysetSortedByKey()) {
            fail("Superscripting", lang, String.valueOf(counter.get(lang)), null, -1);
        }
        System.out.println("Bogus subdivisionIds:\t" + "*" + "\t" + bogus.size() + "\t" + bogus);
        for (Entry<Status, Collection<String>> entry : bogusStatus.asMap().entrySet()) {
            System.out.println("SubdivisionId:\t\t"
                + ":\t" + entry.getKey() + "\t" + entry.getValue().size() + "\t" + entry.getValue());
        }
    }

    /**
     * Builds the set of characters accepted in names for the language of {@code file}: its main,
     * auxiliary, punctuation and numeric exemplars, the result of {@link #scriptsFor} applied to
     * the main exemplars, and the space character.
     */
    private static UnicodeSet buildExemplars(CLDRFile file) {
        UnicodeSet main = file.getExemplarSet("", WinningChoice.WINNING, 0);
        UnicodeSet auxiliary = file.getExemplarSet("auxiliary", WinningChoice.WINNING);
        UnicodeSet punctuation = file.getExemplarSet("punctuation", WinningChoice.WINNING);
        UnicodeSet numbers = file.getExemplarsNumeric(NumberingSystem.defaultSystem);
        return new UnicodeSet()
            .addAll(main)
            .addAll(auxiliary)
            .addAll(scriptsFor(main)) // broad test,...
            .addAll(punctuation)
            .addAll(numbers)
            .addAll(new UnicodeSet("[\\ ]")).freeze();
    }

    /**
     * Resolves names shared by several paths of the same region. Paths whose subdivision is not
     * regular are removed from the file; of the remaining paths the first keeps the plain name and
     * each later one gets a marker from {@link #MARKERS} appended.
     */
    private static void fixCollisions(String lang, CLDRFile fileSubdivisions, Multimap<String, String> inverse) {
        for (Entry<String, Collection<String>> nameAndPaths : inverse.asMap().entrySet()) {
            String name = nameAndPaths.getKey();
            Collection<String> paths = nameAndPaths.getValue();
            if (name.isEmpty() || paths.size() <= 1) {
                continue;
            }
            // we only care about collisions *within* a region.
            // so group them together
            Multimap<String, String> regionToPaths = LinkedHashMultimap.create();
            for (String path : paths) {
                String sdId = getSubdivisionFromPath(path);
                String region = sdId.substring(0, 2).toUpperCase(Locale.ROOT);
                regionToPaths.put(region, path);
            }

            // Now fix as necessary
            for (Entry<String, Collection<String>> regionAndPaths : regionToPaths.asMap().entrySet()) {
                // Work on a copy: removing through the multimap's value view can drop the key from
                // the backing map while asMap() is being iterated.
                List<String> regionPaths = new ArrayList<>(regionAndPaths.getValue());
                if (regionPaths.size() <= 1) {
                    continue;
                }

                // find if any of the paths are deprecated
                for (Iterator<String> it = regionPaths.iterator(); it.hasNext();) {
                    String path = it.next();
                    String sdId = getSubdivisionFromPath(path);
                    if (!regularSubdivisions.contains(sdId)) { // deprecated
                        fileSubdivisions.remove(path);
                        it.remove();
                        fail("Duplicate, not regular ", lang, sdId, "REMOVING", -1);
                    }
                }
                if (regionPaths.size() <= 1) {
                    continue;
                }

                String otherId = null;
                int markerIndex = 0;
                for (String path : regionPaths) {
                    if (markerIndex == 0) {
                        otherId = getSubdivisionFromPath(path); // first one keeps the plain name
                    } else {
                        String fixedName = name + MARKERS.get(markerIndex);
                        fail("Superscripting ", lang + "\t(" + otherId + ")", getSubdivisionFromPath(path), fixedName, -1);
                        fileSubdivisions.add(path, fixedName); // overwrite with superscripted
                    }
                    ++markerIndex;
                }
            }
        }
    }

    /**
     * Writes the file to {@code <lang>.xml} in the subdivisions directory, or to stdout when
     * {@link #DEBUG_CONSOLE} is set.
     *
     * @throws ICUUncheckedIOException wrapping any exception raised while writing to disk
     */
    private static void writeSubdivisions(String lang, CLDRFile fileSubdivisions) {
        if (DEBUG_CONSOLE) {
            PrintWriter pw = new PrintWriter(System.out);
            fileSubdivisions.write(pw); // write to the writer that is flushed below
            pw.flush();
        } else {
            try (PrintWriter out = FileUtilities.openUTF8Writer(CLDRPaths.SUBDIVISIONS_DIRECTORY, lang + ".xml")) {
                fileSubdivisions.write(out);
            } catch (Exception e) {
                throw new ICUUncheckedIOException(e);
            }
        }
    }

    /**
     * Returns a thawed copy of {@code oldFileSubdivisions} in which every subdivision path whose
     * type has an entry in {@link #SUBDIVISION_ALIASES} is re-keyed to the first replacement type.
     * Each (name, path) pair of the result is also recorded in {@code inverse}.
     *
     * @param oldFileSubdivisions the existing file; not modified
     * @param inverse receives name -> path entries for collision detection
     * @return the adjusted, modifiable copy
     */
    private static CLDRFile fixedFile(CLDRFile oldFileSubdivisions, Multimap<String, String> inverse) {
        CLDRFile fileSubdivisions = oldFileSubdivisions.cloneAsThawed();

        // for fixing collisions
        // we first add existing items
        // Changes are collected and applied after the loop so the file is not modified while iterated.
        Set<String> toRemove = new HashSet<>();
        Map<String, String> toAdd = new HashMap<>();

        for (String path : fileSubdivisions) {
            XPathParts parts = XPathParts.getFrozenInstance(path);
            if (!"subdivision".equals(parts.getElement(-1))) {
                continue;
            }
            String name = fileSubdivisions.getStringValue(path);
            // handle aliases also
            String type = parts.getAttributeValue(-1, "type");
            R2<List<String>, String> replacement = SUBDIVISION_ALIASES.get(type);
            if (replacement != null) {
                List<String> replacementTypes = replacement.get0();
                if (!replacementTypes.isEmpty()) {
                    // Only the first replacement is used.
                    String replacementType = replacementTypes.get(0);
                    String fullPath = oldFileSubdivisions.getFullXPath(path);
                    XPathParts parts2 = XPathParts.getFrozenInstance(fullPath).cloneAsThawed();
                    parts2.setAttribute(-1, "type", replacementType);
                    toRemove.add(path);
                    path = parts2.toString();
                    toAdd.put(path, name);
                    System.out.println("Adding alias: " + replacementType + "«" + name + "»");
                }
            }
            inverse.put(name, path);
        }
        fileSubdivisions.removeAll(toRemove, false);
        for (Entry<String, String> entry2 : toAdd.entrySet()) {
            fileSubdivisions.add(entry2.getKey(), entry2.getValue());
        }
        return fileSubdivisions;
    }

    /** Records, for each element of {@code exemplarFailures}, the name that contained it (keyed by its first code point). */
    private static void addExemplarFailures(M4<Integer, String, String, String> exemplarFailureLangSubdivisionName, UnicodeSet exemplarFailures,
        String language, String subdivision, String name) {
        for (String s : exemplarFailures) {
            exemplarFailureLangSubdivisionName.put(s.codePointAt(0), language, subdivision, name);
        }
    }

    /** Prints one report line per recorded exemplar failure. */
    private static void fail(String title, M4<Integer, String, String, String> exemplarFailureLangSubdivisionName) {
        for (R4<Integer, String, String, String> entry : exemplarFailureLangSubdivisionName.rows()) {
            fail(title, entry.get1(), entry.get2(), entry.get3(), entry.get0());
        }
    }

    /**
     * Prints one tab-separated report line to stdout.
     *
     * @param title line prefix
     * @param lang language column
     * @param subdivision subdivision column; also used to look up the best available name
     * @param name name column; {@code null} prints as empty, double quotes are printed as {@code &quot;}
     * @param exemplarFailure offending code point, or a negative value to leave both code point columns empty
     */
    private static void fail(String title, String lang, String subdivision, String name, int exemplarFailure) {
        System.out.println(title
            + ":\t" + lang
            + "\t" + subdivision
            + "\t" + (exemplarFailure < 0 ? "" : "«" + UTF16.valueOf(exemplarFailure) + "»")
            + "\t" + (exemplarFailure < 0 ? "" : "U+" + Utility.hex(exemplarFailure))
            + "\t" + CldrUtility.ifNull(getBestWikiEnglishName(subdivision), "")
            + "\t" + CldrUtility.ifNull(name, "").replace("\"", "&quot;"));
    }

    /**
     * Returns all characters of the script of the first element of {@code main} whose script is
     * neither Common nor Inherited; for Latin, three extra characters are added. Returns an empty
     * set when no such element exists.
     */
    private static UnicodeSet scriptsFor(UnicodeSet main) {
        for (String s : main) {
            int scriptCode = UScript.getScript(s.codePointAt(0));
            // Both tests must hold: with "||" the condition was always true, so a leading
            // Common/Inherited character decided the result.
            if (scriptCode != UScript.COMMON && scriptCode != UScript.INHERITED) {
                UnicodeSet result = new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, scriptCode);
                if (scriptCode == UScript.LATIN) {
                    result.addAll("ʻ’&");
                }
                return result;
            }
        }
        return UnicodeSet.EMPTY;
    }
}