package org.unicode.cldr.util;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.test.DisplayAndInputProcessor;
import org.unicode.cldr.tool.SubdivisionNode;
import org.unicode.cldr.util.CLDRFile.NumberingSystem;
import org.unicode.cldr.util.CLDRFile.WinningChoice;
import org.unicode.cldr.util.ChainedMap.M4;
import org.unicode.cldr.util.StandardCodes.LstrType;
import org.unicode.cldr.util.Validity.Status;

import com.google.common.base.Splitter;
import com.google.common.collect.LinkedHashMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.TreeMultimap;
import com.ibm.icu.impl.Row.R2;
import com.ibm.icu.impl.Row.R3;
import com.ibm.icu.impl.Row.R4;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ICUUncheckedIOException;
import com.ibm.icu.util.ULocale;

/**
 * Loads subdivision names from {@code data/external/wikiSubdivisionLanguages.tsv} and, when run
 * via {@link #main(String[])}, merges them into the per-language subdivision files under
 * {@code CLDRPaths.SUBDIVISIONS_DIRECTORY}.
 *
 * <p>The TSV is read once, in the static initializer. Each row has the columns listed in
 * {@link Items}. Rows whose subdivision id is not a regular subdivision are not loaded; their ids
 * are collected for the report printed at the end of {@code main}.
 */
public final class WikiSubdivisionLanguages {
    static final SupplementalDataInfo SDI = SupplementalDataInfo.getInstance();
    static final Set<String> regularSubdivisions = Validity.getInstance().getStatusToCodes(LstrType.subdivision).get(Status.regular);

    static final Map<String, R2<List<String>, String>> SUBDIVISION_ALIASES = SDI.getLocaleAliasInfo().get("subdivision");

    private static final boolean DEBUG_CONSOLE = false;
    private static final String DEBUG_LANG_FILTER = null; // "az";

    private static final String BEFORE_TYPE = "//ldml/localeDisplayNames/subdivisions/subdivision[@type=\"";

    private static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance();
    private static final Normalizer2 NFC = Normalizer2.getNFCInstance();

    /** Columns of a TSV row, in file order; the ordinal is used as the column index. */
    enum Items {
        // http://www.wikidata.org/entity/Q24260 كانيلو AD-02 ar
        wid, translation, subdivisionId, languageId
    }

    /** subdivision id → language id → name. */
    private static final ChainedMap.M3<String, String, String> SUB_LANG_NAME = ChainedMap.of(new TreeMap<String, Object>(), new TreeMap<String, Object>(),
        String.class);
    /** language id → subdivision id → name (same data as {@link #SUB_LANG_NAME}, inverted). */
    private static final ChainedMap.M3<String, String, String> LANG_SUB_NAME = ChainedMap.of(new TreeMap<String, Object>(), new TreeMap<String, Object>(),
        String.class);
    /** Subdivision ids from the TSV that have no validity status at all. */
    private static final Set<String> bogus = new TreeSet<>();
    /** Subdivision ids from the TSV that have a validity status other than regular, keyed by that status. */
    private static final Multimap<Status, String> bogusStatus = TreeMultimap.create();

    /**
     * Returns the loaded name of a subdivision in a language.
     *
     * @param subdivisionId subdivision id, in the form stored by the static initializer
     * @param languageId language id as it appears in the TSV
     * @return the value stored for that pair in the language → subdivision → name map
     */
    public static String getSubdivisionName(String subdivisionId, String languageId) {
        return WikiSubdivisionLanguages.LANG_SUB_NAME.get(languageId, subdivisionId);
    }

    /**
     * Returns a fallback display name for a subdivision: the English name if there is one, else
     * Spanish, else French, else the name in the first language that has one.
     *
     * @param subdivisionId subdivision id, in the form stored by the static initializer
     * @return the chosen name, or {@code null} if no language has a name for the subdivision
     */
    public static String getBestWikiEnglishName(String subdivisionId) {
        // try English, then Spanish, then French, then first other
        for (String languageId : Arrays.asList("en", "es", "fr")) {
            String name = WikiSubdivisionLanguages.getSubdivisionName(subdivisionId, languageId);
            if (name != null) {
                return name;
            }
        }
        Map<String, String> data = WikiSubdivisionLanguages.SUB_LANG_NAME.get(subdivisionId);
        if (data != null && !data.isEmpty()) {
            return data.values().iterator().next(); // get first
        }
        return null;
    }

    //static Map<String, String> WIKIDATA_TO_MID = new TreeMap<>();
    static {
        Splitter tab = Splitter.on('\t').trimResults();
        // Informational only: the data itself is read through FileUtilities.in(...) below.
        File file = new File("data/external", "wikiSubdivisionLanguages.tsv");
        try {
            System.out.println(file.getCanonicalFile());
        } catch (IOException e) {
            e.printStackTrace();
        }
        Map<String, Status> codeToStatus = Validity.getInstance().getCodeToStatus(LstrType.subdivision);

        for (String line : FileUtilities.in(WikiSubdivisionLanguages.class, "data/external/wikiSubdivisionLanguages.tsv")) {

            List<String> data = tab.splitToList(line);
            String subdivision = SubdivisionNode.convertToCldr(data.get(Items.subdivisionId.ordinal()));
            if (!regularSubdivisions.contains(subdivision)) {
                // Not loaded; remembered so main() can report it.
                Status status = codeToStatus.get(subdivision);
                if (status == null) {
                    bogus.add(subdivision);
                } else {
                    bogusStatus.put(status, subdivision);
                }
                continue;
            }
            String lang = data.get(Items.languageId.ordinal());
            if (DEBUG_LANG_FILTER != null && !DEBUG_LANG_FILTER.equals(lang)) {
                continue;
            }
            String name = NFC.normalize(data.get(Items.translation.ordinal()));
            SUB_LANG_NAME.put(subdivision, lang, name);
            // WIKIDATA_TO_MID.put(subdivision, data.get(2));
            LANG_SUB_NAME.put(lang, subdivision, name);
        }

        // postprocess: run every name through the DisplayAndInputProcessor for its language
        String oldLang = null;
        DisplayAndInputProcessor daip = null;
        Exception[] internalException = { null };

        for (R3<String, String, String> row : LANG_SUB_NAME.rows()) {
            String lang = row.get0();
            String subdivision = row.get1();
            String name = row.get2();
            if (!lang.equals(oldLang)) {
                // one processor per language; created when the language changes from the previous row
                oldLang = lang;
                daip = new DisplayAndInputProcessor(new ULocale(lang));
            }
            String path = getSubdivisionPath(subdivision);
            String name2 = daip.processInput(
                path,
                name.replace("\u00AD", ""),
                internalException);
            // TODO remove soft hyphen in DAIP
            if (internalException[0] != null) {
                throw new IllegalArgumentException(lang + "\t" + subdivision + "\t" + name, internalException[0]);
            } else if (!name.equals(name2)) {
                //System.out.println(lang + "\t" + subdivision + "\t" + name + "\t" + name2);
                SUB_LANG_NAME.put(subdivision, lang, name2);
                LANG_SUB_NAME.put(lang, subdivision, name2);
            }
        }

    }

    /** Builds the XPath (with {@code draft="contributed"}) under which a subdivision's name is stored. */
    private static String getSubdivisionPath(String subdivision) {
        return BEFORE_TYPE + subdivision + "\"][@draft=\"contributed\"]";
    }

    /** Extracts the {@code type} attribute value from a path that starts with {@link #BEFORE_TYPE}. */
    private static String getSubdivisionFromPath(String path) {
        return path.substring(BEFORE_TYPE.length(), path.indexOf('"', BEFORE_TYPE.length()));
    }

    /**
     * For every language with loaded names: adds the names that the existing subdivision file
     * lacks and whose characters are all allowed for that language, disambiguates equal names
     * within one region, and writes the file out. Afterwards prints the exemplar failures, the
     * per-language counts and the subdivision ids that were skipped while loading.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Counter<String> counter = new Counter<>();
        Factory cldrFactory = CLDR_CONFIG.getCldrFactory();
        Factory cldrFactorySubdivisions = Factory.make(CLDRPaths.SUBDIVISIONS_DIRECTORY, ".*");

        ChainedMap.M4<Integer, String, String, String> exemplarFailureLangSubdivisionName = ChainedMap.of(
            new TreeMap<Integer, Object>(),
            new TreeMap<String, Object>(),
            new TreeMap<String, Object>(),
            String.class);

        for (Entry<String, Map<String, String>> entry : LANG_SUB_NAME) {
            String lang = entry.getKey();
            CLDRFile file = cldrFactory.make(lang, true);

            CLDRFile oldFileSubdivisions;
            try {
                oldFileSubdivisions = cldrFactorySubdivisions.make(lang, false);
            } catch (Exception e) {
                // could not load a subdivision file for this language: start from an empty one
                oldFileSubdivisions = new CLDRFile(new SimpleXMLSource(lang)).freeze();
            }

            // name → paths carrying that name; seeded with the existing entries by fixedFile
            Multimap<String, String> inverse = LinkedHashMultimap.create();
            CLDRFile fileSubdivisions = fixedFile(oldFileSubdivisions, inverse);

            UnicodeSet exemplars = allowedCharacters(file);

            for (Entry<String, String> entry2 : entry.getValue().entrySet()) {
                String subdivision = entry2.getKey();
                String name = entry2.getValue();
                String path = getSubdivisionPath(subdivision);
                String oldName = fileSubdivisions.getStringValue(path);
                if (oldName != null) {
                    // an existing name always wins, even if it differs from the new one
                    continue;
                }
                if (!exemplars.containsAll(name)) {
                    UnicodeSet exemplarFailures = new UnicodeSet().addAll(name).removeAll(exemplars);
                    addExemplarFailures(exemplarFailureLangSubdivisionName, exemplarFailures, lang, subdivision, name);
                    continue;
                }
                fileSubdivisions.add(path, name);
                inverse.put(name, path);
                counter.add(lang, 1);
            }

            fixCollisions(lang, fileSubdivisions, inverse);
            writeSubdivisions(lang, fileSubdivisions);
        }
        fail("ExemplarFailures", exemplarFailureLangSubdivisionName);

        // NOTE(review): the counter holds the number of names added per language, although the
        // row title says "Superscripting" — confirm the intended label.
        for (String lang : counter.getKeysetSortedByKey()) {
            fail("Superscripting", lang, String.valueOf(counter.get(lang)), null, -1);
        }
        System.out.println("Bogus subdivisionIds:\t" + "*" + "\t" + bogus.size() + "\t" + bogus);
        for (Entry<Status, Collection<String>> entry : bogusStatus.asMap().entrySet()) {
            System.out.println("SubdivisionId:\t\t"
                + ":\t" + entry.getKey() + "\t" + entry.getValue().size() + "\t" + entry.getValue());
        }
    }

    /**
     * Builds the frozen set of characters a new name may contain: the main, auxiliary and
     * punctuation exemplars, the numeric exemplars of the default numbering system, the whole
     * script chosen by {@link #scriptsFor(UnicodeSet)}, and the space character.
     */
    private static UnicodeSet allowedCharacters(CLDRFile file) {
        UnicodeSet mainExemplars = file.getExemplarSet("", WinningChoice.WINNING, 0);
        UnicodeSet auxiliary = file.getExemplarSet("auxiliary", WinningChoice.WINNING);
        UnicodeSet punctuation = file.getExemplarSet("punctuation", WinningChoice.WINNING);
        UnicodeSet numbers = file.getExemplarsNumeric(NumberingSystem.defaultSystem);
        return new UnicodeSet()
            .addAll(mainExemplars)
            .addAll(auxiliary)
            .addAll(scriptsFor(mainExemplars)) // broad test,...
            .addAll(punctuation)
            .addAll(numbers)
            .addAll(new UnicodeSet("[\\ ]")).freeze();
    }

    /**
     * Resolves names shared by several subdivisions of the same region. Paths whose subdivision is
     * not regular are removed from the file; of the remaining ones the first keeps the plain name
     * and each later one gets a marker from {@link #MARKERS} appended.
     *
     * @throws IndexOutOfBoundsException if one region has more colliding paths than there are markers
     */
    private static void fixCollisions(String lang, CLDRFile fileSubdivisions, Multimap<String, String> inverse) {
        for (Entry<String, Collection<String>> nameAndPaths : inverse.asMap().entrySet()) {
            String name = nameAndPaths.getKey();
            if (name.isEmpty()) {
                continue;
            }
            Collection<String> paths = nameAndPaths.getValue();
            if (paths.size() <= 1) {
                continue;
            }
            // we only care about collisions *within* a region.
            // so group them together
            Multimap<String, String> regionToPaths = LinkedHashMultimap.create();
            for (String path : paths) {
                String sdId = getSubdivisionFromPath(path);
                String region = sdId.substring(0, 2).toUpperCase(Locale.ROOT);
                regionToPaths.put(region, path);
            }

            // Now fix as necessary
            for (Entry<String, Collection<String>> regionAndPaths : regionToPaths.asMap().entrySet()) {
                Collection<String> paths2 = regionAndPaths.getValue();
                if (paths2.size() <= 1) {
                    continue;
                }

                // find if any of the paths are deprecated
                for (Iterator<String> it = paths2.iterator(); it.hasNext();) {
                    String path = it.next();
                    String sdId = getSubdivisionFromPath(path);
                    if (!regularSubdivisions.contains(sdId)) { // deprecated
                        fileSubdivisions.remove(path);
                        it.remove();
                        fail("Duplicate, not regular ", lang, sdId, "REMOVING", -1);
                    }
                }
                if (paths2.size() <= 1) {
                    continue;
                }

                String otherId = null;
                int markerIndex = 0;
                for (String path : paths2) {
                    if (markerIndex == 0) {
                        // the first path keeps the unmarked name
                        otherId = getSubdivisionFromPath(path);
                    } else {
                        if (markerIndex >= MARKERS.size()) {
                            throw new IndexOutOfBoundsException("Too many subdivisions share the name «" + name
                                + "» in " + lang + " (" + regionAndPaths.getKey() + "): no marker for index " + markerIndex);
                        }
                        String fixedName = name + MARKERS.get(markerIndex);
                        fail("Superscripting ", lang + "\t(" + otherId +")", getSubdivisionFromPath(path), fixedName, -1);
                        //System.out.println("Superscripting colliding:\t" + lang + "\t" + path + "\t" + fixedName);
                        fileSubdivisions.add(path, fixedName); // overwrite with superscripted
                    }
                    ++markerIndex;
                }
            }
        }
    }

    /**
     * Writes the file to the console when {@link #DEBUG_CONSOLE} is set, otherwise to
     * {@code <lang>.xml} in the subdivisions directory.
     *
     * @throws ICUUncheckedIOException wrapping any exception raised while opening or writing the file
     */
    private static void writeSubdivisions(String lang, CLDRFile fileSubdivisions) {
        if (DEBUG_CONSOLE) {
            PrintWriter pw = new PrintWriter(System.out);
            fileSubdivisions.write(pw); // write to the writer that is flushed below
            pw.flush();
        } else {
            try (PrintWriter out = FileUtilities.openUTF8Writer(CLDRPaths.SUBDIVISIONS_DIRECTORY, lang + ".xml")) {
                fileSubdivisions.write(out);
            } catch (Exception e) {
                throw new ICUUncheckedIOException(e);
            }
        }
    }

    /**
     * Returns a thawed copy of the existing subdivision file in which every subdivision whose type
     * has an entry in {@link #SUBDIVISION_ALIASES} is moved to the first replacement type.
     *
     * @param oldFileSubdivisions the existing file; not modified
     * @param inverse receives one name → path entry per subdivision element, using the
     *     replacement path for moved entries
     * @return the modifiable copy
     */
    private static CLDRFile fixedFile(CLDRFile oldFileSubdivisions, Multimap<String, String> inverse) {
        CLDRFile fileSubdivisions = oldFileSubdivisions.cloneAsThawed();

        // for fixing collisions
        // we first add existing items
        // Changes are collected and applied after the loop so the file is not modified while it is iterated.
        Set<String> toRemove = new HashSet<>();
        Map<String,String> toAdd = new HashMap<>();

        for (String path : fileSubdivisions) {
            XPathParts parts = XPathParts.getFrozenInstance(path);
            if (!"subdivision".equals(parts.getElement(-1))) {
                continue;
            }
            String name = fileSubdivisions.getStringValue(path);
            String effectivePath = path;
            // handle aliases also
            String type = parts.getAttributeValue(-1, "type");
            R2<List<String>, String> replacement = SUBDIVISION_ALIASES.get(type);
            if (replacement != null) {
                String fullPath = oldFileSubdivisions.getFullXPath(path);
                XPathParts parts2 = XPathParts.getInstance(fullPath);
                for (String replacementType : replacement.get0()) {
                    parts2.setAttribute(-1, "type", replacementType);
                    toRemove.add(path);
                    effectivePath = parts2.toString();
                    toAdd.put(effectivePath, name);
                    System.out.println("Adding alias: " + replacementType + "«" + name + "»");
                    break; // only the first replacement is used
                }
            }
            inverse.put(name, effectivePath);
        }
        fileSubdivisions.removeAll(toRemove, false);
        for (Entry<String, String> entry2 : toAdd.entrySet()) {
            fileSubdivisions.add(entry2.getKey(), entry2.getValue());
        }
        return fileSubdivisions;
    }

    /** Records, for each element of {@code exemplarFailures}, its first code point together with the language, subdivision and name. */
    private static void addExemplarFailures(M4<Integer, String, String, String> exemplarFailureLangSubdivisionName, UnicodeSet exemplarFailures,
        String language, String subdivision, String name) {
        for (String s : exemplarFailures) {
            exemplarFailureLangSubdivisionName.put(s.codePointAt(0), language, subdivision, name);
        }
    }

    /** Prints one report row per recorded (code point, language, subdivision, name) entry. */
    private static void fail(String title, M4<Integer, String, String, String> exemplarFailureLangSubdivisionName) {
        for (R4<Integer, String, String, String> entry : exemplarFailureLangSubdivisionName.rows()) {
            fail(title, entry.get1(), entry.get2(), entry.get3(), entry.get0());
        }
    }

    /**
     * Prints one tab-separated report row to standard output.
     *
     * @param title row label
     * @param lang language column
     * @param subdivision subdivision column; also used to look up the fallback name column
     * @param name name column; {@code null} prints as empty, and double quotes are written as {@code &quot;}
     * @param exemplarFailure offending code point, or a negative value to leave the two character columns empty
     */
    private static void fail(String title, String lang, String subdivision, String name, int exemplarFailure) {
        System.out.println(title
            + ":\t" + lang
            + "\t" + subdivision
            + "\t" + (exemplarFailure < 0 ? "" : "«" + UTF16.valueOf(exemplarFailure) + "»")
            + "\t" + (exemplarFailure < 0 ? "" : "U+" + Utility.hex(exemplarFailure))
            + "\t" + CldrUtility.ifNull(getBestWikiEnglishName(subdivision), "")
            + "\t" + CldrUtility.ifNull(name, "").replace("\"", "&quot;"));
    }

    static final List<String> MARKERS = Arrays.asList("¹", "²", "³"); // if there are more than 3 of the same kind, throw exception

    /**
     * Returns all characters of the script of the first element of {@code main} whose script is
     * neither Common nor Inherited; for Latin, a few extra characters are added.
     *
     * @param main exemplar set to inspect; only the first code point of each element is examined
     * @return the script's characters, or {@code UnicodeSet.EMPTY} if no element qualifies
     */
    private static UnicodeSet scriptsFor(UnicodeSet main) {
        UnicodeSet result = UnicodeSet.EMPTY;
        for (String s : main) {
            int scriptCode = UScript.getScript(s.codePointAt(0));
            // skip characters that do not identify a script
            if (scriptCode != UScript.COMMON && scriptCode != UScript.INHERITED) {
                result = new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, scriptCode);
                if (scriptCode == UScript.LATIN) {
                    result.addAll("ʻ’&");
                }
                break;
            }
        }
        return result;
    }
}