package org.unicode.cldr.tool;

import java.io.IOException;
import java.io.PrintWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.function.Consumer;
import java.util.function.Function;

import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.Containment;
import org.unicode.cldr.util.DtdType;
import org.unicode.cldr.util.Iso639Data;
import org.unicode.cldr.util.Iso639Data.Type;
import org.unicode.cldr.util.SimpleXMLSource;
import org.unicode.cldr.util.StandardCodes;
import org.unicode.cldr.util.StandardCodes.LstrField;
import org.unicode.cldr.util.StandardCodes.LstrType;
import org.unicode.cldr.util.Validity;
import org.unicode.cldr.util.Validity.Status;

import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableMultimap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.ImmutableSet.Builder;
import com.google.common.collect.LinkedHashMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
import com.google.common.collect.TreeMultimap;
import com.ibm.icu.impl.Row.R2;
import com.ibm.icu.util.ICUUncheckedIOException;

/**
 * Command-line tool that derives a parent-to-child containment tree of language codes and
 * writes it to {@code languageGroup.xml} in the supplemental data directory.
 *
 * <p>The raw data comes from three tab-separated files under {@link #relDir}
 * ({@code entityToLabel.tsv}, {@code entityToCode.tsv}, {@code childToParent.tsv}). The chains
 * derived from them are then adjusted with the hand-maintained tables
 * {@link #REMOVE_PARENT_CHILDREN} and {@link #EXTRA_PARENT_CHILDREN}.
 *
 * <p>NOTE: the order of the static field declarations matters, because later initializers
 * (for example {@link #entityToCode} and {@link #childToParent}) use earlier fields.
 */
public class GenerateLanguageContainment {
    /** When true, only codes whose ISO 639 type is {@code Living} are processed. */
    private static final boolean ONLY_LIVING = false;
    private static final CLDRConfig CONFIG = CLDRConfig.getInstance();
    static final Splitter TAB = Splitter.on('\t').trimResults();
    static final CLDRFile ENGLISH = CONFIG.getEnglish();
    static final String relDir = "../util/data/languages/";
    static final Map<String, R2<List<String>, String>> ALIAS_MAP = CONFIG
        .getSupplementalDataInfo()
        .getLocaleAliasInfo()
        .get("language");

    /** Entity id to label, loaded from {@code entityToLabel.tsv}. */
    static final Map<String, String> entityToLabel = loadTsvPairsUnique(GenerateLanguageContainment.class, relDir + "entityToLabel.tsv",
        null, null, null);

    /** Formats a language code for display: {@code "root"} for {@code mul}, else {@code "English name (code)"}. */
    static final Function<String, String> NAME = code -> code.equals("mul") ? "root" : ENGLISH.getName(code) + " (" + code + ")";

    /**
     * Entity id to language code, loaded from {@code entityToCode.tsv}. Each code has its double
     * quotes stripped and is replaced by the first replacement listed in {@link #ALIAS_MAP},
     * unless that replacement contains an underscore, in which case the original code is kept.
     */
    static final Map<String, String> entityToCode = loadTsvPairsUnique(GenerateLanguageContainment.class, relDir + "entityToCode.tsv",
        code -> {
            code = code.replace("\"", "");
            R2<List<String>, String> v = ALIAS_MAP.get(code);
            String result = v == null
                ? code : v.get0().get(0);
            result = result.contains("_")
                ? code
                : result;
            return result;
        },
        null, NAME);

    /** Inverse of {@link #entityToCode}: language code to every entity id that maps to it. */
    static final Multimap<String, String> codeToEntity = ImmutableMultimap.copyOf(
        Multimaps.invertFrom(Multimaps.forMap(entityToCode), LinkedHashMultimap.create()));

    /** Child entity id to parent entity ids, loaded from {@code childToParent.tsv}. */
    static final Multimap<String, String> childToParent = loadTsvPairs(GenerateLanguageContainment.class, relDir + "childToParent.tsv",
        code -> getEntityName(code), code -> getEntityName(code));

    /** Language subtags whose language-subtag-registry {@code Scope} field is "Collection" (case-insensitive). */
    static final Set<String> COLLECTIONS;
    static {
        Map<String, Map<LstrField, String>> languages = StandardCodes.getEnumLstreg().get(LstrType.language);
        Builder<String> _collections = ImmutableSet.<String> builder();
        for (Entry<String, Map<LstrField, String>> e : languages.entrySet()) {
            String scope = e.getValue().get(LstrField.Scope);
            // equalsIgnoreCase on the literal is null-safe, so no separate null check is needed
            if ("Collection".equalsIgnoreCase(scope)) {
                _collections.add(e.getKey());
            }
        }
        COLLECTIONS = _collections.build();
    }

    /** Placeholder tree structure; nothing in this file uses it. */
    static class Tree {
        Set<String> leaves = new LinkedHashSet<>();

        /** Currently only reverses {@code chain} in place; nothing is added to {@link #leaves}. */
        void add(List<String> chain) {
            Collections.reverse(chain);
        }
    }

    /** Parent-to-child links added to the computed tree, after the removals have been applied. */
    static final Multimap<String, String> EXTRA_PARENT_CHILDREN = ImmutableMultimap.<String, String> builder()
        .put("mul", "art") // we add art programmatically
        .put("gmw", "ksh")
        .put("gmw", "wae")
        .put("mul", "tai")
        .put("tai", "th")
        .put("tai", "lo")
        .put("roa", "cpf")
        .put("roa", "cpp")
        .put("ber", "zgh")
        .put("sdv", "saq")
        .put("sw", "swc")
        .put("alv", "agq")
        .put("bnt", "asa")
        .put("bnt", "bez")
        .put("bnt", "cgg")
        .put("bnt", "ebu")
        .put("bnt", "ksb")
        .put("bnt", "lag")
        .put("bnt", "rof")
        .put("bnt", "sbp")
        .put("ngb", "sg")
        .put("alv", "ngb")
        .put("bnt", "jmc")
        .put("bnt", "mer")
        .put("bnt", "mgh")
        .put("bnt", "nmg")
        .put("bnt", "rwk")
        .put("bnt", "seh")
        .put("bnt", "vun")
        .put("bnt", "xog")
        .put("alv", "yav")
        .put("son", "khq")
        .put("euq", "eu")
        .put("mul", "euq")
        .put("mul", "jpx")
        .put("jpx", "ja")
        .put("ira", "lrc")
        .put("grk", "el")
        .put("grk", "grc")
        .put("grk", "gmy")
        .build();

    /**
     * Parent-to-child links removed from the computed tree. A child of {@code "*"} removes
     * every child of that parent.
     */
    static final Multimap<String, String> REMOVE_PARENT_CHILDREN = ImmutableMultimap.<String, String> builder()
        .put("mul", "und") // anomaly
        .put("mul", "crp")
        .put("crp", "*") // general Creole group interferes with French/Spanish/... language grouping
        .put("sit", "zh") // other cases where we have to remove items we add in different place above.
        .put("inc", "rmg")
        .put("sla", "cu")
        .put("ine", "gmy")
        .put("ine", "el")
        .put("ine", "grc")
        .build();

    /**
     * Prints diagnostics for a few sample languages, builds the containment tree, prints it to
     * standard output and writes {@code languageGroup.xml}.
     *
     * @param args ignored
     * @throws ICUUncheckedIOException if the tree or the XML file cannot be written
     */
    public static void main(String[] args) {
        showSpotChecks();

        Multimap<String, String> parentToChild = ImmutableMultimap.copyOf(buildParentToChild());
        // Named so that it does not shadow the static, entity-keyed childToParent field.
        Multimap<String, String> codeChildToParent = ImmutableMultimap.copyOf(Multimaps.invertFrom(parentToChild, TreeMultimap.create()));
        System.out.println("Checking " + "he" + "\t" + Containment.getAllDirected(codeChildToParent, "he"));

        PrintWriter out = new PrintWriter(System.out);
        print(out, parentToChild, new ArrayList<>(Arrays.asList("mul")));
        // Flush rather than print the writer object; do not close, since that would close System.out.
        out.flush();

        writeLanguageGroupFile(parentToChild);
    }

    /** Prints the raw ancestor chains of every entity mapped to a handful of sample language codes. */
    private static void showSpotChecks() {
        for (String check : Arrays.asList("sw", "km", "ksh", "wae", "kea", "mfe", "th", "lo")) {
            System.out.println("Checking " + ENGLISH.getName(check) + "[" + check + "]");
            Collection<String> entities = codeToEntity.get(check);
            if (entities.isEmpty()) {
                System.out.println("no code for " + check + ": " + entities);
                continue;
            }
            for (String entity : entities) {
                Set<List<String>> ancestors = getAllAncestors(entity);
                showEntityLists(entity + " parents ", ancestors);
                System.out.println();
            }
        }
    }

    /**
     * Builds the mutable parent-to-child map of language codes: links derived from the ancestor
     * chains of every regular language code, minus {@link #REMOVE_PARENT_CHILDREN}, plus
     * {@link #EXTRA_PARENT_CHILDREN}, plus every constructed language as a child of {@code art}.
     */
    private static TreeMultimap<String, String> buildParentToChild() {
        Map<Status, Set<String>> table = Validity.getInstance().getStatusToCodes(LstrType.language);
        TreeMultimap<String, String> _parentToChild = TreeMultimap.create();
        _parentToChild.put("mul", "und");
        for (String code : table.get(Status.regular)) {
            if (ONLY_LIVING && Iso639Data.getType(code) != Type.Living) {
                continue;
            }
            for (String entity : codeToEntity.get(code)) {
                if (childToParent.get(entity).isEmpty()) {
                    continue;
                }
                for (Set<String> chain : getAncestors(entity)) {
                    // A chain runs from the leaf towards the root, so each element is the
                    // parent of the element that precedes it.
                    String last = null;
                    for (String link : chain) {
                        if (last != null) {
                            _parentToChild.put(link, last);
                        }
                        last = link;
                    }
                }
            }
        }

        for (Entry<String, Collection<String>> removal : REMOVE_PARENT_CHILDREN.asMap().entrySet()) {
            String parent = removal.getKey();
            for (String child : removal.getValue()) {
                if (child.equals("*")) {
                    _parentToChild.removeAll(parent);
                } else {
                    _parentToChild.remove(parent, child);
                }
            }
        }

        _parentToChild.putAll(EXTRA_PARENT_CHILDREN);

        // special code for artificial
        for (String code : Iso639Data.getAvailable()) {
            if (Iso639Data.getType(code) == Type.Constructed) {
                _parentToChild.put("art", code);
            }
        }
        return _parentToChild;
    }

    /** Converts the tree into a supplemental-data CLDRFile and writes it as {@code languageGroup.xml}. */
    private static void writeLanguageGroupFile(Multimap<String, String> parentToChild) {
        SimpleXMLSource xmlSource = new SimpleXMLSource("languageGroup");
        xmlSource.setNonInheriting(true); // should be gotten from DtdType...
        CLDRFile newFile = new CLDRFile(xmlSource);
        newFile.setDtdType(DtdType.supplementalData);
        newFile.add("//" + DtdType.supplementalData + "/version[@number='$Revision$']", "");
        printXML(newFile, parentToChild);

        try (PrintWriter outFile = FileUtilities.openUTF8Writer(CLDRPaths.SUPPLEMENTAL_DIRECTORY, "languageGroup.xml")) {
            newFile.write(outFile);
        } catch (IOException e1) {
            throw new ICUUncheckedIOException("Can't write to languageGroup.xml", e1);
        }
    }

    /**
     * Prints each entity of each ancestor list as {@code id <TAB> code <TAB> label}, with a
     * blank line after every list.
     *
     * @param title currently unused; it is not printed
     * @param ancestors the ancestor lists to print
     */
    private static void showEntityLists(String title, Set<List<String>> ancestors) {
        Consumer<String> printEntity = t -> System.out.println(t + "\t" + entityToCode.get(t) + "\t" + entityToLabel.get(t));
        ancestors.forEach(item -> {
            item.forEach(printEntity);
            System.out.println();
        });
    }

    /** Adds the languageGroup paths for the whole tree, starting from the root {@code mul}. */
    private static void printXML(CLDRFile newFile, Multimap<String, String> parentToChild) {
        printXML(newFile, parentToChild, "mul");
    }

    /**
     * Recursively adds one {@code languageGroup[@parent=...]} path per code that has children,
     * with the space-joined children as its value. A {@code und} parent gets no path of its
     * own, but its children are still visited.
     */
    private static void printXML(CLDRFile newFile, Multimap<String, String> parentToChild, String base) {
        Collection<String> children = parentToChild.get(base);
        if (children.isEmpty()) {
            return;
        }
        if (!base.equals("und")) { // und is skipped: no good info
            newFile.add("//" + DtdType.supplementalData + "/languageGroups/languageGroup[@parent=\"" + base + "\"]",
                Joiner.on(" ").join(children));
        }
        for (String child : children) {
            printXML(newFile, parentToChild, child);
        }
    }

    /**
     * Depth-first walk that writes one line per leaf, showing the path from the first element
     * of {@code line} to that leaf as names separated by {@code " > "}.
     *
     * @param out destination; flushed after every line
     * @param parentToChild the tree to walk
     * @param line the path so far; appended to and truncated during the walk, and left with its
     *     original contents on normal return
     * @throws ICUUncheckedIOException if {@code out} fails
     */
    private static void print(Writer out, Multimap<String, String> parentToChild, List<String> line) {
        String current = line.get(line.size() - 1);
        Collection<String> children = parentToChild.get(current);
        if (children.isEmpty()) {
            try {
                String sep = "";
                for (String item : line) {
                    out.append(sep).append(NAME.apply(item));
                    sep = " > ";
                }
                out.append('\n');
                out.flush();
            } catch (IOException e) {
                // Report the failure instead of silently producing truncated output.
                throw new ICUUncheckedIOException("Can't write language containment line for " + line, e);
            }
        } else {
            for (String child : children) {
                line.add(child);
                print(out, parentToChild, line);
                line.remove(line.size() - 1);
            }
        }
    }

    /**
     * Converts the entity-level ancestor chains of {@code leaf} into chains of language codes.
     *
     * <p>For each chain: entities with no code are dropped. After the first code, only
     * collection codes are kept, except that {@code zh} is rewritten to {@code zhx}. A chain in
     * which a code repeats is discarded as a cycle. Chains left with fewer than two codes are
     * dropped, and {@code mul} is appended to the rest. Finally, any chain whose codes are a
     * proper subset of another chain's codes is removed.
     *
     * @param leaf entity id to start from
     * @return the resulting code chains, each ordered from the leaf's code to {@code mul}
     */
    private static Set<Set<String>> getAncestors(String leaf) {
        Set<List<String>> items = Containment.getAllDirected(childToParent, leaf);
        Set<Set<String>> itemsFixed = new LinkedHashSet<>();
        nextItem: for (List<String> item : items) {
            Set<String> chain = new LinkedHashSet<>();
            for (String id : item) {
                String code = entityToCode.get(id);
                if (code == null) {
                    continue;
                }

                // skip leaf nodes after the first

                if (!chain.isEmpty() && !COLLECTIONS.contains(code)) {
                    if (code.equals("zh")) {
                        code = "zhx"; // rewrite collections usage
                    } else {
                        log("Skipping inheritance from\t" + chain + "\t" + code + "\tfrom\t" + items);
                        continue;
                    }
                }

                // check for cycle, and skip if we have one

                boolean changed = chain.add(code);
                if (!changed) {
                    log("Cycle in\t" + chain + "\tfrom\t" + items);
                    continue nextItem;
                }
            }
            if (chain.size() > 1) {
                chain.add("mul"); // root
                itemsFixed.add(chain);
            }
        }
        // remove subsets
        // eg [[smp, he, mul], [smp, he, sem, afa, mul]]
        // => [[smp, he, sem, afa, mul]]
        if (itemsFixed.size() > 1) {
            Set<Set<String>> removals = new HashSet<>();
            for (Set<String> chain1 : itemsFixed) {
                for (Set<String> chain2 : itemsFixed) {
                    if (chain1.containsAll(chain2) && !chain2.containsAll(chain1)) {
                        removals.add(chain2);
                    }
                }
            }
            itemsFixed.removeAll(removals);
        }
        return itemsFixed;
    }

    /** Writes a diagnostic line to standard output. */
    private static void log(String string) {
        System.out.println(string);
    }

    /**
     * Returns the first parent whose code has ISO 639 type {@code Living}, or the first parent
     * if there is none. Not called anywhere in this file.
     *
     * @param parents candidate parent entity ids; must not be empty
     */
    private static String getBest(Collection<String> parents) {
        for (String parent : parents) {
            String code = entityToCode.get(parent);
            if (code == null) {
                continue;
            }
            Type type = Iso639Data.getType(code);
            if (type != Type.Living) {
                continue;
            }
            return parent;
        }
        // failed
        return parents.iterator().next();
    }

    /**
     * Returns a display name for an entity id: the formatted name of its language code if it
     * has one, else its label, else the text after the last {@code '/'} of the id with the
     * final character dropped.
     */
    private static String getEntityName(String key) {
        String code = entityToCode.get(key);
        if (code != null) {
            try {
                String name = NAME.apply(code);
                if (name != null) {
                    return name;
                }
            } catch (Exception ignored) {
                // Best effort: if the code cannot be named, fall back to the label or raw id below.
            }
        }
        String name = entityToLabel.get(key);
        if (name != null) {
            return name;
        }
        // NOTE(review): dropping the last character presumably strips a closing '>' from a
        // bracketed entity URI; confirm against the data files.
        int last = key.lastIndexOf('/');
        return key.substring(last + 1, key.length() - 1);
    }

    /**
     * Splits one data line on tabs and checks that it has a key and a value.
     *
     * @throws IllegalArgumentException if the line has fewer than two fields
     */
    private static List<String> splitPair(String file, String line) {
        List<String> parts = TAB.splitToList(line);
        if (parts.size() < 2) {
            throw new IllegalArgumentException(file + ": expected two tab-separated fields but got: " + line);
        }
        return parts;
    }

    /**
     * Loads a two-column tab-separated file into a multimap, skipping empty lines and lines
     * that start with {@code '?'}. Keys with more than one value are logged.
     *
     * @param class1 class the file location is relative to
     * @param file relative file name
     * @param keyMapper formats keys for the duplicate log only; may be null
     * @param valueMapper formats values for the duplicate log only; may be null
     * @return an immutable multimap of the first column to the second
     * @throws IllegalArgumentException if a data line has fewer than two fields
     */
    private static Multimap<String, String> loadTsvPairs(Class<?> class1, String file,
        Function<String, String> keyMapper, Function<String, String> valueMapper) {
        String rel = FileUtilities.getRelativeFileName(class1, file);
        System.out.println(rel);
        ImmutableMultimap.Builder<String, String> _keyToValues = ImmutableMultimap.builder();
        for (String line : FileUtilities.in(class1, file)) {
            if (line.startsWith("?") || line.isEmpty()) {
                continue;
            }
            List<String> parts = splitPair(file, line);
            _keyToValues.put(parts.get(0), parts.get(1));
        }
        ImmutableMultimap<String, String> result = _keyToValues.build();
        showDups(file, result, keyMapper, valueMapper);
        return result;
    }

    /**
     * Loads a two-column tab-separated file into a map with one value per key, skipping empty
     * lines and lines that start with {@code '?'}. The first value seen for a key wins, except
     * that a stored value of {@code "kxm"} is replaced by a later one. Keys with more than one
     * distinct value are logged.
     *
     * @param class1 class the file location is relative to
     * @param file relative file name
     * @param fixValue applied to every value before it is stored; may be null
     * @param keyMapper formats keys for the duplicate log only; may be null
     * @param valueMapper formats values for the duplicate log only; may be null
     * @return an immutable map of the first column to the chosen second-column value
     * @throws IllegalArgumentException if a data line has fewer than two fields
     */
    private static Map<String, String> loadTsvPairsUnique(Class<?> class1, String file,
        Function<String, String> fixValue,
        Function<String, String> keyMapper, Function<String, String> valueMapper) {
        String rel = FileUtilities.getRelativeFileName(class1, file);
        System.out.println(rel);
        Map<String, String> _keyToValue = new TreeMap<>();
        Multimap<String, String> _keyToValues = TreeMultimap.create();
        for (String line : FileUtilities.in(class1, file)) {
            if (line.startsWith("?") || line.isEmpty()) {
                continue;
            }
            List<String> parts = splitPair(file, line);
            String key = parts.get(0);
            String value = parts.get(1);
            if (fixValue != null) {
                value = fixValue.apply(value);
            }
            _keyToValues.put(key, value);
            String oldValue = _keyToValue.get(key);
            if (oldValue == null || oldValue.equals("kxm")) {
                _keyToValue.put(key, value);
            }
        }
        _keyToValue = ImmutableMap.copyOf(_keyToValue);
        showDups(file, _keyToValues, keyMapper, valueMapper);
        return _keyToValue;
    }

    /**
     * Logs every key that has more than one value, formatting the key and values with the
     * given mappers when they are non-null.
     */
    private static void showDups(String file, Multimap<String, String> _keyToValues,
        Function<String, String> keyMapper, Function<String, String> valueMapper) {
        for (Entry<String, Collection<String>> entry : _keyToValues.asMap().entrySet()) {
            Collection<String> valueSet = entry.getValue();
            if (valueSet.size() <= 1) {
                continue;
            }
            String key = entry.getKey();
            key = keyMapper == null ? key : keyMapper.apply(key);
            if (valueMapper != null) {
                Set<String> mapped = new LinkedHashSet<>();
                for (String value : valueSet) {
                    mapped.add(valueMapper.apply(value));
                }
                valueSet = mapped;
            }
            log(file + "\tMultiple values: " + key + "\t" + valueSet);
        }
    }

    /** Returns every entity-level ancestor chain of {@code lang} found through {@link #childToParent}. */
    static Set<List<String>> getAllAncestors(String lang) {
        return Containment.getAllDirected(childToParent, lang);
    }
}