package org.unicode.cldr.tool;

import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableMultimap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.ImmutableSet.Builder;
import com.google.common.collect.LinkedHashMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
import com.google.common.collect.Sets;
import com.google.common.collect.Sets.SetView;
import com.google.common.collect.SortedSetMultimap;
import com.google.common.collect.TreeMultimap;
import com.ibm.icu.impl.Row.R2;
import com.ibm.icu.util.ICUUncheckedIOException;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.function.Consumer;
import java.util.function.Function;
import org.apache.jena.query.QuerySolution;
import org.apache.jena.query.ResultSet;
import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.rdf.QueryClient;
import org.unicode.cldr.rdf.TsvWriter;
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.Containment;
import org.unicode.cldr.util.DiffLanguageGroups;
import org.unicode.cldr.util.DtdType;
import org.unicode.cldr.util.Iso639Data;
import org.unicode.cldr.util.Iso639Data.Type;
import org.unicode.cldr.util.LocaleNames;
import org.unicode.cldr.util.SimpleXMLSource;
import org.unicode.cldr.util.StandardCodes;
import org.unicode.cldr.util.StandardCodes.LstrField;
import org.unicode.cldr.util.StandardCodes.LstrType;
import org.unicode.cldr.util.Validity;
import org.unicode.cldr.util.Validity.Status;

/**
 * This code generates language group containment based on Wikidata. For example, it finds: root >
 * Indo-European [Other] (ine) > Germanic [Other] (gem) > West Germanic languages (gmw) > English
 * (en)
 *
 * <p>To do this, it reads three tables from Wikidata, and combines them. The combination is not
 * trivial, because Wikidata offers multiple "parents" for the same language, and many of the
 * parents do not have ISO codes. For the first problem, the software computes the possible parent
 * chains and picks among them. For the second problem, any parents without ISO codes are skipped
 * (after forming the chains, so the ultimate ancestors are still found). <br>
 * A number of debugging files are written to the external directory.
 *
 * <p>Some failures will be exposed by running this tool. Examples: <br>
 * <b>wikidata-entityToCode Multiple values:</b> Cebaara [Q1097512] [sef, sev]. <br>
 * If these are not CLDR languages then they do not need to be fixed. <br>
 * <b>wikidata-childToParent Multiple values:</b> Q118712 [Q118712] [German [de, Q18], English [en,
 * Q186]] <br>
 * Normally these don't need to be fixed; the generation code works around them. <br>
 * <b>Cycle in [dng, zhx]</b> from [[http://www.wikidata.org/entity/Q33050, <br>
 * These indicate that the Wikidata has a cycle in it. A => B => C => A. Ignore these unless the
 * cases are worth investigating.
 *
 * <p>Others are exposed by running TestLanguageGroup.java <br>
 * Error: (TestLanguageGroup.java:55) Single ancestor but not in ISOLATES: ce [Chechen] [ce] <br>
 * Check to see if the language has a language group (in this case not, so add to
 * TestLanguageGroup.ISOLATES). <br>
 * For kea [Kabuverdianu] [kea], you can add cpp as the parent, as follows. <br>
 * <b>Missing.</b> If a child-parent relation is missing, you can add it to RESET_PARENT_CHILDREN
 * so that it shows up. For example, .put("gmw", "lb") says that West Germanic is the parent of
 * Luxembourgish. <br>
 * <b>Extra.</b> Sometimes Wikidata has conflicting or erroneous entries. Those can be fixed by
 * adding to REMOVE_PARENT_CHILDREN. Use * to remove all children, such as .put("crp", "*") <br>
 * Sometimes the tool fails with JsonParseExceptions, but works if you rerun. <br>
 * Cycle in [dng, zhx] from ... Will be fixed by giving the language 'no parent' (mul)
 */
public class GenerateLanguageContainment {
    static {
        System.out.println(
                "See the class description for GenerateLanguageContainment.java about fixing problems.");
    }

    /** When true, codes whose ISO 639 type is not {@code Living} are skipped. */
    private static final boolean ONLY_LIVING = false;

    private static final CLDRConfig CONFIG = CLDRConfig.getInstance();
    private static final QueryClient queryClient = QueryClient.getInstance();

    static final Splitter TAB = Splitter.on('\t').trimResults();
    private static final Joiner JOIN_TAB = Joiner.on('\t');
    static final CLDRFile ENGLISH = CONFIG.getEnglish();
    static final String relDir = "../util/data/languages/";
    static final Map<String, R2<List<String>, String>> ALIAS_MAP =
            CONFIG.getSupplementalDataInfo().getLocaleAliasInfo().get("language");

    /**
     * Maps a language code to a display string of the form {@code English name (code)}; the code
     * {@code mul} is shown as root.
     *
     * <p>This must be declared before {@link #QUERY_HELPER}: static initializers run in textual
     * order, and the {@link QueryHelper} constructor both passes this function as a mapper and
     * calls it indirectly through {@link QueryHelper#getEntityName(String)}. Declared afterwards,
     * it would still be {@code null} while the queries are being loaded.
     */
    static final Function<String, String> NAME =
            code ->
                    code.equals(LocaleNames.MUL)
                            ? LocaleNames.ROOT
                            : ENGLISH.getName(code) + " (" + code + ")";

    /** We load the SparQL queries using this helper object, to be able to catch exceptions… */
    static final class QueryHelper {
        /** Wikidata entity to label, from the {@code wikidata-entityToLabel} query. */
        public final Map<String, String> entityToLabel;

        /** Wikidata entity to (alias-resolved) language code, one code per entity. */
        public final Map<String, String> entityToCode;

        /** Inverse of {@link #entityToCode}; a code may map to several entities. */
        public final ImmutableMultimap<String, String> codeToEntity;

        /** Wikidata child entity to its parent entities. */
        public final Multimap<String, String> childToParent;

        /**
         * Runs the three Wikidata queries and builds the lookup tables.
         *
         * @throws RuntimeException wrapping any failure, after printing its stack trace
         */
        QueryHelper() {
            try {
                entityToLabel =
                        loadQueryPairsUnique(
                                GenerateLanguageContainment.class,
                                "wikidata-entityToLabel",
                                null,
                                null,
                                null);

                entityToCode =
                        loadQueryPairsUnique(
                                GenerateLanguageContainment.class,
                                "wikidata-entityToCode",
                                QueryHelper::canonicalCode,
                                code -> showNameAndCode(code),
                                NAME);

                codeToEntity =
                        ImmutableMultimap.copyOf(
                                Multimaps.invertFrom(
                                        Multimaps.forMap(entityToCode),
                                        LinkedHashMultimap.create()));

                childToParent =
                        loadQueryPairs(
                                GenerateLanguageContainment.class,
                                "wikidata-childToParent",
                                code -> showNameAndCode(code),
                                code -> showNameAndCode(code));

            } catch (Throwable t) {
                t.printStackTrace();
                throw new RuntimeException(t);
            }
        }

        /**
         * Cleans up a code value returned by the query: strips double quotes and replaces the
         * code by the first replacement listed in {@link #ALIAS_MAP}, unless that replacement
         * contains an underscore, in which case the original code is kept.
         *
         * @param rawCode the code as returned by the query
         * @return the code to use
         */
        private static String canonicalCode(String rawCode) {
            final String code = rawCode.replace("\"", "");
            final R2<List<String>, String> alias = ALIAS_MAP.get(code);
            if (alias == null) {
                return code;
            }
            final List<String> replacements = alias.get0();
            // Guard against an alias entry that lists no replacement: keep the code as is.
            if (replacements == null || replacements.isEmpty()) {
                return code;
            }
            final String replacement = replacements.get(0);
            return replacement.contains("_") ? code : replacement;
        }

        /**
         * Returns a display name for an entity: the {@link #NAME} of its code if it has one,
         * otherwise its Wikidata label, otherwise the tail of the entity key.
         *
         * @param key the entity key
         * @return a non-null display name
         */
        String getEntityName(String key) {
            String code = getEntityCode(key);
            if (code != null) {
                try {
                    String name = NAME.apply(code);
                    if (name != null) {
                        return name;
                    }
                } catch (Exception ignored) {
                    // Best effort: this is only used for display, so if the name lookup fails
                    // for this code we fall back to the Wikidata label below.
                }
            }
            String name = entityToLabel.get(key);
            if (name != null) {
                return name;
            }
            return afterLastSlash(key);
        }

        /**
         * Returns the code for an entity, or null if there is none. Also returns null while
         * {@link #entityToCode} is not yet assigned, because this is reachable through the
         * mappers that the constructor passes to the loaders.
         */
        private String getEntityCode(String key) {
            return entityToCode == null ? null : entityToCode.get(key);
        }

        /**
         * Returns the part of the key after the last '/', without the key's final character.
         *
         * <p>NOTE(review): the end index drops the last character; presumably the keys carry a
         * trailing delimiter — confirm against the format returned by QueryClient.
         */
        private String afterLastSlash(String key) {
            return key.substring(key.lastIndexOf('/') + 1, key.length() - 1);
        }

        /**
         * Writes the loaded tables as TSV files, plus a version of child-to-parent in which both
         * sides are expanded with {@link #showNameAndCode(String)}.
         *
         * @throws IOException if writing fails
         */
        public void writeTsvs() throws IOException {
            TsvWriter.writeTsv("childToParent.tsv", childToParent, "child", "parent");
            TsvWriter.writeTsv("entityToCode.tsv", entityToCode, "lang", "langCode");
            TsvWriter.writeTsv("entityToLabel.tsv", entityToLabel, "lang", "langLabel");
            SortedSetMultimap<String, String> childToParentWithCodes = TreeMultimap.create();
            for (Entry<String, String> entry : childToParent.entries()) {
                childToParentWithCodes.put(
                        showNameAndCode(entry.getKey()), showNameAndCode(entry.getValue()));
            }
            TsvWriter.writeTsv(
                    "childToParentWithCodes.tsv",
                    childToParentWithCodes,
                    "childCode\tLabel",
                    "parentCode\tLabel");
        }

        /**
         * Formats one entity as {@code name (code, id)}, or {@code name (id)} if it has no code.
         *
         * @param qid the entity key
         * @return the formatted string
         */
        public String showNameAndCode(String qid) {
            final String code = getEntityCode(qid);
            return getEntityName(qid)
                    + " ("
                    + (code == null ? "" : code + ", ")
                    + afterLastSlash(qid)
                    + ")";
        }

        /**
         * Formats several entities with {@link #showNameAndCode(String)}, separated by ", ".
         *
         * @param qids the entity keys
         * @return the formatted string
         */
        public <T extends Iterable<String>> String showNameAndCode(T qids) {
            StringBuilder b = new StringBuilder();
            for (String qid : qids) {
                if (b.length() != 0) {
                    b.append(", ");
                }
                b.append(showNameAndCode(qid));
            }
            return b.toString();
        }

        /**
         * Formats several groups of entities; groups are separated by "; ".
         *
         * @param qids the groups of entity keys
         * @return the formatted string
         */
        public <T extends Iterable<String>, U extends Iterable<T>> String showNameAndCode2(U qids) {
            StringBuilder b = new StringBuilder();
            for (T group : qids) {
                if (b.length() != 0) {
                    b.append("; ");
                }
                b.append(showNameAndCode(group));
            }
            return b.toString();
        }
    }

    static final QueryHelper QUERY_HELPER = new QueryHelper();

    /** Language codes whose language-subtag-registry Scope is "Collection". */
    static final Set<String> COLLECTIONS;

    static {
        Map<String, Map<LstrField, String>> languages =
                StandardCodes.getEnumLstreg().get(LstrType.language);
        Builder<String> _collections = ImmutableSet.<String>builder();
        for (Entry<String, Map<LstrField, String>> e : languages.entrySet()) {
            // equalsIgnoreCase is null-safe on its argument
            if ("Collection".equalsIgnoreCase(e.getValue().get(LstrField.Scope))) {
                _collections.add(e.getKey());
            }
        }
        COLLECTIONS = _collections.build();
    }

    /** Currently a stub: {@link #add(List)} only reverses the list it is given, in place. */
    static class Tree {
        Set<String> leaves = new LinkedHashSet<>();

        void add(List<String> chain) {
            Collections.reverse(chain);
        }
    }

    /**
     * Parent-child relations (parent is the key, child is the value) that are applied on top of
     * the Wikidata results. When an entry is added, all other parents of that child are removed.
     */
    static final Multimap<String, String> RESET_PARENT_CHILDREN =
            ImmutableMultimap.<String, String>builder()
                    .put(LocaleNames.MUL, LocaleNames.UND) // anomaly
                    .put(LocaleNames.MUL, "art") // no containing language family
                    .put(LocaleNames.MUL, "euq") // no containing language family
                    .put(LocaleNames.MUL, "jpx") // no containing language family
                    .put(LocaleNames.MUL, "tai") // no containing language family
                    .put(
                            LocaleNames.MUL,
                            "ko") // no containing language family (Altaic is too controversial)
                    .put(LocaleNames.MUL, "crp") // no containing language family
                    .put(LocaleNames.MUL, "kgp") // no containing language family
                    .put("alv", "agq")
                    .put("alv", "cch") // Atlantic–Congo <= cch [Atsam]
                    .put("alv", "kcg") // Atlantic–Congo <= kcg [Tyap]
                    .put("alv", "ken") // Atlantic–Congo <= ken [Kenyang]
                    .put("alv", "ngb")
                    .put("alv", "yav")
                    .put("ber", "zgh")
                    .put("bnt", "asa")
                    .put("bnt", "bez")
                    .put("bnt", "cgg")
                    .put("bnt", "ebu")
                    .put("bnt", "jmc")
                    .put("bnt", "ksb")
                    .put("bnt", "lag")
                    .put("bnt", "mer")
                    .put("bnt", "mgh")
                    .put("bnt", "nmg")
                    .put("bnt", "rof")
                    .put("bnt", "rwk")
                    .put("bnt", "sbp")
                    .put("bnt", "seh")
                    .put("bnt", "vun")
                    .put("bnt", "xog")
                    .put("cpp", "kea")
                    .put("euq", "eu")
                    .put("gmw", "ksh") // gmw = West Germanic
                    .put("gmw", "lb")
                    .put("gmw", "wae")
                    .put("grk", "el")
                    .put("grk", "gmy")
                    .put("grk", "grc")
                    .put("ira", "lrc")
                    .put("ira", "bgn") // Iranian <= Western Balochi
                    .put("inc", "trw") // Indo-Aryan <= Torwali
                    .put("jpx", "ja")
                    .put("ngb", "sg")
                    .put("roa", "cpf")
                    .put("roa", "cpp")
                    .put("sdv", "saq")
                    .put("son", "khq")
                    .put("sw", "swc")
                    .put("tai", "blt") // tai [Tai] <= blt [Tai Dam]
                    .put("tai", "lo")
                    .put("tai", "th")
                    .put("zlw", "szl") // West Slavic <= Silesian

                    // Restoring languages removed in 2024-08 wikidata
                    .put("inc", "ur") // Urdu is indic
                    .put("inc", "pa") // Punjabi is indic
                    .put("inc", "skr") // Saraiki is indic
                    .put("zls", "bs") // South Slavic (sh has problems)
                    .put("zls", "hr") // South Slavic (sh has problems)
                    .put("zls", "sr") // South Slavic (sh has problems)
                    .put("inc", "hi") // Indic
                    .put("inc", "kok") // Indic
                    .put("inc", "ks") // Indic
                    .put("inc", "mr") // Indic
                    .put("inc", "sd") // Indic
                    .put("cr", "csw") // Cree
                    .put("tai", "za") // Tai
                    .put("fiu", "hu") // Finno-Ugric
                    .put("alg", "cr") // Algonquin
                    .put("sit", "bo") // Sino-Tibetan
                    .put("poz", "mg") // Malayo-Polynesian languages
                    .put("esx", "iu") // Eskimo-Aleut languages
                    .put("esx", "kl") // Eskimo-Aleut languages
                    .build();

    /**
     * To remove parent-child relations from Wikidata, eg if a child has two parents (where that
     * causes problems). Don't do it if there is an explicit parent above. The child "*" removes
     * all children of the parent.
     */
    static final Multimap<String, String> REMOVE_PARENT_CHILDREN =
            ImmutableMultimap.<String, String>builder()
                    .put("alv", "ukg") // ngf [Trans-New Guinea languages] <= ukg [Ukuriguma]
                    .put(
                            "crp",
                            "*") // general Creole group interferes with French/Spanish/... language
                    // grouping
                    .put("cus", "mhd") // bnt [Bantu] <= mhd [Mbugu] (not cus [Cushitic])
                    .put("gmw", "pih") // cpe [Creoles and pidgins, English based] <= pih
                    // [Pitcairn-Norfolk]
                    .put("inc", "rmg")
                    // Indo-European
                    .put("nic", "kcp") // ssa [Nilo-Saharan] <= kcp [Kanga]
                    .put("nic", "kec") // ssa [Nilo-Saharan] <= kec [Keiga]
                    .put("nic", "kgo") // ssa [Nilo-Saharan] <= kgo [Krongo]
                    .put("nic", "tbr") // ssa [Nilo-Saharan] <= tbr [Tumtum]
                    .put("nic", "tey") // ssa [Nilo-Saharan] <= tey [Tulishi]
                    .put("sit", "dz") // sit <= tbq <= dz
                    .put("sit", "zh")
                    .put("sla", "cu")
                    .put("tbq", "psq") // paa [Papuan]; for psq [Pasi] - not tbq [Tibeto-Burman
                    // languages]; (There is also a variety of the Sino-Tibetan Adi
                    // language called Pasi.
                    .build();

    static {
        // If a child is in RESET_PARENT_CHILDREN, it should not be in
        // REMOVE_PARENT_CHILDREN
        // That is because the RESET_PARENT_CHILDREN will cause the removal of any other
        // parents anyway.
        SetView<String> bad =
                Sets.intersection(
                        Set.copyOf(RESET_PARENT_CHILDREN.values()),
                        Set.copyOf(REMOVE_PARENT_CHILDREN.values()));
        if (!bad.isEmpty()) {
            System.err.println(
                    "Remove from REMOVE_PARENT_CHILDREN, child values: \""
                            + Joiner.on("\",\"").join(bad)
                            + "\"");
        }
    }

    /**
     * Runs the generator and exits with status 1 if {@code Containment.hadErrors} is set.
     *
     * @param args passed through to {@link #run(String[])}
     * @throws IOException if an output file cannot be written
     */
    public static void main(String[] args) throws IOException {
        new GenerateLanguageContainment().run(args);
        if (Containment.hadErrors) {
            System.err.println("ERROR: Containment Errors detected, see errors above.");
            System.exit(1);
        }
    }

    /**
     * Builds the parent-to-child table from Wikidata, applies the manual overrides, and writes
     * the debugging files and languageGroup.xml.
     *
     * @param args not used by this method
     * @throws IOException if an output file cannot be written
     */
    void run(String[] args) throws IOException {
        showSampleAncestors();

        Set<String> skipping = new LinkedHashSet<>();
        TreeMultimap<String, String> _parentToChild = buildParentToChild(skipping);

        System.out.println("Writing " + "skippingCodes.tsv");
        try (PrintWriter w =
                FileUtilities.openUTF8Writer(TsvWriter.getTsvDir(), "skippingCodes.tsv")) {
            for (String skipped : skipping) {
                w.println(skipped);
            }
        }

        // preflight
        DiffLanguageGroups.show("en");

        applyRemoveOverrides(_parentToChild);
        applyResetOverrides(_parentToChild);

        // special code for artificial
        for (String code : Iso639Data.getAvailable()) {
            if (Iso639Data.getType(code) == Type.Constructed) {
                _parentToChild.put("art", code);
            }
        }

        Multimap<String, String> parentToChild = ImmutableMultimap.copyOf(_parentToChild);
        Multimap<String, String> childToParent = ImmutableMultimap.copyOf(invert(parentToChild));
        System.out.println(
                "Checking " + "he" + "\t" + Containment.getAllDirected(childToParent, "he"));

        try (PrintWriter w =
                FileUtilities.openUTF8Writer(TsvWriter.getTsvDir(), "RawLanguageContainment.txt")) {
            print(w, parentToChild, new ArrayList<>(Arrays.asList(LocaleNames.MUL)));
        }
        SimpleXMLSource xmlSource = new SimpleXMLSource("languageGroup");
        xmlSource.setNonInheriting(true); // should be gotten from DtdType...
        CLDRFile newFile = new CLDRFile(xmlSource);
        newFile.setDtdType(DtdType.supplementalData);
        newFile.add("//" + DtdType.supplementalData + "/version[@number='$Revision$']", "");
        printXML(newFile, parentToChild);

        try (PrintWriter outFile =
                FileUtilities.openUTF8Writer(
                        CLDRPaths.SUPPLEMENTAL_DIRECTORY, "languageGroup.xml")) {
            newFile.write(outFile);
        } catch (IOException e1) {
            throw new ICUUncheckedIOException("Can't write to languageGroup.xml", e1);
        }

        QUERY_HELPER.writeTsvs();
        DiffLanguageGroups.main(new String[] {});
    }

    /** Prints the raw Wikidata ancestor chains of a few sample languages, as a sanity check. */
    private static void showSampleAncestors() {
        for (String check : Arrays.asList("sw", "km", "ksh", "wae", "kea", "mfe", "th", "lo")) {
            System.out.println("Checking " + ENGLISH.getName(check) + "[" + check + "]");
            Collection<String> entities = QUERY_HELPER.codeToEntity.get(check);
            if (entities.isEmpty()) {
                System.out.println("no code for " + check + ": " + entities);
                continue;
            }
            for (String entity : entities) {
                showEntityLists(entity + " parents ", getAllAncestors(entity));
                System.out.println();
            }
        }
    }

    /**
     * Builds the initial parent-to-child table: for every regular language code that has a
     * Wikidata entity with at least one parent, each adjacent pair in each ancestor chain is
     * recorded. The table always starts with mul as the parent of und.
     *
     * @param skipping receives a message for each ancestor that {@link #getAncestors} skipped
     * @return a new, mutable parent-to-child table
     */
    private static TreeMultimap<String, String> buildParentToChild(Set<String> skipping) {
        Map<Status, Set<String>> table = Validity.getInstance().getStatusToCodes(LstrType.language);
        TreeMultimap<String, String> parentToChild = TreeMultimap.create();
        parentToChild.put(LocaleNames.MUL, LocaleNames.UND);
        for (String code : table.get(Status.regular)) {
            if (ONLY_LIVING && Iso639Data.getType(code) != Type.Living) {
                continue;
            }
            for (String entity : QUERY_HELPER.codeToEntity.get(code)) {
                if (QUERY_HELPER.childToParent.get(entity).isEmpty()) {
                    continue;
                }
                for (Set<String> chain : getAncestors(entity, skipping)) {
                    // chains run from the leaf up to the root, so each link is the parent of
                    // the one before it
                    String last = null;
                    for (String link : chain) {
                        if (last != null) {
                            parentToChild.put(link, last);
                        }
                        last = link;
                    }
                }
            }
        }
        return parentToChild;
    }

    /** Returns a new sorted multimap that maps each value of the argument to its keys. */
    private static Multimap<String, String> invert(Multimap<String, String> source) {
        return Multimaps.invertFrom(source, TreeMultimap.<String, String>create());
    }

    /**
     * Applies {@link #REMOVE_PARENT_CHILDREN} to the table and prints one line per entry.
     *
     * @param parentToChild the table to modify
     */
    private static void applyRemoveOverrides(TreeMultimap<String, String> parentToChild) {
        Multimap<String, String> childToParents = invert(parentToChild);

        System.out.println("\nOVERRIDE Remove parent");
        System.out.println("OVERRIDE\tParent\tChild\tNew Parents");
        for (Entry<String, String> entry : REMOVE_PARENT_CHILDREN.entries()) {
            final String parent = entry.getKey();
            final String child = entry.getValue();
            // Multimap.get never returns null, so "was anything there" has to be decided from
            // what the removal actually did.
            final boolean removed =
                    child.equals("*")
                            ? !parentToChild.removeAll(parent).isEmpty()
                            : parentToChild.remove(parent, child);
            final String type;
            if (removed) {
                childToParents = invert(parentToChild);
                type = "Removing parent";
            } else {
                type = "No remove";
            }
            System.out.println(
                    JOIN_TAB.join(
                            type,
                            DiffLanguageGroups.show(parent),
                            DiffLanguageGroups.show(child),
                            childToParents.get(child)));
        }
    }

    /**
     * Applies {@link #RESET_PARENT_CHILDREN} to the table and prints one line per entry. Adding
     * an entry removes every other parent of that child.
     *
     * @param parentToChild the table to modify
     */
    private static void applyResetOverrides(TreeMultimap<String, String> parentToChild) {
        Multimap<String, String> childToParents = invert(parentToChild);

        System.out.println("\nOVERRIDE Replace Parent");
        System.out.println("OVERRIDE\tParent\tChild");
        for (Entry<String, String> entry : RESET_PARENT_CHILDREN.entries()) {
            final String parent = entry.getKey();
            final String child = entry.getValue();
            final Set<String> removals = new LinkedHashSet<>();

            final String type;
            if (parentToChild.containsEntry(parent, child)) {
                type = "Redundant add";
            } else {
                type = "Changing";
                parentToChild.put(parent, child);
                childToParents = invert(parentToChild);
                // newParents belongs to the inverted copy, so parentToChild can be modified
                // while iterating over it
                Collection<String> newParents = childToParents.get(child);
                if (newParents.size() > 1) {
                    for (String parent2 : newParents) {
                        if (!parent2.equals(parent)) {
                            parentToChild.remove(parent2, child);
                            removals.add(parent2);
                        }
                    }
                    // rebuild
                    childToParents = invert(parentToChild);
                }
            }
            System.out.println(
                    JOIN_TAB.join(
                            type,
                            DiffLanguageGroups.show(parent),
                            DiffLanguageGroups.show(child),
                            childToParents.get(child),
                            removals));
        }
    }

    /**
     * Prints each entity of each chain as {@code entity, code, label} (tab-separated), with a
     * blank line after each chain.
     *
     * @param title currently not used
     * @param ancestors the chains to print
     */
    private static void showEntityLists(String title, Set<List<String>> ancestors) {
        for (List<String> chain : ancestors) {
            for (String entity : chain) {
                System.out.println(
                        entity
                                + "\t"
                                + QUERY_HELPER.entityToCode.get(entity)
                                + "\t"
                                + QUERY_HELPER.entityToLabel.get(entity));
            }
            System.out.println();
        }
    }

    /** Adds the languageGroup elements for the whole tree, starting at mul. */
    private static void printXML(CLDRFile newFile, Multimap<String, String> parentToChild) {
        printXML(newFile, parentToChild, LocaleNames.MUL);
    }

    /**
     * Adds a languageGroup element for {@code base} listing its children (space-separated), then
     * recurses into each child. Nothing is added for a base without children, and no element is
     * added for und itself (its children are still visited).
     */
    private static void printXML(
            CLDRFile newFile, Multimap<String, String> parentToChild, String base) {
        Collection<String> children = parentToChild.get(base);
        if (children.isEmpty()) {
            return;
        }
        if (!base.equals(LocaleNames.UND)) { // und: skip, no good info
            newFile.add(
                    "//"
                            + DtdType.supplementalData
                            + "/languageGroups/languageGroup[@parent=\""
                            + base
                            + "\"]",
                    Joiner.on(" ").join(children));
        }
        for (String child : children) {
            printXML(newFile, parentToChild, child);
        }
    }

    /**
     * Writes one line per leaf of the tree below the last element of {@code line}; each line is
     * the full path from the first element of {@code line} to the leaf, joined by " > ".
     *
     * @param out where to write
     * @param parentToChild the tree
     * @param line the path so far; modified during the call and restored before returning
     * @throws ICUUncheckedIOException if writing fails
     */
    private static void print(
            Writer out, Multimap<String, String> parentToChild, List<String> line) {
        String current = line.get(line.size() - 1);
        Collection<String> children = parentToChild.get(current);
        if (children.isEmpty()) {
            try {
                String sep = "";
                for (String item : line) {
                    out.append(sep).append(NAME.apply(item));
                    sep = " > ";
                }
                out.append('\n');
                out.flush();
            } catch (IOException e) {
                throw new ICUUncheckedIOException("Can't write language containment line", e);
            }
        } else {
            for (String child : children) {
                line.add(child);
                print(out, parentToChild, line);
                line.remove(line.size() - 1);
            }
        }
    }

    /**
     * Computes the ancestor chains of a Wikidata entity as chains of language codes, each ending
     * in mul.
     *
     * <p>Entities without a code are dropped from a chain. After the first code in a chain,
     * codes that are not in {@link #COLLECTIONS} are also dropped (and reported in {@code
     * skipping}), except that zh is rewritten to zhx. A chain in which a code repeats is logged
     * as a cycle and discarded. Chains with fewer than two codes are discarded. Finally, any
     * chain whose codes are a proper subset of another chain's codes is removed.
     *
     * @param leaf the entity to start from
     * @param skipping receives a message for each dropped non-collection ancestor
     * @return the resulting chains
     */
    private static Set<Set<String>> getAncestors(String leaf, Set<String> skipping) {
        Set<List<String>> items = Containment.getAllDirected(QUERY_HELPER.childToParent, leaf);
        Set<Set<String>> itemsFixed = new LinkedHashSet<>();
        nextChain:
        for (List<String> item : items) {
            Set<String> chain = new LinkedHashSet<>();
            for (String id : item) {
                String code = QUERY_HELPER.entityToCode.get(id);
                if (code == null) {
                    continue;
                }

                // skip leaf nodes after the first

                if (!chain.isEmpty() && !COLLECTIONS.contains(code)) {
                    if (code.equals("zh")) {
                        code = "zhx"; // rewrite collections usage
                    } else {
                        skipping.add(
                                "Skipping inheritance from\t"
                                        + chain
                                        + "\t"
                                        + code
                                        + "\tfrom\t"
                                        + QUERY_HELPER.showNameAndCode2(items));
                        continue;
                    }
                }

                // check for cycle, and skip if we have one

                if (!chain.add(code)) {
                    log("Cycle in\t" + chain + "\tfrom\t" + QUERY_HELPER.showNameAndCode2(items));
                    continue nextChain;
                }
            }
            if (chain.size() > 1) {
                chain.add(LocaleNames.MUL); // root
                itemsFixed.add(chain);
            }
        }
        // remove subsets
        // eg [[smp, he, mul], [smp, he, sem, afa, mul]]
        // => [[smp, he, sem, afa, mul]]
        if (itemsFixed.size() > 1) {
            Set<Set<String>> removals = new HashSet<>();
            for (Set<String> chain1 : itemsFixed) {
                for (Set<String> chain2 : itemsFixed) {
                    if (chain1.containsAll(chain2) && !chain2.containsAll(chain1)) {
                        removals.add(chain2);
                    }
                }
            }
            itemsFixed.removeAll(removals);
        }
        return itemsFixed;
    }

    /** Prints a diagnostic line to standard output. */
    private static void log(String string) {
        System.out.println(string);
    }

    /**
     * Runs a SPARQL query that returns exactly two variables and collects the rows as key to
     * values. Keys with more than one value are logged.
     *
     * @param class1 not used by this method
     * @param file name of a sparql query, such as 'wikidata-childToParent'
     * @param keyMapper formats a key for the log; may be null
     * @param valueMapper formats a value for the log; may be null
     * @return an immutable multimap of the rows
     * @throws IOException if the query fails
     * @throws IllegalArgumentException if the query does not return exactly two variables
     */
    private static Multimap<String, String> loadQueryPairs(
            Class<?> class1,
            String file,
            Function<String, String> keyMapper,
            Function<String, String> valueMapper)
            throws IOException {
        System.out.println("QUERY: " + file);
        ResultSet rs = queryClient.execSelectFromSparql(file, QueryClient.WIKIDATA_SPARQL_SERVER);
        // the query must return exactly two variables.
        List<String> resultVars = rs.getResultVars();
        assertTwoVars(resultVars);
        final String keyName = resultVars.get(0);
        final String valueName = resultVars.get(1);

        ImmutableMultimap.Builder<String, String> _keyToValues = ImmutableMultimap.builder();
        while (rs.hasNext()) {
            final QuerySolution qs = rs.next();
            String key = QueryClient.getStringOrNull(qs, keyName);
            String value = QueryClient.getStringOrNull(qs, valueName);
            _keyToValues.put(key, value);
        }
        ImmutableMultimap<String, String> result = _keyToValues.build();
        showDups(file, result, keyMapper, valueMapper);
        System.out.println("LOADED: " + file + " with rows " + rs.getRowNumber());
        return result;
    }

    /**
     * Assuming that the SPARQL query returns exactly 2 results, treat them as Key=Value. When a
     * key has several values, the first one seen is kept, unless the kept value is "kxm", in
     * which case a later value replaces it. Keys with more than one value are logged.
     *
     * @param class1 not used by this method
     * @param file name of a sparql query, such as 'wikidata-childToParent'
     * @param fixValue applied to each value before it is stored; may be null
     * @param keyMapper formats a key for the log; may be null
     * @param valueMapper formats a value for the log; may be null
     * @return an immutable map with one value per key
     * @throws IOException if the query fails
     * @throws IllegalArgumentException if the query does not return exactly two variables
     */
    private static Map<String, String> loadQueryPairsUnique(
            Class<?> class1,
            String file,
            Function<String, String> fixValue,
            Function<String, String> keyMapper,
            Function<String, String> valueMapper)
            throws IOException {

        System.out.println("QUERY: " + file);
        ResultSet rs = queryClient.execSelectFromSparql(file, QueryClient.WIKIDATA_SPARQL_SERVER);

        // the query must return exactly two variables.
        List<String> resultVars = rs.getResultVars();
        assertTwoVars(resultVars);
        final String keyName = resultVars.get(0);
        final String valueName = resultVars.get(1);

        Map<String, String> _keyToValue = new TreeMap<>();
        Multimap<String, String> _keyToValues = TreeMultimap.create();
        while (rs.hasNext()) {
            final QuerySolution qs = rs.next();
            String key = QueryClient.getStringOrNull(qs, keyName);
            String value = QueryClient.getStringOrNull(qs, valueName);
            if (fixValue != null) {
                value = fixValue.apply(value);
            }
            _keyToValues.put(key, value);
            String oldValue = _keyToValue.get(key);
            // NOTE(review): "kxm" is special-cased so that any other value wins over it;
            // the reason is not visible here.
            if (oldValue == null || oldValue.equals("kxm")) {
                _keyToValue.put(key, value);
            }
        }
        showDups(file, _keyToValues, keyMapper, valueMapper);
        System.out.println("LOADED: " + file + " with rows " + rs.getRowNumber());
        return ImmutableMap.copyOf(_keyToValue);
    }

    /**
     * Checks that a query returned exactly two variables.
     *
     * @throws IllegalArgumentException otherwise
     */
    private static void assertTwoVars(List<String> resultVars) {
        if (resultVars.size() != 2) {
            throw new IllegalArgumentException(
                    "expected 2 result vars but got " + resultVars.size() + ": " + resultVars);
        }
    }

    /**
     * Logs every key that has more than one value.
     *
     * @param file the query name, used as a prefix of the log line
     * @param _keyToValues the rows to check
     * @param keyMapper formats the key for the log; if null the key is shown as is
     * @param valueMapper formats each value for the log; if null the values are shown as is
     */
    private static void showDups(
            String file,
            Multimap<String, String> _keyToValues,
            Function<String, String> keyMapper,
            Function<String, String> valueMapper) {
        for (Entry<String, Collection<String>> entry : _keyToValues.asMap().entrySet()) {
            Collection<String> valueSet = entry.getValue();
            if (valueSet.size() <= 1) {
                continue;
            }
            String key = entry.getKey();
            if (keyMapper != null) {
                key = keyMapper.apply(key);
            }
            if (valueMapper != null) {
                Set<String> mapped = new LinkedHashSet<>();
                for (String value : valueSet) {
                    mapped.add(valueMapper.apply(value));
                }
                valueSet = mapped;
            }
            log(file + "\tMultiple values: " + key + "\t" + valueSet);
        }
    }

    /**
     * Returns all ancestor chains of a Wikidata entity, as computed by {@code
     * Containment.getAllDirected} over the raw child-to-parent table.
     *
     * @param lang the entity key
     * @return the chains of entity keys
     */
    static Set<List<String>> getAllAncestors(String lang) {
        return Containment.getAllDirected(QUERY_HELPER.childToParent, lang);
    }
}